Fix a bug in Waymo conversion: GPU should be disabled (#64)
* Update waymo.rst * Update waymo.rst * allow generate all data * update readme * update * better logging info * more info * up * fix * add note on GPU * better log * format
This commit is contained in:
@@ -55,7 +55,7 @@ pip install -e.
|
||||
|
||||
# Install ScenarioNet
|
||||
cd ~/ # Go to the folder you want to host these two repos.
|
||||
git clone git@github.com:metadriverse/scenarionet.git
|
||||
git clone https://github.com/metadriverse/scenarionet.git
|
||||
cd scenarionet
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
@@ -38,7 +38,7 @@ For Waymo data, we already have the parser in ScenarioNet so just install the Te
|
||||
conda install protobuf==3.20
|
||||
|
||||
.. note::
|
||||
You may fail to install ``protobuf`` if using ``pip install protobuf==3.20``.
|
||||
You may fail to install ``protobuf`` if using ``pip install protobuf==3.20``. If so, install via ``conda install protobuf=3.20``.
|
||||
|
||||
For other datasets like nuPlan and nuScenes, you need to setup `nuplan-devkit <https://github.com/motional/nuplan-devkit>`_ and `nuscenes-devkit <https://github.com/nutonomy/nuscenes-devkit>`_ respectively.
|
||||
Guidance on how to setup these datasets and connect them with ScenarioNet can be found at :ref:`datasets`.
|
||||
|
||||
@@ -35,7 +35,7 @@ First of all, we have to install tensorflow and Protobuf::
|
||||
conda install protobuf==3.20
|
||||
|
||||
.. note::
|
||||
You may fail to install ``protobuf`` if using ``pip install protobuf==3.20``.
|
||||
You may fail to install ``protobuf`` if using ``pip install protobuf==3.20``. If so, install via ``conda install protobuf=3.20``.
|
||||
|
||||
|
||||
2. Download TFRecord
|
||||
@@ -79,12 +79,18 @@ The downloaded data should be stored in a directory like this::
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Run the following command to extract scenarios in any directory containing ``tfrecord``.
|
||||
|
||||
|
||||
Here we take converting raw data in ``training_20s`` as an example::
|
||||
|
||||
python -m scenarionet.convert_waymo -d /path/to/your/database --raw_data_path ./waymo/training_20s --num_files=1000
|
||||
python -m scenarionet.convert_waymo -d /path/to/your/database --raw_data_path ./waymo/training_20s --num_workers 64
|
||||
|
||||
Now all converted scenarios will be placed at ``/path/to/your/database`` and are ready to be used in your work.
|
||||
|
||||
.. note::
|
||||
When running the conversion, please double check whether GPU is being used. This converter should NOT use GPU.
|
||||
    We have disabled GPU usage by ``os.environ["CUDA_VISIBLE_DEVICES"] = ""``.
|
||||
|
||||
Known Issues: Waymo
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
N/A
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
import logging
|
||||
import os.path
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
from metadrive.scenario import utils as sd_utils
|
||||
|
||||
logger = logging.getLogger(__file__)
|
||||
|
||||
|
||||
def recursive_equal(data1, data2, need_assert=False):
|
||||
from metadrive.utils.config import Config
|
||||
@@ -71,7 +74,7 @@ def save_summary_and_mapping(summary_file_path, mapping_file_path, summary, mapp
|
||||
pickle.dump(dict_recursive_remove_array_and_set(summary), file)
|
||||
with open(mapping_file_path, "wb") as file:
|
||||
pickle.dump(mapping, file)
|
||||
print(
|
||||
logging.info(
|
||||
"\n ================ Dataset Summary and Mapping are saved at: {} "
|
||||
"================ \n".format(summary_file_path)
|
||||
)
|
||||
|
||||
@@ -4,6 +4,7 @@ if __name__ == '__main__':
|
||||
import pkg_resources # for suppress warning
|
||||
import argparse
|
||||
import os.path
|
||||
import os
|
||||
|
||||
import metadrive
|
||||
|
||||
@@ -11,6 +12,8 @@ if __name__ == '__main__':
|
||||
from scenarionet.converter.pg.utils import get_pg_scenarios, convert_pg_scenario
|
||||
from scenarionet.converter.utils import write_to_directory
|
||||
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
||||
|
||||
# For the PG environment config, see: scenarionet/converter/pg/utils.py:6
|
||||
parser = argparse.ArgumentParser(description=desc)
|
||||
parser.add_argument(
|
||||
|
||||
@@ -11,6 +11,8 @@ if __name__ == '__main__':
|
||||
from scenarionet.converter.utils import write_to_directory
|
||||
from scenarionet.converter.waymo.utils import convert_waymo_scenario, get_waymo_scenarios, preprocess_waymo_scenarios
|
||||
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
parser = argparse.ArgumentParser(description=desc)
|
||||
@@ -36,14 +38,14 @@ if __name__ == '__main__':
|
||||
default=0,
|
||||
type=int,
|
||||
help="Control how many files to use. We will list all files in the raw data folder "
|
||||
"and select files[start_file_index: start_file_index+num_files]"
|
||||
"and select files[start_file_index: start_file_index+num_files]. Default: 0."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_files",
|
||||
default=1000,
|
||||
default=None,
|
||||
type=int,
|
||||
help="Control how many files to use. We will list all files in the raw data folder "
|
||||
"and select files[start_file_index: start_file_index+num_files]"
|
||||
"and select files[start_file_index: start_file_index+num_files]. Default: None, will read all files."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -65,6 +67,12 @@ if __name__ == '__main__':
|
||||
waymo_data_directory = os.path.join(SCENARIONET_DATASET_PATH, args.raw_data_path)
|
||||
files = get_waymo_scenarios(waymo_data_directory, args.start_file_index, args.num_files)
|
||||
|
||||
logger.info(
|
||||
f"We will read {len(files)} raw files. You set the number of workers to {args.num_workers}. "
|
||||
f"Please make sure there will not be too much files to be read in each worker "
|
||||
f"(now it's {len(files) / args.num_workers})!"
|
||||
)
|
||||
|
||||
write_to_directory(
|
||||
convert_func=convert_waymo_scenario,
|
||||
scenarios=files,
|
||||
|
||||
@@ -218,7 +218,8 @@ def write_to_directory_single_worker(
|
||||
kwargs["env"] = make_env(start_index=scenarios[0], num_scenarios=len(scenarios))
|
||||
|
||||
count = 0
|
||||
for scenario in tqdm.tqdm(scenarios, desc="Worker Index: {}".format(worker_index)):
|
||||
# for scenario in tqdm.tqdm(scenarios, position=2, leave=True, desc=f"Worker {worker_index} Number of scenarios"):
|
||||
for scenario in scenarios:
|
||||
# convert scenario
|
||||
sd_scenario = convert_func(scenario, dataset_version, **kwargs)
|
||||
scenario_id = sd_scenario[SD.ID]
|
||||
@@ -248,6 +249,9 @@ def write_to_directory_single_worker(
|
||||
print("Current Memory: {}".format(process_memory()))
|
||||
count += 1
|
||||
|
||||
if count % 500 == 0:
|
||||
logger.info(f"Worker {worker_index} has processed {count} scenarios.")
|
||||
|
||||
# store summary file
|
||||
save_summary_and_mapping(summary_file_path, mapping_file_path, summary, mapping)
|
||||
|
||||
@@ -257,6 +261,8 @@ def write_to_directory_single_worker(
|
||||
shutil.rmtree(delay_remove)
|
||||
os.rename(output_path, save_path)
|
||||
|
||||
logger.info(f"Worker {worker_index} finished! Files are saved at: {save_path}")
|
||||
|
||||
|
||||
def process_memory():
|
||||
process = psutil.Process(os.getpid())
|
||||
|
||||
@@ -430,6 +430,11 @@ def get_waymo_scenarios(waymo_data_directory, start_index, num):
|
||||
# there is 1000 raw data in google cloud, each of them produce about 500 pkl file
|
||||
logger.info("\nReading raw data")
|
||||
file_list = os.listdir(waymo_data_directory)
|
||||
if num is None:
|
||||
logger.warning(
|
||||
"You haven't specified the number of raw files! It is set to {} now.".format(len(file_list) - start_index)
|
||||
)
|
||||
num = len(file_list) - start_index
|
||||
assert len(file_list) >= start_index + num and start_index >= 0, \
|
||||
"No sufficient files ({}) in raw_data_directory. need: {}, start: {}".format(len(file_list), num, start_index)
|
||||
file_list = file_list[start_index:start_index + num]
|
||||
@@ -448,9 +453,13 @@ def preprocess_waymo_scenarios(files, worker_index):
|
||||
"""
|
||||
from scenarionet.converter.waymo.waymo_protos import scenario_pb2
|
||||
|
||||
for file in tqdm.tqdm(files, desc="Process Waymo scenarios for worker {}".format(worker_index)):
|
||||
for file in tqdm.tqdm(files, leave=False, position=0, desc="Worker {} Number of raw file".format(worker_index)):
|
||||
|
||||
logger.info(f"Worker {worker_index} is reading raw file: {file}")
|
||||
|
||||
file_path = os.path.join(file)
|
||||
if ("tfrecord" not in file_path) or (not os.path.isfile(file_path)):
|
||||
logger.info(f"Worker {worker_index} skip this file: {file}")
|
||||
continue
|
||||
for data in tf.data.TFRecordDataset(file_path, compression_type="").as_numpy_iterator():
|
||||
scenario = scenario_pb2.Scenario()
|
||||
@@ -458,5 +467,7 @@ def preprocess_waymo_scenarios(files, worker_index):
|
||||
# a trick for loging file name
|
||||
scenario.scenario_id = scenario.scenario_id + SPLIT_KEY + file
|
||||
yield scenario
|
||||
|
||||
logger.info(f"Worker {worker_index} finished read {len(files)} files.")
|
||||
# logger.info("Worker {}: Process {} waymo scenarios".format(worker_index, len(scenarios)))
|
||||
# return scenarios
|
||||
|
||||
Reference in New Issue
Block a user