Spaces:

BestWJH
/

PEAR

Build error

App Files Files Community

PEAR / pytorch3d /implicitron /dataset /sql_dataset_provider.py

BestWJH

Upload 455 files

94dc344 verified 2 months ago

raw

history blame contribute delete

16.8 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.
	#
	# This source code is licensed under the BSD-style license found in the
	# LICENSE file in the root directory of this source tree.

	# pyre-unsafe


	import logging
	import os
	from typing import List, Optional, Tuple, Type

	import numpy as np

	from omegaconf import DictConfig, OmegaConf

	from pytorch3d.implicitron.dataset.dataset_map_provider import (
	DatasetMap,
	DatasetMapProviderBase,
	PathManagerFactory,
	)
	from pytorch3d.implicitron.tools.config import (
	expand_args_fields,
	registry,
	run_auto_creation,
	)

	from .sql_dataset import SqlIndexDataset


	# Default dataset root, read once at import time from the environment;
	# it is the empty string when CO3D_SQL_DATASET_ROOT is not set.
	_CO3D_SQL_DATASET_ROOT: str = os.getenv("CO3D_SQL_DATASET_ROOT", "")

	# _NEED_CONTROL is a list of those elements of SqlIndexDataset which
	# are not directly specified for it in the config but come from the
	# DatasetMapProvider. These keys are deleted from the dataset's config
	# by `SqlIndexDatasetMapProvider.dataset_tweak_args`.
	_NEED_CONTROL: Tuple[str, ...] = (
	"path_manager",
	"subsets",
	"sqlite_metadata_file",
	"subset_lists_file",
	)

	# Module-level logger named after this module.
	logger = logging.getLogger(__name__)


	@registry.register
	class SqlIndexDatasetMapProvider(DatasetMapProviderBase):
	"""
	Generates the training, validation, and testing dataset objects for
	a dataset laid out on disk like SQL-CO3D, with annotations in an SQLite data base.

	The dataset is organized in the filesystem as follows::

	self.dataset_root
	├── <possible/partition/0>
	│ ├── <sequence_name_0>
	│ │ ├── depth_masks
	│ │ ├── depths
	│ │ ├── images
	│ │ ├── masks
	│ │ └── pointcloud.ply
	│ ├── <sequence_name_1>
	│ │ ├── depth_masks
	│ │ ├── depths
	│ │ ├── images
	│ │ ├── masks
	│ │ └── pointcloud.ply
	│ ├── ...
	│ ├── <sequence_name_N>
	│ ├── set_lists
	│ ├── <subset_base_name_0>.json
	│ ├── <subset_base_name_1>.json
	│ ├── ...
	│ ├── <subset_base_name_2>.json
	│ ├── eval_batches
	│ │ ├── <eval_batches_base_name_0>.json
	│ │ ├── <eval_batches_base_name_1>.json
	│ │ ├── ...
	│ │ ├── <eval_batches_base_name_M>.json
	│ ├── frame_annotations.jgz
	│ ├── sequence_annotations.jgz
	├── <possible/partition/1>
	├── ...
	├── <possible/partition/K>
	├── set_lists
	├── <subset_base_name_0>.sqlite
	├── <subset_base_name_1>.sqlite
	├── ...
	├── <subset_base_name_2>.sqlite
	├── eval_batches
	│ ├── <eval_batches_base_name_0>.json
	│ ├── <eval_batches_base_name_1>.json
	│ ├── ...
	│ ├── <eval_batches_base_name_M>.json

	The dataset contains sequences named `<sequence_name_i>` that may be partitioned by
	directories such as `<possible/partition/0>` e.g. representing categories but they
	can also be stored in a flat structure. Each sequence folder contains the list of
	sequence images, depth maps, foreground masks, and valid-depth masks
	`images`, `depths`, `masks`, and `depth_masks` respectively. Furthermore,
	`set_lists/` directories (with partitions or global) store json or sqlite files
	`<subset_base_name_l>.<ext>`, each describing a certain sequence subset.
	These subset path conventions are not hard-coded: an arbitrary relative path can be
	specified by setting `self.subset_lists_path` to the path relative to the
	dataset root.

	Each `<subset_base_name_l>.json` file contains the following dictionary::

	{
	"train": [
	(sequence_name: str, frame_number: int, image_path: str),
	...
	],
	"val": [
	(sequence_name: str, frame_number: int, image_path: str),
	...
	],
	"test": [
	(sequence_name: str, frame_number: int, image_path: str),
	...
	],
	}

	defining the list of frames (identified with their `sequence_name` and
	`frame_number`) in the "train", "val", and "test" subsets of the dataset. In case of
	SQLite format, `<subset_base_name_l>.sqlite` contains a table with the header::

	\| sequence_name \| frame_number \| image_path \| subset \|

	Note that `frame_number` can be obtained only from the metadata and
	does not necessarily correspond to the numeric suffix of the corresponding image
	file name (e.g. a file `<partition_0>/<sequence_name_0>/images/frame00005.jpg` can
	have its frame number set to `20`, not 5).

	Each `<eval_batches_base_name_M>.json` file contains a list of evaluation examples
	in the following form::

	[
	[ # batch 1
	(sequence_name: str, frame_number: int, image_path: str),
	...
	],
	[ # batch 2
	(sequence_name: str, frame_number: int, image_path: str),
	...
	],
	]

	Note that the evaluation examples always come from the `"test"` subset of the dataset
	(test frames can repeat across batches). The batches can contain a single element,
	which is typical in case of regular radiance field fitting.

	Args:
	subset_lists_path: The relative path to the dataset subset definition.
	For CO3D, these include e.g. "skateboard/set_lists/set_lists_manyview_dev_0.json".
	By default (None), dataset is not partitioned to subsets (in that case, setting
	`ignore_subsets` will speed up construction)
	dataset_root: The root folder of the dataset.
	metadata_basename: name of the SQL metadata file in dataset_root;
	not expected to be changed by users
	test_on_train: Construct validation and test datasets from
	the training subset; note that in practice, in this
	case all subset dataset objects will be same
	only_test_set: Load only the test set. Incompatible with `test_on_train`.
	ignore_subsets: Don’t filter by subsets in the dataset; note that in this
	case all subset datasets will be same
	eval_batch_num_training_frames: Add a certain number of training frames to each
	eval batch. Useful for evaluating models that require
	source views as input (e.g. NeRF-WCE / PixelNeRF).
	dataset_args: Specifies additional arguments to the
	SqlIndexDataset constructor call.
	path_manager_factory: (Optional) An object that generates an instance of
	PathManager that can translate provided file paths.
	path_manager_factory_class_type: The class type of `path_manager_factory`.
	"""

	# Category name, or several names separated by commas. When set, it is
	# split on "," and forced as the `pick_categories` argument of the datasets.
	# A single category is also used as the folder prefix when the list file is
	# resolved from `subset_list_name`; with several categories the global
	# (un-prefixed) lists are looked up instead.
	category: Optional[str] = None
	# Name used to derive the list file path
	# `<category>/<flavor>/<flavor>_<subset_list_name>`, where flavor is
	# "set_lists" or "eval_batches". Cannot be set together with
	# `subset_lists_path`.
	subset_list_name: Optional[str] = None # TODO: docs
	# OR
	subset_lists_path: Optional[str] = None
	# Path to the eval batches file; it is tried as given first and then
	# relative to `dataset_root`. When None, the test dataset is constructed
	# without an eval batches file.
	eval_batches_path: Optional[str] = None

	# Root folder of the dataset and the name of the SQLite metadata file
	# expected directly inside it.
	dataset_root: str = _CO3D_SQL_DATASET_ROOT
	metadata_basename: str = "metadata.sqlite"

	test_on_train: bool = False
	only_test_set: bool = False
	ignore_subsets: bool = False
	# Subset names passed as `subsets` to the train / val / test dataset
	# constructors (ignored when `ignore_subsets` is set).
	train_subsets: Tuple[str, ...] = ("train",)
	val_subsets: Tuple[str, ...] = ("val",)
	test_subsets: Tuple[str, ...] = ("test",)

	# Number of training frames appended to each eval batch; 0 disables it.
	eval_batch_num_training_frames: int = 0

	# this is a mould that is never constructed, used to build self._dataset_map values
	dataset_class_type: str = "SqlIndexDataset"
	dataset: SqlIndexDataset # pyre-ignore [13]

	path_manager_factory: PathManagerFactory # pyre-ignore [13]
	path_manager_factory_class_type: str = "PathManagerFactory"

	def __post_init__(self):
	    """
	    Validate the configuration and build the train / val / test datasets.

	    The resulting datasets are stored in `self._dataset_map`. When
	    `test_on_train` is set (or implied by `ignore_subsets`), one dataset
	    object is shared by all three splits.

	    Raises:
	        ValueError: if `only_test_set` and `test_on_train` are both set, if
	            the SQLite metadata file is not found, if both
	            `subset_lists_path` and `subset_list_name` are set, or if the
	            resolved subset list is not a file.
	        FileNotFoundError: propagated from `_get_lists_file` when a requested
	            subset list / eval batches file cannot be located.
	    """
	    super().__init__()
	    run_auto_creation(self)

	    if self.only_test_set and self.test_on_train:
	        raise ValueError("Cannot have only_test_set and test_on_train")

	    if self.ignore_subsets and not self.only_test_set:
	        self.test_on_train = True  # no point in loading same data 3 times

	    path_manager = self.path_manager_factory.get()

	    sqlite_metadata_file = os.path.join(self.dataset_root, self.metadata_basename)
	    sqlite_metadata_file = _local_path(path_manager, sqlite_metadata_file)

	    if not os.path.isfile(sqlite_metadata_file):
	        # The sqlite_metadata_file does not exist.
	        # Most probably the user has not specified the root folder.
	        raise ValueError(
	            f"Looking for frame annotations in {sqlite_metadata_file}."
	            + " Please specify a correct dataset_root folder."
	            + " Note: By default the root folder is taken from the"
	            + " CO3D_SQL_DATASET_ROOT environment variable."
	        )

	    if self.subset_lists_path and self.subset_list_name:
	        raise ValueError(
	            "subset_lists_path and subset_list_name cannot be both set"
	        )

	    subset_lists_file = self._get_lists_file("set_lists")

	    # setup the common dataset arguments
	    common_dataset_kwargs = {
	        **getattr(self, f"dataset_{self.dataset_class_type}_args"),
	        "sqlite_metadata_file": sqlite_metadata_file,
	        "dataset_root": self.dataset_root,
	        "subset_lists_file": subset_lists_file,
	        "path_manager": path_manager,
	    }

	    if self.category:
	        logger.info(
	            "Forcing category filter in the datasets to %s", self.category
	        )
	        common_dataset_kwargs["pick_categories"] = self.category.split(",")

	    # get the used dataset type
	    dataset_type: Type[SqlIndexDataset] = registry.get(
	        SqlIndexDataset, self.dataset_class_type
	    )
	    expand_args_fields(dataset_type)

	    if subset_lists_file is not None and not os.path.isfile(subset_lists_file):
	        # Safety net: `_get_lists_file` only returns paths that already
	        # passed an `os.path.isfile` check, so this is not expected to fire.
	        # `pick_categories` is a plain list when `category` is set and
	        # otherwise comes from the dataset config (and may be absent), so it
	        # is iterated directly instead of being converted through OmegaConf,
	        # which only accepts config objects.
	        pick_categories = common_dataset_kwargs.get("pick_categories") or ()
	        available_subsets = self._get_available_subsets(
	            [str(category) for category in pick_categories]
	        )
	        # Report the path that was actually resolved; `subset_lists_path` is
	        # None when the file was derived from `subset_list_name`.
	        msg = f"Cannot find subset list file {subset_lists_file}."
	        if available_subsets:
	            msg += f" Some of the available subsets: {str(available_subsets)}."
	        raise ValueError(msg)

	    train_dataset = None
	    val_dataset = None
	    if not self.only_test_set:
	        # load the training set
	        logger.debug("Constructing train dataset.")
	        train_dataset = dataset_type(
	            **common_dataset_kwargs, subsets=self._get_subsets(self.train_subsets)
	        )
	        logger.info("Train dataset: %s", train_dataset)

	    if self.test_on_train:
	        assert train_dataset is not None
	        val_dataset = test_dataset = train_dataset
	    else:
	        # load the val and test sets
	        if not self.only_test_set:
	            # NOTE: this is always loaded in JsonProviderV2
	            logger.debug("Extracting val dataset.")
	            val_dataset = dataset_type(
	                **common_dataset_kwargs, subsets=self._get_subsets(self.val_subsets)
	            )
	            logger.info("Val dataset: %s", val_dataset)

	        logger.debug("Extracting test dataset.")
	        if self.eval_batches_path is None:
	            eval_batches_file = None
	        else:
	            eval_batches_file = self._get_lists_file("eval_batches")

	        # The provider decides `eval_batches_file`; drop any value coming
	        # from the dataset config so the keyword is not passed twice.
	        common_dataset_kwargs.pop("eval_batches_file", None)

	        test_dataset = dataset_type(
	            **common_dataset_kwargs,
	            subsets=self._get_subsets(self.test_subsets, True),
	            eval_batches_file=eval_batches_file,
	        )
	        logger.info("Test dataset: %s", test_dataset)

	        if (
	            eval_batches_file is not None
	            and self.eval_batch_num_training_frames > 0
	        ):
	            self._extend_eval_batches(test_dataset)

	    self._dataset_map = DatasetMap(
	        train=train_dataset, val=val_dataset, test=test_dataset
	    )

	def _get_subsets(self, subsets, is_eval: bool = False):
	if self.ignore_subsets:
	return None

	if is_eval and self.eval_batch_num_training_frames > 0:
	# we will need to have training frames for extended batches
	return list(subsets) + list(self.train_subsets)

	return subsets

	def _extend_eval_batches(self, test_dataset: SqlIndexDataset) -> None:
	rng = np.random.default_rng(seed=0)
	eval_batches = test_dataset.get_eval_batches()
	if eval_batches is None:
	raise ValueError("Eval batches were not loaded!")

	for batch in eval_batches:
	sequence = batch[0][0]
	seq_frames = list(
	test_dataset.sequence_frames_in_order(sequence, self.train_subsets)
	)
	idx_to_add = rng.permutation(len(seq_frames))[
	: self.eval_batch_num_training_frames
	]
	batch.extend((sequence, seq_frames[a][1]) for a in idx_to_add)

	@classmethod
	def dataset_tweak_args(cls, type, args: DictConfig) -> None:
	    """
	    Called by get_default_args.

	    Strips from the dataset's default config the fields that are not
	    exposed on each dataset class but are controlled by this provider
	    class instead (the names listed in `_NEED_CONTROL`). `args` is
	    modified in place.
	    """
	    for controlled_field in _NEED_CONTROL:
	        del args[controlled_field]

	def create_dataset(self):
	    """
	    Intentionally do nothing.

	    No `dataset` member of this class is created.
	    The dataset(s) live in `self.get_dataset_map`.
	    """
	    return None

	def get_dataset_map(self) -> DatasetMap:
	return self._dataset_map # pyre-ignore [16]

	def _get_available_subsets(self, categories: List[str]):
	"""
	Get the available subset names for a given category folder (if given) inside
	a root dataset folder `dataset_root`.
	"""
	path_manager = self.path_manager_factory.get()

	subsets: List[str] = []
	for prefix in [""] + categories:
	set_list_dir = os.path.join(self.dataset_root, prefix, "set_lists")
	if not (
	(path_manager is not None) and path_manager.isdir(set_list_dir)
	) and not os.path.isdir(set_list_dir):
	continue

	set_list_files = (os.listdir if path_manager is None else path_manager.ls)(
	set_list_dir
	)
	subsets.extend(os.path.join(prefix, "set_lists", f) for f in set_list_files)

	return subsets

	def _get_lists_file(self, flavor: str) -> Optional[str]:
	if flavor == "eval_batches":
	subset_lists_path = self.eval_batches_path
	else:
	subset_lists_path = self.subset_lists_path

	if not subset_lists_path and not self.subset_list_name:
	return None

	category_elem = ""
	if self.category and "," not in self.category:
	# if multiple categories are given, looking for global set lists
	category_elem = self.category

	subset_lists_path = subset_lists_path or (
	os.path.join(
	category_elem, f"{flavor}", f"{flavor}_{self.subset_list_name}"
	)
	)

	assert subset_lists_path
	path_manager = self.path_manager_factory.get()
	# try absolute path first
	subset_lists_file = _get_local_path_check_extensions(
	subset_lists_path, path_manager
	)
	if subset_lists_file:
	return subset_lists_file

	full_path = os.path.join(self.dataset_root, subset_lists_path)
	subset_lists_file = _get_local_path_check_extensions(full_path, path_manager)

	if not subset_lists_file:
	raise FileNotFoundError(
	f"Subset lists path given but not found: {full_path}"
	)

	return subset_lists_file


	def _get_local_path_check_extensions(
	    path, path_manager, extensions=("", ".sqlite", ".json")
	) -> Optional[str]:
	    """
	    Return the first existing local file among `path + ext` candidates.

	    The extensions are tried in the given order; each candidate is first
	    translated with `_local_path` and then checked with `os.path.isfile`.
	    Returns None when no candidate is an existing file.
	    """
	    # generators keep the evaluation lazy: later candidates are resolved
	    # only if the earlier ones are not files
	    candidates = (_local_path(path_manager, path + suffix) for suffix in extensions)
	    return next(
	        (candidate for candidate in candidates if os.path.isfile(candidate)),
	        None,
	    )


	def _local_path(path_manager, path: str) -> str:
	if path_manager is None:
	return path
	return path_manager.get_local_path(path)