Upload folder using huggingface_hub

8193465 verified 4 months ago

12.6 kB

	import itertools
	from dataclasses import dataclass, field
	from typing import TYPE_CHECKING, Optional

	import numpy as np
	import pyarrow as pa

	import datasets
	from datasets.features.features import (
	Array2D,
	Array3D,
	Array4D,
	Array5D,
	Features,
	LargeList,
	List,
	Value,
	_ArrayXD,
	_arrow_to_datasets_dtype,
	)
	from datasets.table import cast_table_to_features


	if TYPE_CHECKING:
	import h5py

	# Module-level logger obtained from the `datasets` logging utilities.
	logger = datasets.utils.logging.get_logger(__name__)

	# HDF5 file suffixes. Not referenced elsewhere in the visible code;
	# presumably consumed by extension-based builder lookup (TODO confirm).
	EXTENSIONS = [".h5", ".hdf5"]


	@dataclass
	class HDF5Config(datasets.BuilderConfig):
	    """Configuration accepted by the HDF5 builder.

	    Attributes:
	        batch_size: Number of rows per generated Arrow table. When unset,
	            the builder falls back to its writer batch size and then to
	            the number of rows in the file.
	        features: Explicit schema for the dataset. When unset, the builder
	            infers it from the first HDF5 file it opens.
	    """

	    # Rows per yielded table; None defers to the builder's fallbacks.
	    batch_size: Optional[int] = None
	    # Explicit schema; None means "infer from the data".
	    features: Optional[datasets.Features] = None


	class HDF5(datasets.ArrowBasedBuilder):
	    """ArrowBasedBuilder that converts HDF5 files to Arrow tables using the HF extension types."""

	    BUILDER_CONFIG_CLASS = HDF5Config

	    def _info(self):
	        """Return the dataset info carrying the configured features (possibly None)."""
	        return datasets.DatasetInfo(features=self.config.features)

	    def _split_generators(self, dl_manager):
	        """Resolve the data files of every split and infer features when missing.

	        Args:
	            dl_manager: Download manager used to fetch/extract the data files.

	        Returns:
	            A list with one ``datasets.SplitGenerator`` per configured split.

	        Raises:
	            ValueError: If no data files were specified in the config.
	        """
	        import h5py

	        if not self.config.data_files:
	            raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
	        dl_manager.download_config.extract_on_the_fly = True
	        data_files = dl_manager.download_and_extract(self.config.data_files)
	        splits = []
	        for split_name, files in data_files.items():
	            if isinstance(files, str):
	                files = [files]

	            files = [dl_manager.iter_files(file) for file in files]
	            # Infer features from first file
	            if self.info.features is None:
	                for first_file in itertools.chain.from_iterable(files):
	                    with h5py.File(first_file, "r") as h5:
	                        self.info.features = _recursive_infer_features(h5)
	                    break
	            splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files}))
	        return splits

	    def _generate_tables(self, files):
	        """Yield ``(key, pyarrow.Table)`` pairs for every batch of every file.

	        Keys have the form ``"{file_idx}_{start}"``. Files without any
	        dataset, or whose datasets hold zero rows, are skipped with a warning.

	        Args:
	            files: Iterable of per-file iterables of HDF5 file paths.

	        Raises:
	            ValueError: Re-raised (after logging) when a file cannot be
	                converted, e.g. on mismatching dataset lengths.
	        """
	        import h5py

	        batch_size_cfg = self.config.batch_size
	        for file_idx, file in enumerate(itertools.chain.from_iterable(files)):
	            try:
	                with h5py.File(file, "r") as h5:
	                    # Infer features and lengths from first file
	                    if self.info.features is None:
	                        self.info.features = _recursive_infer_features(h5)
	                    num_rows = _check_dataset_lengths(h5, self.info.features)
	                    # None means "no dataset found"; 0 means the datasets are
	                    # empty. Both carry no rows. Without this guard a 0-row
	                    # file with no batch size configured would make the step
	                    # below 0, and range() rejects a zero step.
	                    if not num_rows:
	                        logger.warning(f"File {file} contains no data, skipping...")
	                        continue
	                    effective_batch = batch_size_cfg or self._writer_batch_size or num_rows
	                    for start in range(0, num_rows, effective_batch):
	                        end = min(start + effective_batch, num_rows)
	                        pa_table = _recursive_load_arrays(h5, self.info.features, start, end)
	                        if pa_table is None:
	                            logger.warning(f"File {file} contains no data, skipping...")
	                            continue
	                        yield f"{file_idx}_{start}", cast_table_to_features(pa_table, self.info.features)
	            except ValueError as e:
	                logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
	                raise


	# ┌───────────┐
	# │  Complex  │
	# └───────────┘


	def _is_complex_dtype(dtype: np.dtype) -> bool:
	    """Tell whether ``dtype`` is complex, looking through sub-array dtypes.

	    Args:
	        dtype: NumPy dtype to inspect.

	    Returns:
	        True if the dtype itself, or the base dtype of its (possibly
	        nested) sub-array wrapper, has kind ``"c"``.
	    """
	    base = dtype
	    # Peel sub-array wrappers until a complex dtype or a plain dtype is reached.
	    while base.kind != "c" and base.subdtype is not None:
	        base = base.subdtype[0]
	    return base.kind == "c"


	def _create_complex_features(dset) -> Features:
	    """Describe a complex dataset as a struct of ``real`` and ``imag`` parts.

	    Args:
	        dset: Dataset-like object exposing ``dtype`` and ``shape``.

	    Returns:
	        Features with keys ``"real"`` and ``"imag"``, each sized after the
	        per-row shape and typed float32 (complex64) or float64 (otherwise).
	    """
	    subdtype = dset.dtype.subdtype
	    if subdtype is None:
	        # Plain complex dtype: the per-row shape is everything after axis 0.
	        dtype, data_shape = dset.dtype, dset.shape[1:]
	    else:
	        # Sub-array dtype: NumPy stores (base dtype, shape) in `subdtype`.
	        dtype, data_shape = subdtype

	    if dtype == np.complex64:
	        # two float32s
	        value_type = Value("float32")
	    else:
	        if dtype != np.complex128:
	            logger.warning(f"Found complex dtype {dtype} that is not supported. Converting to float64...")
	        # two float64s (also the fallback for unsupported complex widths)
	        value_type = Value("float64")

	    return Features(
	        {part: _create_sized_feature_impl(data_shape, value_type) for part in ("real", "imag")}
	    )


	def _convert_complex_to_nested(arr: np.ndarray) -> pa.StructArray:
	    """Split a complex array into an Arrow struct with ``real``/``imag`` children.

	    Args:
	        arr: Complex NumPy array holding one batch of rows.

	    Returns:
	        A ``pa.StructArray`` whose two fields hold the real and imaginary
	        parts converted with ``numpy_to_pyarrow_listarray``.
	    """
	    to_list_array = datasets.features.features.numpy_to_pyarrow_listarray
	    real_part = to_list_array(arr.real)
	    imag_part = to_list_array(arr.imag)
	    return pa.StructArray.from_arrays([real_part, imag_part], names=["real", "imag"])


	# ┌────────────┐
	# │  Compound  │
	# └────────────┘


	def _is_compound_dtype(dtype: np.dtype) -> bool:
	    """Tell whether ``dtype`` has NumPy kind ``"V"`` (void).

	    The rest of this module treats such dtypes as compound (structured)
	    data and reads their fields through ``dtype.names``.
	    """
	    kind = dtype.kind
	    return kind == "V"


	@dataclass
	class _CompoundGroup:
	    """Group-like adapter that exposes a compound dataset's fields via ``items()``.

	    Attributes:
	        dset: Dataset whose dtype lists the compound field names.
	        data: Rows already read from ``dset``; None when only the schema
	            is needed.
	    """

	    dset: "h5py.Dataset"
	    data: np.ndarray = None

	    def items(self):
	        """Yield ``(field_name, _CompoundField)`` for every field of the dtype."""
	        compound_dtype = self.dset.dtype
	        for field_name in compound_dtype.names:
	            yield field_name, _CompoundField(self.data, field_name, compound_dtype[field_name])


	@dataclass
	class _CompoundField:
	    """Dataset-like view over a single field of a compound array.

	    Attributes:
	        data: Structured array the field belongs to, or None for
	            schema-only use.
	        name: Name of the field inside ``data``.
	        dtype: dtype of the field.
	        shape: Computed as ``(number of rows,) + dtype.shape``; the row
	            count is 0 when ``data`` is None.
	    """

	    data: Optional[np.ndarray]
	    name: str
	    dtype: np.dtype
	    shape: tuple[int, ...] = field(init=False)

	    def __post_init__(self):
	        num_rows = 0 if self.data is None else len(self.data)
	        self.shape = (num_rows, *self.dtype.shape)

	    def __getitem__(self, key):
	        """Index the rows with ``key`` first, then select this field."""
	        rows = self.data[key]
	        return rows[self.name]


	def _create_compound_features(dset) -> Features:
	    """Infer features for a compound dataset by treating its fields as group members."""
	    return _recursive_infer_features(_CompoundGroup(dset))


	def _convert_compound_to_nested(arr, dset) -> pa.StructArray:
	    """Convert a batch of compound rows into a nested Arrow array.

	    Args:
	        arr: Rows already read from ``dset``.
	        dset: Source dataset, used for the compound dtype's field layout.

	    Returns:
	        The result of loading every field of ``arr`` over rows
	        ``0..len(arr)`` through ``_recursive_load_arrays``.
	    """
	    field_features = _create_compound_features(dset)
	    wrapped = _CompoundGroup(dset, data=arr)
	    return _recursive_load_arrays(wrapped, field_features, 0, len(arr))


	# ┌───────────────────┐
	# │  Variable-Length  │
	# └───────────────────┘


	def _is_vlen_dtype(dtype: np.dtype) -> bool:
	    """Tell whether ``dtype`` carries a ``"vlen"`` entry in its metadata."""
	    metadata = dtype.metadata
	    # `metadata` is None (or empty) for ordinary dtypes.
	    return bool(metadata) and "vlen" in metadata


	def _create_vlen_features(dset) -> Features:
	    """Build the feature for a variable-length dataset.

	    Args:
	        dset: Dataset whose dtype metadata contains a ``"vlen"`` entry.

	    Returns:
	        ``Value("string")`` when the variable-length base type is ``str``
	        or ``bytes``; otherwise a ``List`` of the converted base type.
	        (Despite the annotation, a plain feature is returned, not a
	        ``Features`` mapping.)
	    """
	    element_type = dset.dtype.metadata["vlen"]
	    is_text = element_type in (str, bytes)
	    return Value("string") if is_text else List(_np_to_pa_to_hf_value(element_type))


	def _convert_vlen_to_array(arr: np.ndarray) -> pa.Array:
	    """Convert a batch of variable-length rows with ``numpy_to_pyarrow_listarray``."""
	    to_list_array = datasets.features.features.numpy_to_pyarrow_listarray
	    return to_list_array(arr)


	# ┌───────────┐
	# │  Generic  │
	# └───────────┘


	def _recursive_infer_features(h5_obj) -> Features:
	    """Walk a group-like object and infer a feature for each member.

	    Args:
	        h5_obj: Object exposing ``items()`` that yields ``(name, child)``.

	    Returns:
	        Features keyed by member name. Members that are neither groups nor
	        datasets, and members whose inferred feature is falsy (e.g. an
	        empty group), are left out.
	    """
	    inferred = {}
	    for name, child in h5_obj.items():
	        if _is_group(child):
	            child_feature = _recursive_infer_features(child)
	        elif _is_dataset(child):
	            child_feature = _infer_feature(child)
	        else:
	            continue
	        if child_feature:
	            inferred[name] = child_feature

	    return Features(inferred)


	def _infer_feature(dset):
	    """Pick the feature builder matching the dataset's dtype.

	    Complex dtypes are checked first, then compound (kind ``"V"``), then
	    variable-length; everything else is treated as a fixed-shape dataset.
	    """
	    dtype = dset.dtype
	    if _is_complex_dtype(dtype):
	        return _create_complex_features(dset)
	    # `_is_compound_dtype` is exactly the kind == "V" test.
	    if _is_compound_dtype(dtype):
	        return _create_compound_features(dset)
	    if _is_vlen_dtype(dtype):
	        return _create_vlen_features(dset)
	    return _create_sized_feature(dset)


	def _load_array(dset, path: str, start: int, end: int) -> pa.Array:
	    """Read rows ``start:end`` of a dataset and convert them to Arrow.

	    Args:
	        dset: Dataset-like object supporting slicing, ``dtype`` and ``shape``.
	        path: Name of the dataset, used only in the error message.
	        start: First row (inclusive).
	        end: Last row (exclusive).

	    Returns:
	        The Arrow array for the requested rows.

	    Raises:
	        ValueError: If the dataset has a plain object dtype.
	    """
	    arr = dset[start:end]
	    dtype = dset.dtype

	    if _is_vlen_dtype(dtype):
	        return _convert_vlen_to_array(arr)
	    if _is_complex_dtype(dtype):
	        return _convert_complex_to_nested(arr)
	    if _is_compound_dtype(dtype):
	        return _convert_compound_to_nested(arr, dset)
	    if dtype.kind == "O":
	        raise ValueError(
	            f"Object dtype dataset '{path}' is not supported. "
	            f"For variable-length data, please use h5py.vlen_dtype() "
	            f"when creating the HDF5 file. "
	            f"See: https://docs.h5py.org/en/stable/special.html#variable-length-strings"
	        )

	    # If any non-batch dimension is zero, emit an unsized pa.list_
	    # to avoid creating FixedSizeListArray with list_size=0.
	    has_empty_axis = any(dim == 0 for dim in dset.shape[1:])
	    if has_empty_axis:
	        element_type = pa.from_numpy_dtype(dtype)
	        return pa.array([[] for _ in arr], type=pa.list_(element_type))
	    return datasets.features.features.numpy_to_pyarrow_listarray(arr)


	def _recursive_load_arrays(h5_obj, features: Features, start: int, end: int):
	    """Load rows ``start:end`` of every member listed in ``features``.

	    Args:
	        h5_obj: File, group or group-like object exposing ``items()``.
	        features: Features for this level; members not present are skipped.
	        start: First row (inclusive).
	        end: Last row (exclusive).

	    Returns:
	        A ``pa.Table`` when ``h5_obj`` is an HDF5 file; otherwise a struct
	        array (wrapped in a chunked array if any child was chunked), or
	        None when nothing was loaded.

	    Raises:
	        ValueError: If a member is neither a group nor a dataset.
	    """
	    loaded = {}
	    for name, child in h5_obj.items():
	        if name not in features:
	            continue
	        if _is_group(child):
	            column = _recursive_load_arrays(child, features[name], start, end)
	        elif _is_dataset(child):
	            column = _load_array(child, name, start, end)
	        else:
	            raise ValueError(f"Unexpected type {type(child)}")

	        if column is not None:
	            loaded[name] = column

	    if _is_file(h5_obj):
	        return pa.Table.from_pydict(loaded)

	    if not loaded:
	        return None

	    # Struct children must be plain arrays, so flatten chunked ones and
	    # remember to re-wrap the result.
	    has_chunked = any(isinstance(col, pa.ChunkedArray) for col in loaded.values())
	    children = [
	        col.combine_chunks() if isinstance(col, pa.ChunkedArray) else col for col in loaded.values()
	    ]
	    struct = pa.StructArray.from_arrays(children, names=list(loaded))
	    return pa.chunked_array(struct) if has_chunked else struct


	# ┌─────────────┐
	# │  Utilities  │
	# └─────────────┘


	def _create_sized_feature(dset):
	    """Build the feature for a fixed-shape dataset from its per-row shape and dtype."""
	    row_shape = dset.shape[1:]  # drop the leading (row) axis
	    return _create_sized_feature_impl(row_shape, _np_to_pa_to_hf_value(dset.dtype))


	def _create_sized_feature_impl(dset_shape, value_feature):
	    """Wrap ``value_feature`` according to the per-row shape.

	    Args:
	        dset_shape: Shape of a single row (dataset shape without axis 0).
	        value_feature: Scalar feature exposing a ``dtype`` attribute.

	    Returns:
	        ``value_feature`` itself for rank 0, a fixed-length ``List`` for
	        rank 1, an ``ArrayND`` feature for ranks 2-5, or an unsized ``List``
	        (with a warning) when any dimension is 0.

	    Raises:
	        TypeError: If the shape has more than 5 dimensions, none of them 0.
	    """
	    dtype_str = value_feature.dtype
	    if 0 in dset_shape:
	        logger.warning(
	            f"HDF5 to Arrow: Found a dataset with shape {dset_shape} and dtype {dtype_str} that has a dimension with size 0. Shape information will be lost in the conversion to List({value_feature})."
	        )
	        return List(value_feature)

	    rank = len(dset_shape)
	    if rank > 5:
	        raise TypeError(f"Array{rank}D not supported. Maximum 5 dimensions allowed.")
	    if rank == 0:
	        return value_feature
	    if rank == 1:
	        return List(value_feature, length=dset_shape[0])
	    array_cls = _sized_arrayxd(rank)
	    return array_cls(shape=dset_shape, dtype=dtype_str)


	def _sized_arrayxd(rank: int):
	    """Return the ``ArrayND`` feature class for ``rank`` (2-5); KeyError otherwise."""
	    by_rank = dict(zip((2, 3, 4, 5), (Array2D, Array3D, Array4D, Array5D)))
	    return by_rank[rank]


	def _np_to_pa_to_hf_value(numpy_dtype: np.dtype) -> Value:
	    """Map a NumPy dtype to a ``Value`` feature by way of its Arrow type."""
	    arrow_type = pa.from_numpy_dtype(numpy_dtype)
	    hf_dtype = _arrow_to_datasets_dtype(arrow_type)
	    return Value(dtype=hf_dtype)


	def _first_dataset(h5_obj, features: Features, prefix=""):
	    """Find the path of the first dataset covered by ``features``.

	    Args:
	        h5_obj: Group-like object exposing ``items()``.
	        features: Features for this level; other members are ignored.
	        prefix: Path prefix accumulated while descending into groups.

	    Returns:
	        The ``/``-joined path of the first dataset found in iteration
	        order (descending into groups), or None if there is none.
	    """
	    for name, child in h5_obj.items():
	        if name not in features:
	            continue
	        full_path = f"{prefix}{name}"
	        if _is_group(child):
	            nested = _first_dataset(child, features[name], prefix=f"{full_path}/")
	            if nested is not None:
	                return nested
	        elif _is_dataset(child):
	            return full_path
	    return None


	def _check_dataset_lengths(h5_obj, features: Features) -> Optional[int]:
	    """Verify that every dataset covered by ``features`` has the same length.

	    The reference length is taken from the first dataset found (which may
	    live inside a nested group). Datasets in nested groups are checked as
	    well: rows are later sliced with the same ``start:end`` for every
	    dataset, so a longer nested dataset would be silently truncated and a
	    shorter one would fail with a less helpful Arrow error.

	    Args:
	        h5_obj: HDF5 file or group to inspect.
	        features: Features describing which members to consider.

	    Returns:
	        The common number of rows, or None when no dataset was found.

	    Raises:
	        ValueError: If a dataset's first dimension differs from the
	            reference length.
	    """
	    first_path = _first_dataset(h5_obj, features)
	    if first_path is None:
	        return None

	    num_rows = h5_obj[first_path].shape[0]

	    # Iterative walk over (group, features, path prefix) triples.
	    pending = [(h5_obj, features, "")]
	    while pending:
	        group, group_features, prefix = pending.pop()
	        for path, dset in group.items():
	            if path not in group_features:
	                continue
	            if _is_group(dset):
	                pending.append((dset, group_features[path], f"{prefix}{path}/"))
	            elif _is_dataset(dset):
	                if dset.shape[0] != num_rows:
	                    raise ValueError(
	                        f"Dataset '{prefix}{path}' has length {dset.shape[0]} but expected {num_rows}"
	                    )
	    return num_rows


	def _is_group(h5_obj) -> bool:
	    """Tell whether ``h5_obj`` is an ``h5py.Group`` or a ``_CompoundGroup`` adapter."""
	    import h5py

	    group_types = (h5py.Group, _CompoundGroup)
	    return isinstance(h5_obj, group_types)


	def _is_dataset(h5_obj) -> bool:
	    """Tell whether ``h5_obj`` is an ``h5py.Dataset`` or a ``_CompoundField`` adapter."""
	    import h5py

	    dataset_types = (h5py.Dataset, _CompoundField)
	    return isinstance(h5_obj, dataset_types)


	def _is_file(h5_obj) -> bool:
	    """Tell whether ``h5_obj`` is an open ``h5py.File``."""
	    import h5py

	    file_type = h5py.File
	    return isinstance(h5_obj, file_type)


	def _has_zero_dimensions(feature):
	    """Tell whether a feature (or any feature nested in lists) has a zero-sized dimension.

	    ``ArrayND`` features are checked through their ``shape``; ``List``
	    features through their ``length`` and inner feature; ``LargeList``
	    features through their inner feature only. Anything else yields False.
	    """
	    if isinstance(feature, _ArrayXD):
	        return 0 in feature.shape
	    if isinstance(feature, List):
	        if feature.length == 0:
	            return True
	        return _has_zero_dimensions(feature.feature)
	    if isinstance(feature, LargeList):
	        return _has_zero_dimensions(feature.feature)
	    return False