build-tools / datasets /utils /info_utils.py

Add files using upload-large-folder tool

69e1a8d verified 2 months ago

4.33 kB

	import enum
	import os
	from typing import Optional

	from huggingface_hub.utils import insecure_hashlib

	from .. import config
	from ..exceptions import (
	ExpectedMoreDownloadedFilesError,
	ExpectedMoreSplitsError,
	NonMatchingChecksumError,
	NonMatchingSplitsSizesError,
	UnexpectedDownloadedFileError,
	UnexpectedSplitsError,
	)
	from .logging import get_logger


	# Module-level logger, keyed on this module's `__name__`; used by the verify_* helpers below.
	logger = get_logger(__name__)


	class VerificationMode(enum.Enum):
	    """`Enum` that specifies which verification checks to run.

	    The default mode is `BASIC_CHECKS`, which will perform only rudimentary checks to avoid slowdowns
	    when generating/downloading a dataset for the first time.

	    The verification modes:

	    |                           | Verification checks                                                           |
	    |---------------------------|-------------------------------------------------------------------------------|
	    | `ALL_CHECKS`              | Split checks, uniqueness of the keys yielded in case of the GeneratorBuilder  |
	    |                           | and the validity (number of files, checksums, etc.) of downloaded files       |
	    | `BASIC_CHECKS` (default)  | Same as `ALL_CHECKS` but without checking downloaded files                    |
	    | `NO_CHECKS`               | None                                                                          |

	    """

	    # The table above uses plain pipes on purpose: a backslash-escaped pipe is not a valid
	    # escape sequence in a regular string literal and triggers a warning at compile time.
	    ALL_CHECKS = "all_checks"
	    BASIC_CHECKS = "basic_checks"
	    NO_CHECKS = "no_checks"


	def verify_checksums(expected_checksums: Optional[dict], recorded_checksums: dict, verification_name=None):
	    """Check that the recorded checksums agree with the expected ones.

	    Args:
	        expected_checksums: Expected checksum entries keyed by URL (going by the naming used
	            in this function), or `None`, in which case nothing is verified.
	        recorded_checksums: Checksum entries that were actually recorded, with the same keys.
	        verification_name: Optional label appended to the log and error messages.

	    Raises:
	        ExpectedMoreDownloadedFilesError: Some expected keys are missing from `recorded_checksums`.
	        UnexpectedDownloadedFileError: `recorded_checksums` contains keys that were not expected.
	        NonMatchingChecksumError: A key is present on both sides but its values differ.
	    """
	    if expected_checksums is None:
	        logger.info("Unable to verify checksums.")
	        return

	    expected_keys = set(expected_checksums)
	    recorded_keys = set(recorded_checksums)

	    missing = expected_keys - recorded_keys
	    if missing:
	        raise ExpectedMoreDownloadedFilesError(str(missing))

	    unexpected = recorded_keys - expected_keys
	    if unexpected:
	        raise UnexpectedDownloadedFileError(str(unexpected))

	    # From here on both mappings have exactly the same keys, so indexing cannot miss.
	    mismatched = [url for url, checksum in expected_checksums.items() if checksum != recorded_checksums[url]]
	    suffix = "" if verification_name is None else " for " + verification_name
	    if mismatched:
	        raise NonMatchingChecksumError(
	            f"Checksums didn't match{suffix}:\n"
	            f"{mismatched}\n"
	            "Set `verification_mode='no_checks'` to skip checksums verification and ignore this error"
	        )
	    logger.info("All the checksums matched successfully" + suffix)


	def verify_splits(expected_splits: Optional[dict], recorded_splits: dict):
	    """Check that the recorded splits agree with the expected ones.

	    Split names must match exactly, and for every name the `num_examples` attribute of the
	    expected and recorded entries must be equal.

	    Args:
	        expected_splits: Expected split entries keyed by split name, or `None`, in which case
	            nothing is verified.
	        recorded_splits: Split entries that were actually recorded, keyed by split name.

	    Raises:
	        ExpectedMoreSplitsError: Some expected split names are missing from `recorded_splits`.
	        UnexpectedSplitsError: `recorded_splits` contains split names that were not expected.
	        NonMatchingSplitsSizesError: A split exists on both sides but `num_examples` differs.
	    """
	    if expected_splits is None:
	        logger.info("Unable to verify splits sizes.")
	        return

	    expected_names = set(expected_splits)
	    recorded_names = set(recorded_splits)

	    missing = expected_names - recorded_names
	    if missing:
	        raise ExpectedMoreSplitsError(str(missing))

	    unexpected = recorded_names - expected_names
	    if unexpected:
	        raise UnexpectedSplitsError(str(unexpected))

	    # Both mappings share the same names at this point; collect every size disagreement.
	    mismatches = []
	    for name, expected in expected_splits.items():
	        recorded = recorded_splits[name]
	        if expected.num_examples != recorded.num_examples:
	            mismatches.append({"expected": expected, "recorded": recorded})

	    if mismatches:
	        raise NonMatchingSplitsSizesError(str(mismatches))
	    logger.info("All the splits matched successfully.")


	def get_size_checksum_dict(path: str, record_checksum: bool = True) -> dict:
	    """Return the size in bytes of the file at `path` and, optionally, its sha256 hex digest.

	    Args:
	        path: Path of the file to inspect.
	        record_checksum: When `False`, hashing is skipped and the checksum is reported as `None`.

	    Returns:
	        dict: `{"num_bytes": <file size>, "checksum": <hex digest or None>}`.
	    """
	    checksum = None
	    if record_checksum:
	        hasher = insecure_hashlib.sha256()
	        with open(path, "rb") as stream:
	            # Read in 1 MiB pieces so the whole file is never held in memory at once.
	            while True:
	                piece = stream.read(1 << 20)
	                if not piece:
	                    break
	                hasher.update(piece)
	        checksum = hasher.hexdigest()
	    return {"num_bytes": os.path.getsize(path), "checksum": checksum}


	def is_small_dataset(dataset_size):
	    """Tell whether `dataset_size` is strictly below `config.IN_MEMORY_MAX_SIZE`.

	    Args:
	        dataset_size (int): Dataset size in bytes.

	    Returns:
	        bool: `True` only when both `dataset_size` and `config.IN_MEMORY_MAX_SIZE` are truthy
	        and `dataset_size` is the smaller of the two; `False` otherwise.
	    """
	    # A falsy size (None or 0) or a falsy limit means the comparison is never made.
	    if not dataset_size or not config.IN_MEMORY_MAX_SIZE:
	        return False
	    return dataset_size < config.IN_MEMORY_MAX_SIZE