| import enum | |
| import os | |
| from typing import Optional | |
| from huggingface_hub.utils import insecure_hashlib | |
| from .. import config | |
| from ..exceptions import ( | |
| ExpectedMoreDownloadedFilesError, | |
| ExpectedMoreSplitsError, | |
| NonMatchingChecksumError, | |
| NonMatchingSplitsSizesError, | |
| UnexpectedDownloadedFileError, | |
| UnexpectedSplitsError, | |
| ) | |
| from .logging import get_logger | |
| logger = get_logger(__name__) | |
class VerificationMode(enum.Enum):
    """Selects how much verification is performed on a dataset.

    `BASIC_CHECKS` is the default: it keeps only the lightweight checks so that
    generating/downloading a dataset for the first time is not slowed down.

    Available modes:

    | Mode                      | Verification checks                                                          |
    |---------------------------|------------------------------------------------------------------------------|
    | `ALL_CHECKS`              | Split checks, uniqueness of the keys yielded in case of the GeneratorBuilder |
    |                           | and the validity (number of files, checksums, etc.) of downloaded files      |
    | `BASIC_CHECKS` (default)  | Same as `ALL_CHECKS` but without checking downloaded files                   |
    | `NO_CHECKS`               | None                                                                         |
    """

    # Every check, including validation of the downloaded files.
    ALL_CHECKS = "all_checks"
    # Default: everything in ALL_CHECKS except the downloaded-file validation.
    BASIC_CHECKS = "basic_checks"
    # Verification disabled entirely.
    NO_CHECKS = "no_checks"
def verify_checksums(expected_checksums: Optional[dict], recorded_checksums: dict, verification_name=None):
    """Compare the recorded checksum entries against the expected ones.

    Args:
        expected_checksums: Reference entries keyed by URL, or `None` when no
            reference is available, in which case the check is skipped and
            only an info message is logged.
        recorded_checksums: Entries that were actually recorded, keyed the
            same way.
        verification_name: Optional label appended (as " for <name>") to the
            success log message and to the mismatch error message.

    Raises:
        ExpectedMoreDownloadedFilesError: Expected keys are absent from
            `recorded_checksums`.
        UnexpectedDownloadedFileError: `recorded_checksums` holds keys that
            were not expected.
        NonMatchingChecksumError: The key sets agree but at least one entry
            differs between the two mappings.
    """
    if expected_checksums is None:
        logger.info("Unable to verify checksums.")
        return

    expected_urls = set(expected_checksums)
    recorded_urls = set(recorded_checksums)

    # Key-set mismatches are reported before any entry is compared.
    missing_urls = expected_urls - recorded_urls
    if missing_urls:
        raise ExpectedMoreDownloadedFilesError(str(missing_urls))
    extra_urls = recorded_urls - expected_urls
    if extra_urls:
        raise UnexpectedDownloadedFileError(str(extra_urls))

    mismatched_urls = [
        url for url, expected_entry in expected_checksums.items() if expected_entry != recorded_checksums[url]
    ]
    name_suffix = "" if verification_name is None else " for " + verification_name
    if mismatched_urls:
        raise NonMatchingChecksumError(
            f"Checksums didn't match{name_suffix}:\n"
            f"{mismatched_urls}\n"
            "Set `verification_mode='no_checks'` to skip checksums verification and ignore this error"
        )
    logger.info("All the checksums matched successfully" + name_suffix)
def verify_splits(expected_splits: Optional[dict], recorded_splits: dict):
    """Check that the recorded splits agree with the expected ones.

    Split names must be identical on both sides, and each pair of split
    entries must report the same `num_examples`.

    Args:
        expected_splits: Reference split entries keyed by split name, or
            `None` when no reference is available, in which case the check is
            skipped and only an info message is logged.
        recorded_splits: Split entries that were actually produced, keyed by
            split name.

    Raises:
        ExpectedMoreSplitsError: Expected split names are absent from
            `recorded_splits`.
        UnexpectedSplitsError: `recorded_splits` holds names that were not
            expected.
        NonMatchingSplitsSizesError: At least one split has a different
            `num_examples` than expected.
    """
    if expected_splits is None:
        logger.info("Unable to verify splits sizes.")
        return

    expected_names = set(expected_splits)
    recorded_names = set(recorded_splits)

    # Name mismatches are reported before any size is compared.
    missing_names = expected_names - recorded_names
    if missing_names:
        raise ExpectedMoreSplitsError(str(missing_names))
    extra_names = recorded_names - expected_names
    if extra_names:
        raise UnexpectedSplitsError(str(extra_names))

    size_mismatches = []
    for split_name, expected_split in expected_splits.items():
        recorded_split = recorded_splits[split_name]
        if expected_split.num_examples != recorded_split.num_examples:
            size_mismatches.append({"expected": expected_split, "recorded": recorded_split})
    if size_mismatches:
        raise NonMatchingSplitsSizesError(str(size_mismatches))
    logger.info("All the splits matched successfully.")
def get_size_checksum_dict(path: str, record_checksum: bool = True) -> dict:
    """Return the size in bytes and, optionally, the sha256 hex digest of a file.

    Args:
        path: Location of the file to inspect.
        record_checksum: When true, the file is read in 1 MiB chunks and its
            sha256 digest is computed; when false, hashing is skipped.

    Returns:
        A dict with two keys: `"num_bytes"` (the value of
        `os.path.getsize(path)`) and `"checksum"` (the hex digest, or `None`
        when `record_checksum` is false).
    """
    checksum = None
    if record_checksum:
        hasher = insecure_hashlib.sha256()
        with open(path, "rb") as stream:
            # Chunked reads keep memory bounded for large files.
            while True:
                block = stream.read(1 << 20)
                if block == b"":
                    break
                hasher.update(block)
        checksum = hasher.hexdigest()
    return {"num_bytes": os.path.getsize(path), "checksum": checksum}
def is_small_dataset(dataset_size):
    """Tell whether a dataset is below the `config.IN_MEMORY_MAX_SIZE` threshold.

    Args:
        dataset_size (int): Dataset size in bytes.

    Returns:
        bool: `False` when `dataset_size` or `config.IN_MEMORY_MAX_SIZE` is
        falsy (e.g. `None` or `0`); otherwise the result of
        `dataset_size < config.IN_MEMORY_MAX_SIZE`.
    """
    # A falsy size or a falsy limit means the comparison is never made.
    if not dataset_size or not config.IN_MEMORY_MAX_SIZE:
        return False
    return dataset_size < config.IN_MEMORY_MAX_SIZE