| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | """Contains utilities to manage the HF cache directory.""" |
| | import os |
| | import shutil |
| | import time |
| | from collections import defaultdict |
| | from dataclasses import dataclass |
| | from pathlib import Path |
| | from typing import Dict, FrozenSet, List, Literal, Optional, Set, Union |
| |
|
| | from ..constants import HF_HUB_CACHE |
| | from . import logging |
| |
|
| |
|
| | logger = logging.get_logger(__name__) |
| |
|
| | REPO_TYPE_T = Literal["model", "dataset", "space"] |
| |
|
| |
|
class CacheNotFound(Exception):
    """Raised when the Huggingface cache directory cannot be found.

    The path that was expected to contain the cache is exposed as the
    `cache_dir` attribute.
    """

    # Directory that was expected to contain the HF cache.
    cache_dir: Union[str, Path]

    def __init__(self, msg: str, cache_dir: Union[str, Path], *args, **kwargs):
        # Keep the offending path on the exception so callers can report it.
        self.cache_dir = cache_dir
        super().__init__(msg, *args, **kwargs)
| |
|
| |
|
class CorruptedCacheException(Exception):
    """Exception for any unexpected structure in the Huggingface cache-system.

    Raised while scanning the cache when a repo, snapshot, ref or blob does not
    follow the expected layout; `scan_cache_dir` captures these as warnings so
    the scan can continue.
    """
| |
|
| |
|
@dataclass(frozen=True)
class CachedFileInfo:
    """Frozen data structure holding information about a single cached file.

    Args:
        file_name (`str`):
            Name of the file. Example: `config.json`.
        file_path (`Path`):
            Path of the file in the `snapshots` directory. The file path is a symlink
            referring to a blob in the `blobs` folder.
        blob_path (`Path`):
            Path of the blob file. This is equivalent to `file_path.resolve()`.
        size_on_disk (`int`):
            Size of the blob file in bytes.
        blob_last_accessed (`float`):
            Timestamp of the last time the blob file has been accessed (from any
            revision).
        blob_last_modified (`float`):
            Timestamp of the last time the blob file has been modified/created.

    <Tip warning={true}>

    `blob_last_accessed` and `blob_last_modified` reliability can depend on the OS you
    are using. See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result)
    for more details.

    </Tip>
    """

    # Name of the file (e.g. `config.json`).
    file_name: str
    # Symlink in the `snapshots` directory.
    file_path: Path
    # Resolved target of the symlink, in the `blobs` directory.
    blob_path: Path
    # Size of the blob, in bytes.
    size_on_disk: int

    # Timestamps come from the blob's `stat` result, so they are shared by all
    # revisions pointing to the same blob.
    blob_last_accessed: float
    blob_last_modified: float

    @property
    def blob_last_accessed_str(self) -> str:
        """
        (property) Timestamp of the last time the blob file has been accessed (from any
        revision), returned as a human-readable string.

        Example: "2 weeks ago".
        """
        return _format_timesince(self.blob_last_accessed)

    @property
    def blob_last_modified_str(self) -> str:
        """
        (property) Timestamp of the last time the blob file has been modified, returned
        as a human-readable string.

        Example: "2 weeks ago".
        """
        return _format_timesince(self.blob_last_modified)

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Size of the blob file as a human-readable string.

        Example: "42.2K".
        """
        return _format_size(self.size_on_disk)
| |
|
| |
|
@dataclass(frozen=True)
class CachedRevisionInfo:
    """Frozen data structure holding information about a revision.

    A revision corresponds to a folder in the `snapshots` folder and is populated with
    the exact tree structure as the repo on the Hub but contains only symlinks. A
    revision can be either referenced by 1 or more `refs` or be "detached" (no refs).

    Args:
        commit_hash (`str`):
            Hash of the revision (unique).
            Example: `"9338f7b671827df886678df2bdd7cc7b4f36dffd"`.
        snapshot_path (`Path`):
            Path to the revision directory in the `snapshots` folder. It contains the
            exact tree structure as the repo on the Hub.
        files (`FrozenSet[CachedFileInfo]`):
            Set of [`~CachedFileInfo`] describing all files contained in the snapshot.
        refs (`FrozenSet[str]`):
            Set of `refs` pointing to this revision. If the revision has no `refs`, it
            is considered detached.
            Example: `{"main", "2.4.0"}` or `{"refs/pr/1"}`.
        size_on_disk (`int`):
            Sum of the blob file sizes that are symlink-ed by the revision.
        last_modified (`float`):
            Timestamp of the last time the revision has been created/modified.

    <Tip warning={true}>

    `last_accessed` cannot be determined correctly on a single revision as blob files
    are shared across revisions.

    </Tip>

    <Tip warning={true}>

    `size_on_disk` is not necessarily the sum of all file sizes because of possible
    duplicated files. Besides, only blobs are taken into account, not the (negligible)
    size of folders and symlinks.

    </Tip>
    """

    # Commit hash of the revision (name of the snapshot folder).
    commit_hash: str
    # Folder of symlinks mirroring the repo tree.
    snapshot_path: Path
    # Deduplicated sum of the blob sizes referenced by this revision, in bytes.
    size_on_disk: int
    # All files of the snapshot (one entry per symlink).
    files: FrozenSet[CachedFileInfo]
    # Refs pointing to this revision; empty when detached.
    refs: FrozenSet[str]

    # Most recent blob mtime (or the snapshot folder mtime if empty).
    last_modified: float

    @property
    def last_modified_str(self) -> str:
        """
        (property) Timestamp of the last time the revision has been modified, returned
        as a human-readable string.

        Example: "2 weeks ago".
        """
        return _format_timesince(self.last_modified)

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Sum of the blob file sizes as a human-readable string.

        Example: "42.2K".
        """
        return _format_size(self.size_on_disk)

    @property
    def nb_files(self) -> int:
        """
        (property) Total number of files in the revision.
        """
        return len(self.files)
| |
|
| |
|
@dataclass(frozen=True)
class CachedRepoInfo:
    """Frozen data structure holding information about a cached repository.

    Args:
        repo_id (`str`):
            Repo id of the repo on the Hub. Example: `"google/fleurs"`.
        repo_type (`Literal["dataset", "model", "space"]`):
            Type of the cached repo.
        repo_path (`Path`):
            Local path to the cached repo.
        size_on_disk (`int`):
            Sum of the blob file sizes in the cached repo.
        nb_files (`int`):
            Total number of blob files in the cached repo.
        revisions (`FrozenSet[CachedRevisionInfo]`):
            Set of [`~CachedRevisionInfo`] describing all revisions cached in the repo.
        last_accessed (`float`):
            Timestamp of the last time a blob file of the repo has been accessed.
        last_modified (`float`):
            Timestamp of the last time a blob file of the repo has been modified/created.

    <Tip warning={true}>

    `size_on_disk` is not necessarily the sum of all revisions sizes because of
    duplicated files. Besides, only blobs are taken into account, not the (negligible)
    size of folders and symlinks.

    </Tip>

    <Tip warning={true}>

    `last_accessed` and `last_modified` reliability can depend on the OS you are using.
    See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result)
    for more details.

    </Tip>
    """

    # Repo id on the Hub (e.g. "google/fleurs").
    repo_id: str
    # One of "model", "dataset", "space".
    repo_type: REPO_TYPE_T
    # Local cache folder of the repo (e.g. ".../models--t5-small").
    repo_path: Path
    # Deduplicated sum of blob sizes, in bytes.
    size_on_disk: int
    # Number of distinct blob files.
    nb_files: int
    # All cached revisions (snapshot folders) of the repo.
    revisions: FrozenSet[CachedRevisionInfo]

    # Aggregated over all blobs of the repo (falls back to the repo folder's
    # own stats when the repo has no blob).
    last_accessed: float
    last_modified: float

    @property
    def last_accessed_str(self) -> str:
        """
        (property) Last time a blob file of the repo has been accessed, returned as a
        human-readable string.

        Example: "2 weeks ago".
        """
        return _format_timesince(self.last_accessed)

    @property
    def last_modified_str(self) -> str:
        """
        (property) Last time a blob file of the repo has been modified, returned as a
        human-readable string.

        Example: "2 weeks ago".
        """
        return _format_timesince(self.last_modified)

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Sum of the blob file sizes as a human-readable string.

        Example: "42.2K".
        """
        return _format_size(self.size_on_disk)

    @property
    def refs(self) -> Dict[str, CachedRevisionInfo]:
        """
        (property) Mapping between `refs` and revision data structures.
        """
        # A ref points to exactly one commit hash, so a ref name appears in at
        # most one revision's `refs` set and keys cannot collide here.
        return {ref: revision for revision in self.revisions for ref in revision.refs}
| |
|
| |
|
@dataclass(frozen=True)
class DeleteCacheStrategy:
    """Frozen data structure holding the strategy to delete cached revisions.

    This object is not meant to be instantiated programmatically but to be returned by
    [`~utils.HFCacheInfo.delete_revisions`]. See documentation for usage example.

    Args:
        expected_freed_size (`float`):
            Expected freed size once strategy is executed.
        blobs (`FrozenSet[Path]`):
            Set of blob file paths to be deleted.
        refs (`FrozenSet[Path]`):
            Set of reference file paths to be deleted.
        repos (`FrozenSet[Path]`):
            Set of entire repo paths to be deleted.
        snapshots (`FrozenSet[Path]`):
            Set of snapshots to be deleted (directory of symlinks).
    """

    # Number of bytes expected to be freed by `execute` (deduplicated blobs).
    expected_freed_size: int
    # Blob files no longer referenced by any kept revision.
    blobs: FrozenSet[Path]
    # Ref files pointing to deleted revisions.
    refs: FrozenSet[Path]
    # Repos for which every revision is deleted (entire folder removed).
    repos: FrozenSet[Path]
    # Snapshot folders (symlink trees) of deleted revisions.
    snapshots: FrozenSet[Path]

    @property
    def expected_freed_size_str(self) -> str:
        """
        (property) Expected size that will be freed as a human-readable string.

        Example: "42.2K".
        """
        return _format_size(self.expected_freed_size)

    def execute(self) -> None:
        """Execute the defined strategy.

        <Tip warning={true}>

        If this method is interrupted, the cache might get corrupted. Deletion order is
        implemented so that references and symlinks are deleted before the actual blob
        files.

        </Tip>

        <Tip warning={true}>

        This method is irreversible. If executed, cached files are erased and must be
        downloaded again.

        </Tip>
        """
        # Deletion order matters: repos, snapshots and refs are removed before
        # the blobs so that an interruption leaves as few dangling
        # symlinks/refs as possible (see Tip above).

        # Delete entire repos first (every revision of those repos is deleted).
        for path in self.repos:
            _try_delete_path(path, path_type="repo")

        # Delete snapshot folders (directories of symlinks).
        for path in self.snapshots:
            _try_delete_path(path, path_type="snapshot")

        # Delete ref files pointing to deleted revisions.
        for path in self.refs:
            _try_delete_path(path, path_type="ref")

        # Finally, delete the blob files themselves.
        for path in self.blobs:
            _try_delete_path(path, path_type="blob")

        logger.info(f"Cache deletion done. Saved {self.expected_freed_size_str}.")
| |
|
| |
|
@dataclass(frozen=True)
class HFCacheInfo:
    """Frozen data structure holding information about the entire cache-system.

    This data structure is returned by [`scan_cache_dir`] and is immutable.

    Args:
        size_on_disk (`int`):
            Sum of all valid repo sizes in the cache-system.
        repos (`FrozenSet[CachedRepoInfo]`):
            Set of [`~CachedRepoInfo`] describing all valid cached repos found on the
            cache-system while scanning.
        warnings (`List[CorruptedCacheException]`):
            List of [`~CorruptedCacheException`] that occurred while scanning the cache.
            Those exceptions are captured so that the scan can continue. Corrupted repos
            are skipped from the scan.

    <Tip warning={true}>

    Here `size_on_disk` is equal to the sum of all repo sizes (only blobs). However if
    some cached repos are corrupted, their sizes are not taken into account.

    </Tip>
    """

    # Sum of all valid repo sizes, in bytes (corrupted repos excluded).
    size_on_disk: int
    # All valid cached repos found while scanning.
    repos: FrozenSet[CachedRepoInfo]
    # Exceptions captured for corrupted repos (skipped from the scan).
    warnings: List[CorruptedCacheException]

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Sum of all valid repo sizes in the cache-system as a human-readable
        string.

        Example: "42.2K".
        """
        return _format_size(self.size_on_disk)

    def delete_revisions(self, *revisions: str) -> DeleteCacheStrategy:
        """Prepare the strategy to delete one or more revisions cached locally.

        Input revisions can be any revision hash. If a revision hash is not found in the
        local cache, a warning is thrown but no error is raised. Revisions can be from
        different cached repos since hashes are unique across repos.

        Examples:
        ```py
        >>> from huggingface_hub import scan_cache_dir
        >>> cache_info = scan_cache_dir()
        >>> delete_strategy = cache_info.delete_revisions(
        ...     "81fd1d6e7847c99f5862c9fb81387956d99ec7aa"
        ... )
        >>> print(f"Will free {delete_strategy.expected_freed_size_str}.")
        Will free 7.9K.
        >>> delete_strategy.execute()
        Cache deletion done. Saved 7.9K.
        ```

        ```py
        >>> from huggingface_hub import scan_cache_dir
        >>> scan_cache_dir().delete_revisions(
        ...     "81fd1d6e7847c99f5862c9fb81387956d99ec7aa",
        ...     "e2983b237dccf3ab4937c97fa717319a9ca1a96d",
        ...     "6c0e6080953db56375760c0471a8c5f2929baf11",
        ... ).execute()
        Cache deletion done. Saved 8.6G.
        ```

        <Tip warning={true}>

        `delete_revisions` returns a [`~utils.DeleteCacheStrategy`] object that needs to
        be executed. The [`~utils.DeleteCacheStrategy`] is not meant to be modified but
        allows having a dry run before actually executing the deletion.

        </Tip>
        """
        hashes_to_delete: Set[str] = set(revisions)

        repos_with_revisions: Dict[CachedRepoInfo, Set[CachedRevisionInfo]] = defaultdict(set)

        # Find which repo each requested hash belongs to. Hashes are unique
        # across repos so each hash matches at most one revision.
        for repo in self.repos:
            for revision in repo.revisions:
                if revision.commit_hash in hashes_to_delete:
                    repos_with_revisions[repo].add(revision)
                    hashes_to_delete.remove(revision.commit_hash)

        if len(hashes_to_delete) > 0:
            logger.warning(f"Revision(s) not found - cannot delete them: {', '.join(hashes_to_delete)}")

        delete_strategy_blobs: Set[Path] = set()
        delete_strategy_refs: Set[Path] = set()
        delete_strategy_repos: Set[Path] = set()
        delete_strategy_snapshots: Set[Path] = set()
        delete_strategy_expected_freed_size = 0

        for affected_repo, revisions_to_delete in repos_with_revisions.items():
            other_revisions = affected_repo.revisions - revisions_to_delete

            # If all revisions of the repo are deleted, the entire repo folder
            # is removed at once.
            if len(other_revisions) == 0:
                delete_strategy_repos.add(affected_repo.repo_path)
                delete_strategy_expected_freed_size += affected_repo.size_on_disk
                continue

            # Blobs still referenced by a kept revision must not be deleted.
            # Pre-computing this set once per repo avoids re-scanning every
            # kept revision for each candidate file (previously quadratic).
            blobs_to_keep = {file.blob_path for revision in other_revisions for file in revision.files}

            # Only part of the revisions are deleted: remove their snapshot
            # folders, their refs, and the blobs they alone reference.
            for revision_to_delete in revisions_to_delete:
                # Snapshot folder (directory of symlinks).
                delete_strategy_snapshots.add(revision_to_delete.snapshot_path)

                # Ref files pointing to the deleted revision.
                for ref in revision_to_delete.refs:
                    delete_strategy_refs.add(affected_repo.repo_path / "refs" / ref)

                # Blobs referenced only by deleted revisions. A blob is counted
                # once even if several deleted revisions reference it.
                for file in revision_to_delete.files:
                    if file.blob_path not in delete_strategy_blobs and file.blob_path not in blobs_to_keep:
                        delete_strategy_blobs.add(file.blob_path)
                        delete_strategy_expected_freed_size += file.size_on_disk

        # Return the strategy instead of executing it directly so callers can
        # do a dry run before deleting anything.
        return DeleteCacheStrategy(
            blobs=frozenset(delete_strategy_blobs),
            refs=frozenset(delete_strategy_refs),
            repos=frozenset(delete_strategy_repos),
            snapshots=frozenset(delete_strategy_snapshots),
            expected_freed_size=delete_strategy_expected_freed_size,
        )
| |
|
| |
|
def scan_cache_dir(cache_dir: Optional[Union[str, Path]] = None) -> HFCacheInfo:
    """Scan the entire HF cache-system and return a [`~HFCacheInfo`] structure.

    Use `scan_cache_dir` in order to programmatically scan your cache-system. The cache
    will be scanned repo by repo. If a repo is corrupted, a [`~CorruptedCacheException`]
    will be thrown internally but captured and returned in the [`~HFCacheInfo`]
    structure. Only valid repos get a proper report.

    ```py
    >>> from huggingface_hub import scan_cache_dir

    >>> hf_cache_info = scan_cache_dir()
    HFCacheInfo(
        size_on_disk=3398085269,
        repos=frozenset({
            CachedRepoInfo(
                repo_id='t5-small',
                repo_type='model',
                repo_path=PosixPath(...),
                size_on_disk=970726914,
                nb_files=11,
                revisions=frozenset({
                    CachedRevisionInfo(
                        commit_hash='d78aea13fa7ecd06c29e3e46195d6341255065d5',
                        size_on_disk=970726339,
                        snapshot_path=PosixPath(...),
                        files=frozenset({
                            CachedFileInfo(
                                file_name='config.json',
                                size_on_disk=1197,
                                file_path=PosixPath(...),
                                blob_path=PosixPath(...),
                            ),
                            CachedFileInfo(...),
                            ...
                        }),
                    ),
                    CachedRevisionInfo(...),
                    ...
                }),
            ),
            CachedRepoInfo(...),
            ...
        }),
        warnings=[
            CorruptedCacheException("Snapshots dir doesn't exist in cached repo: ..."),
            CorruptedCacheException(...),
            ...
        ],
    )
    ```

    You can also print a detailed report directly from the `huggingface-cli` using:
    ```text
    > huggingface-cli scan-cache
    REPO ID                     REPO TYPE SIZE ON DISK NB FILES REFS                LOCAL PATH
    --------------------------- --------- ------------ -------- ------------------- -------------------------------------------------------------------------
    glue                        dataset         116.3K       15 1.17.0, main, 2.4.0 /Users/lucain/.cache/huggingface/hub/datasets--glue
    google/fleurs               dataset          64.9M        6 main, refs/pr/1     /Users/lucain/.cache/huggingface/hub/datasets--google--fleurs
    Jean-Baptiste/camembert-ner model          441.0M        7 main                /Users/lucain/.cache/huggingface/hub/models--Jean-Baptiste--camembert-ner
    bert-base-cased             model            1.9G       13 main                /Users/lucain/.cache/huggingface/hub/models--bert-base-cased
    t5-base                     model           10.1K        3 main                /Users/lucain/.cache/huggingface/hub/models--t5-base
    t5-small                    model          970.7M       11 refs/pr/1, main     /Users/lucain/.cache/huggingface/hub/models--t5-small

    Done in 0.0s. Scanned 6 repo(s) for a total of 3.4G.
    Got 1 warning(s) while scanning. Use -vvv to print details.
    ```

    Args:
        cache_dir (`str` or `Path`, `optional`):
            Cache directory to scan. Defaults to the default HF cache directory.

    <Tip warning={true}>

    Raises:

        `CacheNotFound`
          If the cache directory does not exist.

        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
          If the cache directory is a file, instead of a directory.

    </Tip>

    Returns: a [`~HFCacheInfo`] object.
    """
    if cache_dir is None:
        cache_dir = HF_HUB_CACHE

    cache_dir = Path(cache_dir).expanduser().resolve()
    if not cache_dir.exists():
        raise CacheNotFound(
            f"Cache directory not found: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable.",
            cache_dir=cache_dir,
        )

    if cache_dir.is_file():
        raise ValueError(
            f"Scan cache expects a directory but found a file: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable."
        )

    repos: Set[CachedRepoInfo] = set()
    warnings: List[CorruptedCacheException] = []
    for repo_path in cache_dir.iterdir():
        # `.locks` is a technical folder holding lock files, not a cached repo.
        if repo_path.name == ".locks":
            continue
        try:
            repos.add(_scan_cached_repo(repo_path))
        except CorruptedCacheException as e:
            # Corrupted repos are reported as warnings, not fatal errors.
            warnings.append(e)

    return HFCacheInfo(
        repos=frozenset(repos),
        size_on_disk=sum(repo.size_on_disk for repo in repos),
        warnings=warnings,
    )
| |
|
| |
|
def _scan_cached_repo(repo_path: Path) -> CachedRepoInfo:
    """Scan a single cache repo and return information about it.

    Any unexpected behavior will raise a [`~CorruptedCacheException`].

    Args:
        repo_path (`Path`):
            Path to a cached repo folder (e.g. `.../models--t5-small`).

    Returns: a [`~CachedRepoInfo`] object.
    """
    if not repo_path.is_dir():
        raise CorruptedCacheException(f"Repo path is not a directory: {repo_path}")

    if "--" not in repo_path.name:
        raise CorruptedCacheException(f"Repo path is not a valid HuggingFace cache directory: {repo_path}")

    # Folder name is expected to look like `<repo_type>s--<repo_id with '/'
    # replaced by '--'>` (e.g. `models--t5-small`, `datasets--google--fleurs`).
    repo_type, repo_id = repo_path.name.split("--", maxsplit=1)
    repo_type = repo_type[:-1]  # drop trailing "s": "models" -> "model"
    repo_id = repo_id.replace("--", "/")  # "google--fleurs" -> "google/fleurs"

    if repo_type not in {"dataset", "model", "space"}:
        raise CorruptedCacheException(
            f"Repo type must be `dataset`, `model` or `space`, found `{repo_type}` ({repo_path})."
        )

    # `stat` result of every blob, keyed by resolved blob path. Also serves to
    # deduplicate blobs referenced by several revisions.
    blob_stats: Dict[Path, os.stat_result] = {}

    snapshots_path = repo_path / "snapshots"
    refs_path = repo_path / "refs"

    if not snapshots_path.exists() or not snapshots_path.is_dir():
        raise CorruptedCacheException(f"Snapshots dir doesn't exist in cached repo: {snapshots_path}")

    # Scan the `refs` folder: map each commit hash to the set of ref names
    # pointing to it (e.g. "main", "refs/pr/1"). The folder may be missing,
    # e.g. when the repo was downloaded by commit hash only.
    refs_by_hash: Dict[str, Set[str]] = defaultdict(set)
    if refs_path.exists():
        if refs_path.is_file():
            raise CorruptedCacheException(f"Refs directory cannot be a file: {refs_path}")

        for ref_path in refs_path.glob("**/*"):
            # glob("**/*") also yields intermediate directories (e.g. "refs/pr")
            if ref_path.is_dir():
                continue

            # Ref name is the path relative to the `refs` folder (e.g. "refs/pr/1")
            ref_name = str(ref_path.relative_to(refs_path))
            # Each ref file contains the commit hash it points to.
            with ref_path.open() as f:
                commit_hash = f.read()

            refs_by_hash[commit_hash].add(ref_name)

    # Scan the `snapshots` folder: one sub-folder per cached revision.
    cached_revisions: Set[CachedRevisionInfo] = set()
    for revision_path in snapshots_path.iterdir():
        if revision_path.is_file():
            raise CorruptedCacheException(f"Snapshots folder corrupted. Found a file: {revision_path}")

        cached_files = set()
        for file_path in revision_path.glob("**/*"):
            # glob("**/*") also yields intermediate directories
            if file_path.is_dir():
                continue

            # Files in a snapshot are symlinks to blobs; resolve to the blob.
            # NOTE: `file_path` is already a `Path` (no need to re-wrap it).
            blob_path = file_path.resolve()
            if not blob_path.exists():
                raise CorruptedCacheException(f"Blob missing (broken symlink): {blob_path}")

            # Stat each blob only once, even if shared across revisions.
            if blob_path not in blob_stats:
                blob_stats[blob_path] = blob_path.stat()

            cached_files.add(
                CachedFileInfo(
                    file_name=file_path.name,
                    file_path=file_path,
                    size_on_disk=blob_stats[blob_path].st_size,
                    blob_path=blob_path,
                    blob_last_accessed=blob_stats[blob_path].st_atime,
                    blob_last_modified=blob_stats[blob_path].st_mtime,
                )
            )

        # Revision "last modified" is the most recent blob mtime, falling back
        # to the snapshot folder's own mtime for an empty revision.
        if len(cached_files) > 0:
            revision_last_modified = max(blob_stats[file.blob_path].st_mtime for file in cached_files)
        else:
            revision_last_modified = revision_path.stat().st_mtime

        cached_revisions.add(
            CachedRevisionInfo(
                commit_hash=revision_path.name,
                files=frozenset(cached_files),
                # `pop` so that leftover entries flag refs to missing revisions.
                refs=frozenset(refs_by_hash.pop(revision_path.name, set())),
                # Deduplicate blob paths before summing sizes.
                size_on_disk=sum(
                    blob_stats[blob_path].st_size for blob_path in {file.blob_path for file in cached_files}
                ),
                snapshot_path=revision_path,
                last_modified=revision_last_modified,
            )
        )

    # Every ref must point to an existing snapshot folder.
    if len(refs_by_hash) > 0:
        raise CorruptedCacheException(
            f"Reference(s) refer to missing commit hashes: {dict(refs_by_hash)} ({repo_path})."
        )

    # Repo-level timestamps are aggregated over blobs; fall back to the repo
    # folder's own stats when the repo contains no blob at all.
    if len(blob_stats) > 0:
        repo_last_accessed = max(stat.st_atime for stat in blob_stats.values())
        repo_last_modified = max(stat.st_mtime for stat in blob_stats.values())
    else:
        repo_stats = repo_path.stat()
        repo_last_accessed = repo_stats.st_atime
        repo_last_modified = repo_stats.st_mtime

    # Build the frozen top-level structure for the repo.
    return CachedRepoInfo(
        nb_files=len(blob_stats),
        repo_id=repo_id,
        repo_path=repo_path,
        repo_type=repo_type,
        revisions=frozenset(cached_revisions),
        size_on_disk=sum(stat.st_size for stat in blob_stats.values()),
        last_accessed=repo_last_accessed,
        last_modified=repo_last_modified,
    )
| |
|
| |
|
| | def _format_size(num: int) -> str: |
| | """Format size in bytes into a human-readable string. |
| | |
| | Taken from https://stackoverflow.com/a/1094933 |
| | """ |
| | num_f = float(num) |
| | for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]: |
| | if abs(num_f) < 1000.0: |
| | return f"{num_f:3.1f}{unit}" |
| | num_f /= 1000.0 |
| | return f"{num_f:.1f}Y" |
| |
|
| |
|
# Time units used by `_format_timesince`, as tuples of:
#   (label, number of seconds in one unit, largest count displayed with that unit)
# A `None` max value means the unit is unbounded (years).
_TIMESINCE_CHUNKS = (
    # Label, divider, max value
    ("second", 1, 60),
    ("minute", 60, 60),
    ("hour", 60 * 60, 24),
    ("day", 60 * 60 * 24, 6),
    ("week", 60 * 60 * 24 * 7, 6),
    ("month", 60 * 60 * 24 * 30, 11),
    ("year", 60 * 60 * 24 * 365, None),
)
| |
|
| |
|
| | def _format_timesince(ts: float) -> str: |
| | """Format timestamp in seconds into a human-readable string, relative to now. |
| | |
| | Vaguely inspired by Django's `timesince` formatter. |
| | """ |
| | delta = time.time() - ts |
| | if delta < 20: |
| | return "a few seconds ago" |
| | for label, divider, max_value in _TIMESINCE_CHUNKS: |
| | value = round(delta / divider) |
| | if max_value is not None and value <= max_value: |
| | break |
| | return f"{value} {label}{'s' if value > 1 else ''} ago" |
| |
|
| |
|
def _try_delete_path(path: Path, path_type: str) -> None:
    """Best-effort deletion of a local file or folder.

    A missing path or a permission issue is logged as a warning and ignored,
    never raised.

    Args:
        path (`Path`)
            Path to delete. Can be a file or a folder.
        path_type (`str`)
            What path are we deleting ? Only for logging purposes. Example: "snapshot".
    """
    logger.info(f"Delete {path_type}: {path}")
    try:
        if path.is_file():
            # Single file (or symlink): remove it directly.
            path.unlink()
        else:
            # Directory: remove it recursively.
            shutil.rmtree(path)
    except FileNotFoundError:
        logger.warning(f"Couldn't delete {path_type}: file not found ({path})", exc_info=True)
    except PermissionError:
        logger.warning(f"Couldn't delete {path_type}: permission denied ({path})", exc_info=True)
| |
|