dheeena committed
Commit b4e8784 · verified · 1 Parent(s): 7cb8c9d

Add files using upload-large-folder tool
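For context, the commit message above is the default title produced by huggingface_hub's upload-large-folder tool. A minimal sketch of the call that yields such a commit (the repo id and folder path below are illustrative placeholders, not values taken from this commit):

    from huggingface_hub import HfApi

    api = HfApi()
    # upload_large_folder() splits a large local tree into resumable,
    # parallel uploads and commits the result in batches; its default
    # commit title is "Add files using upload-large-folder tool".
    api.upload_large_folder(
        repo_id="user/repo",           # placeholder
        folder_path="./local-folder",  # placeholder
        repo_type="model",             # this API requires an explicit repo type
    )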

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/INSTALLER +1 -0
  2. venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/METADATA +78 -0
  3. venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/RECORD +14 -0
  4. venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/WHEEL +5 -0
  5. venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/top_level.txt +1 -0
  6. venv/lib/python3.13/site-packages/certifi/__init__.py +4 -0
  7. venv/lib/python3.13/site-packages/certifi/__main__.py +12 -0
  8. venv/lib/python3.13/site-packages/certifi/cacert.pem +0 -0
  9. venv/lib/python3.13/site-packages/certifi/core.py +83 -0
  10. venv/lib/python3.13/site-packages/certifi/py.typed +0 -0
  11. venv/lib/python3.13/site-packages/charset_normalizer/__init__.py +48 -0
  12. venv/lib/python3.13/site-packages/charset_normalizer/__main__.py +6 -0
  13. venv/lib/python3.13/site-packages/charset_normalizer/api.py +669 -0
  14. venv/lib/python3.13/site-packages/charset_normalizer/cd.py +395 -0
  15. venv/lib/python3.13/site-packages/charset_normalizer/constant.py +2015 -0
  16. venv/lib/python3.13/site-packages/charset_normalizer/legacy.py +80 -0
  17. venv/lib/python3.13/site-packages/charset_normalizer/md.cpython-313-x86_64-linux-gnu.so +0 -0
  18. venv/lib/python3.13/site-packages/charset_normalizer/md.py +635 -0
  19. venv/lib/python3.13/site-packages/charset_normalizer/models.py +360 -0
  20. venv/lib/python3.13/site-packages/charset_normalizer/py.typed +0 -0
  21. venv/lib/python3.13/site-packages/charset_normalizer/utils.py +414 -0
  22. venv/lib/python3.13/site-packages/charset_normalizer/version.py +8 -0
  23. venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/INSTALLER +1 -0
  24. venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/METADATA +42 -0
  25. venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/RECORD +24 -0
  26. venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/WHEEL +4 -0
  27. venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/INSTALLER +1 -0
  28. venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/METADATA +87 -0
  29. venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/RECORD +8 -0
  30. venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/WHEEL +4 -0
  31. venv/lib/python3.13/site-packages/huggingface_hub/__init__.py +1554 -0
  32. venv/lib/python3.13/site-packages/huggingface_hub/_commit_api.py +968 -0
  33. venv/lib/python3.13/site-packages/huggingface_hub/_commit_scheduler.py +350 -0
  34. venv/lib/python3.13/site-packages/huggingface_hub/_inference_endpoints.py +413 -0
  35. venv/lib/python3.13/site-packages/huggingface_hub/_jobs_api.py +301 -0
  36. venv/lib/python3.13/site-packages/huggingface_hub/_local_folder.py +447 -0
  37. venv/lib/python3.13/site-packages/huggingface_hub/_login.py +514 -0
  38. venv/lib/python3.13/site-packages/huggingface_hub/_oauth.py +460 -0
  39. venv/lib/python3.13/site-packages/huggingface_hub/_snapshot_download.py +343 -0
  40. venv/lib/python3.13/site-packages/huggingface_hub/_space_api.py +168 -0
  41. venv/lib/python3.13/site-packages/huggingface_hub/_tensorboard_logger.py +190 -0
  42. venv/lib/python3.13/site-packages/huggingface_hub/_upload_large_folder.py +755 -0
  43. venv/lib/python3.13/site-packages/huggingface_hub/_webhooks_payload.py +137 -0
  44. venv/lib/python3.13/site-packages/huggingface_hub/_webhooks_server.py +376 -0
  45. venv/lib/python3.13/site-packages/huggingface_hub/community.py +363 -0
  46. venv/lib/python3.13/site-packages/huggingface_hub/constants.py +294 -0
  47. venv/lib/python3.13/site-packages/huggingface_hub/dataclasses.py +484 -0
  48. venv/lib/python3.13/site-packages/huggingface_hub/errors.py +377 -0
  49. venv/lib/python3.13/site-packages/huggingface_hub/fastai_utils.py +415 -0
  50. venv/lib/python3.13/site-packages/huggingface_hub/file_download.py +1813 -0
venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
+ pip
venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/METADATA ADDED
@@ -0,0 +1,78 @@
+ Metadata-Version: 2.4
+ Name: certifi
+ Version: 2025.11.12
+ Summary: Python package for providing Mozilla's CA Bundle.
+ Home-page: https://github.com/certifi/python-certifi
+ Author: Kenneth Reitz
+ Author-email: me@kennethreitz.com
+ License: MPL-2.0
+ Project-URL: Source, https://github.com/certifi/python-certifi
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
+ Classifier: Natural Language :: English
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: Python :: 3.7
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Requires-Python: >=3.7
+ License-File: LICENSE
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: project-url
+ Dynamic: requires-python
+ Dynamic: summary
+
+ Certifi: Python SSL Certificates
+ ================================
+
+ Certifi provides Mozilla's carefully curated collection of Root Certificates for
+ validating the trustworthiness of SSL certificates while verifying the identity
+ of TLS hosts. It has been extracted from the `Requests`_ project.
+
+ Installation
+ ------------
+
+ ``certifi`` is available on PyPI. Simply install it with ``pip``::
+
+     $ pip install certifi
+
+ Usage
+ -----
+
+ To reference the installed certificate authority (CA) bundle, you can use the
+ built-in function::
+
+     >>> import certifi
+
+     >>> certifi.where()
+     '/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
+
+ Or from the command line::
+
+     $ python -m certifi
+     /usr/local/lib/python3.7/site-packages/certifi/cacert.pem
+
+ Enjoy!
+
+ .. _`Requests`: https://requests.readthedocs.io/en/master/
+
+ Addition/Removal of Certificates
+ --------------------------------
+
+ Certifi does not support any addition/removal or other modification of the
+ CA trust store content. This project is intended to provide a reliable and
+ highly portable root of trust to python deployments. Look to upstream projects
+ for methods to use alternate trust.
venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/RECORD ADDED
@@ -0,0 +1,14 @@
+ certifi-2025.11.12.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+ certifi-2025.11.12.dist-info/METADATA,sha256=_JprGu_1lWSdHlruRBKcorXnrfvBDhvX_6KRr8HQbLc,2475
+ certifi-2025.11.12.dist-info/RECORD,,
+ certifi-2025.11.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ certifi-2025.11.12.dist-info/licenses/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
+ certifi-2025.11.12.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
+ certifi/__init__.py,sha256=1BRSxNMnZW7CZ2oJtYWLoJgfHfcB9i273exwiPwfjJM,94
+ certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
+ certifi/__pycache__/__init__.cpython-313.pyc,,
+ certifi/__pycache__/__main__.cpython-313.pyc,,
+ certifi/__pycache__/core.cpython-313.pyc,,
+ certifi/cacert.pem,sha256=oa1dZD4hxDtb7XTH4IkdzbWPavUcis4eTwINZUqlKhY,283932
+ certifi/core.py,sha256=XFXycndG5pf37ayeF8N32HUuDafsyhkVMbO4BAPWHa0,3394
+ certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ certifi
venv/lib/python3.13/site-packages/certifi/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .core import contents, where
+
+ __all__ = ["contents", "where"]
+ __version__ = "2025.11.12"
venv/lib/python3.13/site-packages/certifi/__main__.py ADDED
@@ -0,0 +1,12 @@
+ import argparse
+
+ from certifi import contents, where
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-c", "--contents", action="store_true")
+ args = parser.parse_args()
+
+ if args.contents:
+     print(contents())
+ else:
+     print(where())
venv/lib/python3.13/site-packages/certifi/cacert.pem ADDED
The diff for this file is too large to render.
venv/lib/python3.13/site-packages/certifi/core.py ADDED
@@ -0,0 +1,83 @@
+ """
+ certifi.py
+ ~~~~~~~~~~
+
+ This module returns the installation location of cacert.pem or its contents.
+ """
+ import sys
+ import atexit
+
+ def exit_cacert_ctx() -> None:
+     _CACERT_CTX.__exit__(None, None, None)  # type: ignore[union-attr]
+
+
+ if sys.version_info >= (3, 11):
+
+     from importlib.resources import as_file, files
+
+     _CACERT_CTX = None
+     _CACERT_PATH = None
+
+     def where() -> str:
+         # This is slightly terrible, but we want to delay extracting the file
+         # in cases where we're inside of a zipimport situation until someone
+         # actually calls where(), but we don't want to re-extract the file
+         # on every call of where(), so we'll do it once then store it in a
+         # global variable.
+         global _CACERT_CTX
+         global _CACERT_PATH
+         if _CACERT_PATH is None:
+             # This is slightly janky, the importlib.resources API wants you to
+             # manage the cleanup of this file, so it doesn't actually return a
+             # path, it returns a context manager that will give you the path
+             # when you enter it and will do any cleanup when you leave it. In
+             # the common case of not needing a temporary file, it will just
+             # return the file system location and the __exit__() is a no-op.
+             #
+             # We also have to hold onto the actual context manager, because
+             # it will do the cleanup whenever it gets garbage collected, so
+             # we will also store that at the global level as well.
+             _CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
+             _CACERT_PATH = str(_CACERT_CTX.__enter__())
+             atexit.register(exit_cacert_ctx)
+
+         return _CACERT_PATH
+
+     def contents() -> str:
+         return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
+
+ else:
+
+     from importlib.resources import path as get_path, read_text
+
+     _CACERT_CTX = None
+     _CACERT_PATH = None
+
+     def where() -> str:
+         # This is slightly terrible, but we want to delay extracting the
+         # file in cases where we're inside of a zipimport situation until
+         # someone actually calls where(), but we don't want to re-extract
+         # the file on every call of where(), so we'll do it once then store
+         # it in a global variable.
+         global _CACERT_CTX
+         global _CACERT_PATH
+         if _CACERT_PATH is None:
+             # This is slightly janky, the importlib.resources API wants you
+             # to manage the cleanup of this file, so it doesn't actually
+             # return a path, it returns a context manager that will give
+             # you the path when you enter it and will do any cleanup when
+             # you leave it. In the common case of not needing a temporary
+             # file, it will just return the file system location and the
+             # __exit__() is a no-op.
+             #
+             # We also have to hold onto the actual context manager, because
+             # it will do the cleanup whenever it gets garbage collected, so
+             # we will also store that at the global level as well.
+             _CACERT_CTX = get_path("certifi", "cacert.pem")
+             _CACERT_PATH = str(_CACERT_CTX.__enter__())
+             atexit.register(exit_cacert_ctx)
+
+         return _CACERT_PATH
+
+     def contents() -> str:
+         return read_text("certifi", "cacert.pem", encoding="ascii")
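As an aside, downstream code consumes the module above through where() and contents(); a minimal illustrative sketch using only the standard library plus certifi:

    import ssl

    import certifi

    # Build an SSL context that trusts exactly the Mozilla bundle shipped
    # by certifi; where() returns the filesystem path to cacert.pem.
    context = ssl.create_default_context(cafile=certifi.where())

    print(certifi.where())          # e.g. .../site-packages/certifi/cacert.pem
    print(len(certifi.contents()))  # the bundle is also available as PEM text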
venv/lib/python3.13/site-packages/certifi/py.typed ADDED
File without changes
venv/lib/python3.13/site-packages/charset_normalizer/__init__.py ADDED
@@ -0,0 +1,48 @@
+ """
+ Charset-Normalizer
+ ~~~~~~~~~~~~~~
+ The Real First Universal Charset Detector.
+ A library that helps you read text from an unknown charset encoding.
+ Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
+ All IANA character set names for which the Python core library provides codecs are supported.
+
+ Basic usage:
+     >>> from charset_normalizer import from_bytes
+     >>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
+     >>> best_guess = results.best()
+     >>> str(best_guess)
+     'Bсеки човек има право на образование. Oбразованието!'
+
+ Others methods and usages are available - see the full documentation
+ at <https://github.com/Ousret/charset_normalizer>.
+ :copyright: (c) 2021 by Ahmed TAHRI
+ :license: MIT, see LICENSE for more details.
+ """
+
+ from __future__ import annotations
+
+ import logging
+
+ from .api import from_bytes, from_fp, from_path, is_binary
+ from .legacy import detect
+ from .models import CharsetMatch, CharsetMatches
+ from .utils import set_logging_handler
+ from .version import VERSION, __version__
+
+ __all__ = (
+     "from_fp",
+     "from_path",
+     "from_bytes",
+     "is_binary",
+     "detect",
+     "CharsetMatch",
+     "CharsetMatches",
+     "__version__",
+     "VERSION",
+     "set_logging_handler",
+ )
+
+ # Attach a NullHandler to the top level logger by default
+ # https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
+
+ logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
venv/lib/python3.13/site-packages/charset_normalizer/__main__.py ADDED
@@ -0,0 +1,6 @@
+ from __future__ import annotations
+
+ from .cli import cli_detect
+
+ if __name__ == "__main__":
+     cli_detect()
venv/lib/python3.13/site-packages/charset_normalizer/api.py ADDED
@@ -0,0 +1,669 @@
+ from __future__ import annotations
+
+ import logging
+ from os import PathLike
+ from typing import BinaryIO
+
+ from .cd import (
+     coherence_ratio,
+     encoding_languages,
+     mb_encoding_languages,
+     merge_coherence_ratios,
+ )
+ from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
+ from .md import mess_ratio
+ from .models import CharsetMatch, CharsetMatches
+ from .utils import (
+     any_specified_encoding,
+     cut_sequence_chunks,
+     iana_name,
+     identify_sig_or_bom,
+     is_cp_similar,
+     is_multi_byte_encoding,
+     should_strip_sig_or_bom,
+ )
+
+ logger = logging.getLogger("charset_normalizer")
+ explain_handler = logging.StreamHandler()
+ explain_handler.setFormatter(
+     logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
+ )
+
+
+ def from_bytes(
+     sequences: bytes | bytearray,
+     steps: int = 5,
+     chunk_size: int = 512,
+     threshold: float = 0.2,
+     cp_isolation: list[str] | None = None,
+     cp_exclusion: list[str] | None = None,
+     preemptive_behaviour: bool = True,
+     explain: bool = False,
+     language_threshold: float = 0.1,
+     enable_fallback: bool = True,
+ ) -> CharsetMatches:
+     """
+     Given a raw bytes sequence, return the best possibles charset usable to render str objects.
+     If there is no results, it is a strong indicator that the source is binary/not text.
+     By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
+     And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
+
+     The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
+     but never take it for granted. Can improve the performance.
+
+     You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
+     purpose.
+
+     This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
+     By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
+     toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
+     Custom logging format and handler can be set manually.
+     """
+
+     if not isinstance(sequences, (bytearray, bytes)):
+         raise TypeError(
+             "Expected object of type bytes or bytearray, got: {}".format(
+                 type(sequences)
+             )
+         )
+
+     if explain:
+         previous_logger_level: int = logger.level
+         logger.addHandler(explain_handler)
+         logger.setLevel(TRACE)
+
+     length: int = len(sequences)
+
+     if length == 0:
+         logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
+         if explain:  # Defensive: ensure exit path clean handler
+             logger.removeHandler(explain_handler)
+             logger.setLevel(previous_logger_level or logging.WARNING)
+         return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
+
+     if cp_isolation is not None:
+         logger.log(
+             TRACE,
+             "cp_isolation is set. use this flag for debugging purpose. "
+             "limited list of encoding allowed : %s.",
+             ", ".join(cp_isolation),
+         )
+         cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
+     else:
+         cp_isolation = []
+
+     if cp_exclusion is not None:
+         logger.log(
+             TRACE,
+             "cp_exclusion is set. use this flag for debugging purpose. "
+             "limited list of encoding excluded : %s.",
+             ", ".join(cp_exclusion),
+         )
+         cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
+     else:
+         cp_exclusion = []
+
+     if length <= (chunk_size * steps):
+         logger.log(
+             TRACE,
+             "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
+             steps,
+             chunk_size,
+             length,
+         )
+         steps = 1
+         chunk_size = length
+
+     if steps > 1 and length / steps < chunk_size:
+         chunk_size = int(length / steps)
+
+     is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
+     is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
+
+     if is_too_small_sequence:
+         logger.log(
+             TRACE,
+             "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
+                 length
+             ),
+         )
+     elif is_too_large_sequence:
+         logger.log(
+             TRACE,
+             "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
+                 length
+             ),
+         )
+
+     prioritized_encodings: list[str] = []
+
+     specified_encoding: str | None = (
+         any_specified_encoding(sequences) if preemptive_behaviour else None
+     )
+
+     if specified_encoding is not None:
+         prioritized_encodings.append(specified_encoding)
+         logger.log(
+             TRACE,
+             "Detected declarative mark in sequence. Priority +1 given for %s.",
+             specified_encoding,
+         )
+
+     tested: set[str] = set()
+     tested_but_hard_failure: list[str] = []
+     tested_but_soft_failure: list[str] = []
+
+     fallback_ascii: CharsetMatch | None = None
+     fallback_u8: CharsetMatch | None = None
+     fallback_specified: CharsetMatch | None = None
+
+     results: CharsetMatches = CharsetMatches()
+
+     early_stop_results: CharsetMatches = CharsetMatches()
+
+     sig_encoding, sig_payload = identify_sig_or_bom(sequences)
+
+     if sig_encoding is not None:
+         prioritized_encodings.append(sig_encoding)
+         logger.log(
+             TRACE,
+             "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
+             len(sig_payload),
+             sig_encoding,
+         )
+
+     prioritized_encodings.append("ascii")
+
+     if "utf_8" not in prioritized_encodings:
+         prioritized_encodings.append("utf_8")
+
+     for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
+         if cp_isolation and encoding_iana not in cp_isolation:
+             continue
+
+         if cp_exclusion and encoding_iana in cp_exclusion:
+             continue
+
+         if encoding_iana in tested:
+             continue
+
+         tested.add(encoding_iana)
+
+         decoded_payload: str | None = None
+         bom_or_sig_available: bool = sig_encoding == encoding_iana
+         strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
+             encoding_iana
+         )
+
+         if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
+             logger.log(
+                 TRACE,
+                 "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
+                 encoding_iana,
+             )
+             continue
+         if encoding_iana in {"utf_7"} and not bom_or_sig_available:
+             logger.log(
+                 TRACE,
+                 "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
+                 encoding_iana,
+             )
+             continue
+
+         try:
+             is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
+         except (ModuleNotFoundError, ImportError):
+             logger.log(
+                 TRACE,
+                 "Encoding %s does not provide an IncrementalDecoder",
+                 encoding_iana,
+             )
+             continue
+
+         try:
+             if is_too_large_sequence and is_multi_byte_decoder is False:
+                 str(
+                     (
+                         sequences[: int(50e4)]
+                         if strip_sig_or_bom is False
+                         else sequences[len(sig_payload) : int(50e4)]
+                     ),
+                     encoding=encoding_iana,
+                 )
+             else:
+                 decoded_payload = str(
+                     (
+                         sequences
+                         if strip_sig_or_bom is False
+                         else sequences[len(sig_payload) :]
+                     ),
+                     encoding=encoding_iana,
+                 )
+         except (UnicodeDecodeError, LookupError) as e:
+             if not isinstance(e, LookupError):
+                 logger.log(
+                     TRACE,
+                     "Code page %s does not fit given bytes sequence at ALL. %s",
+                     encoding_iana,
+                     str(e),
+                 )
+             tested_but_hard_failure.append(encoding_iana)
+             continue
+
+         similar_soft_failure_test: bool = False
+
+         for encoding_soft_failed in tested_but_soft_failure:
+             if is_cp_similar(encoding_iana, encoding_soft_failed):
+                 similar_soft_failure_test = True
+                 break
+
+         if similar_soft_failure_test:
+             logger.log(
+                 TRACE,
+                 "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
+                 encoding_iana,
+                 encoding_soft_failed,
+             )
+             continue
+
+         r_ = range(
+             0 if not bom_or_sig_available else len(sig_payload),
+             length,
+             int(length / steps),
+         )
+
+         multi_byte_bonus: bool = (
+             is_multi_byte_decoder
+             and decoded_payload is not None
+             and len(decoded_payload) < length
+         )
+
+         if multi_byte_bonus:
+             logger.log(
+                 TRACE,
+                 "Code page %s is a multi byte encoding table and it appear that at least one character "
+                 "was encoded using n-bytes.",
+                 encoding_iana,
+             )
+
+         max_chunk_gave_up: int = int(len(r_) / 4)
+
+         max_chunk_gave_up = max(max_chunk_gave_up, 2)
+         early_stop_count: int = 0
+         lazy_str_hard_failure = False
+
+         md_chunks: list[str] = []
+         md_ratios = []
+
+         try:
+             for chunk in cut_sequence_chunks(
+                 sequences,
+                 encoding_iana,
+                 r_,
+                 chunk_size,
+                 bom_or_sig_available,
+                 strip_sig_or_bom,
+                 sig_payload,
+                 is_multi_byte_decoder,
+                 decoded_payload,
+             ):
+                 md_chunks.append(chunk)
+
+                 md_ratios.append(
+                     mess_ratio(
+                         chunk,
+                         threshold,
+                         explain is True and 1 <= len(cp_isolation) <= 2,
+                     )
+                 )
+
+                 if md_ratios[-1] >= threshold:
+                     early_stop_count += 1
+
+                 if (early_stop_count >= max_chunk_gave_up) or (
+                     bom_or_sig_available and strip_sig_or_bom is False
+                 ):
+                     break
+         except (
+             UnicodeDecodeError
+         ) as e:  # Lazy str loading may have missed something there
+             logger.log(
+                 TRACE,
+                 "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
+                 encoding_iana,
+                 str(e),
+             )
+             early_stop_count = max_chunk_gave_up
+             lazy_str_hard_failure = True
+
+         # We might want to check the sequence again with the whole content
+         # Only if initial MD tests passes
+         if (
+             not lazy_str_hard_failure
+             and is_too_large_sequence
+             and not is_multi_byte_decoder
+         ):
+             try:
+                 sequences[int(50e3) :].decode(encoding_iana, errors="strict")
+             except UnicodeDecodeError as e:
+                 logger.log(
+                     TRACE,
+                     "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
+                     encoding_iana,
+                     str(e),
+                 )
+                 tested_but_hard_failure.append(encoding_iana)
+                 continue
+
+         mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
+         if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
+             tested_but_soft_failure.append(encoding_iana)
+             logger.log(
+                 TRACE,
+                 "%s was excluded because of initial chaos probing. Gave up %i time(s). "
+                 "Computed mean chaos is %f %%.",
+                 encoding_iana,
+                 early_stop_count,
+                 round(mean_mess_ratio * 100, ndigits=3),
+             )
+             # Preparing those fallbacks in case we got nothing.
+             if (
+                 enable_fallback
+                 and encoding_iana
+                 in ["ascii", "utf_8", specified_encoding, "utf_16", "utf_32"]
+                 and not lazy_str_hard_failure
+             ):
+                 fallback_entry = CharsetMatch(
+                     sequences,
+                     encoding_iana,
+                     threshold,
+                     bom_or_sig_available,
+                     [],
+                     decoded_payload,
+                     preemptive_declaration=specified_encoding,
+                 )
+                 if encoding_iana == specified_encoding:
+                     fallback_specified = fallback_entry
+                 elif encoding_iana == "ascii":
+                     fallback_ascii = fallback_entry
+                 else:
+                     fallback_u8 = fallback_entry
+             continue
+
+         logger.log(
+             TRACE,
+             "%s passed initial chaos probing. Mean measured chaos is %f %%",
+             encoding_iana,
+             round(mean_mess_ratio * 100, ndigits=3),
+         )
+
+         if not is_multi_byte_decoder:
+             target_languages: list[str] = encoding_languages(encoding_iana)
+         else:
+             target_languages = mb_encoding_languages(encoding_iana)
+
+         if target_languages:
+             logger.log(
+                 TRACE,
+                 "{} should target any language(s) of {}".format(
+                     encoding_iana, str(target_languages)
+                 ),
+             )
+
+         cd_ratios = []
+
+         # We shall skip the CD when its about ASCII
+         # Most of the time its not relevant to run "language-detection" on it.
+         if encoding_iana != "ascii":
+             for chunk in md_chunks:
+                 chunk_languages = coherence_ratio(
+                     chunk,
+                     language_threshold,
+                     ",".join(target_languages) if target_languages else None,
+                 )
+
+                 cd_ratios.append(chunk_languages)
+
+         cd_ratios_merged = merge_coherence_ratios(cd_ratios)
+
+         if cd_ratios_merged:
+             logger.log(
+                 TRACE,
+                 "We detected language {} using {}".format(
+                     cd_ratios_merged, encoding_iana
+                 ),
+             )
+
+         current_match = CharsetMatch(
+             sequences,
+             encoding_iana,
+             mean_mess_ratio,
+             bom_or_sig_available,
+             cd_ratios_merged,
+             (
+                 decoded_payload
+                 if (
+                     is_too_large_sequence is False
+                     or encoding_iana in [specified_encoding, "ascii", "utf_8"]
+                 )
+                 else None
+             ),
+             preemptive_declaration=specified_encoding,
+         )
+
+         results.append(current_match)
+
+         if (
+             encoding_iana in [specified_encoding, "ascii", "utf_8"]
+             and mean_mess_ratio < 0.1
+         ):
+             # If md says nothing to worry about, then... stop immediately!
+             if mean_mess_ratio == 0.0:
+                 logger.debug(
+                     "Encoding detection: %s is most likely the one.",
+                     current_match.encoding,
+                 )
+                 if explain:  # Defensive: ensure exit path clean handler
+                     logger.removeHandler(explain_handler)
+                     logger.setLevel(previous_logger_level)
+                 return CharsetMatches([current_match])
+
+             early_stop_results.append(current_match)
+
+             if (
+                 len(early_stop_results)
+                 and (specified_encoding is None or specified_encoding in tested)
+                 and "ascii" in tested
+                 and "utf_8" in tested
+             ):
+                 probable_result: CharsetMatch = early_stop_results.best()  # type: ignore[assignment]
+                 logger.debug(
+                     "Encoding detection: %s is most likely the one.",
+                     probable_result.encoding,
+                 )
+                 if explain:  # Defensive: ensure exit path clean handler
+                     logger.removeHandler(explain_handler)
+                     logger.setLevel(previous_logger_level)
+
+                 return CharsetMatches([probable_result])
+
+         if encoding_iana == sig_encoding:
+             logger.debug(
+                 "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
+                 "the beginning of the sequence.",
+                 encoding_iana,
+             )
+             if explain:  # Defensive: ensure exit path clean handler
+                 logger.removeHandler(explain_handler)
+                 logger.setLevel(previous_logger_level)
+             return CharsetMatches([results[encoding_iana]])
+
+     if len(results) == 0:
+         if fallback_u8 or fallback_ascii or fallback_specified:
+             logger.log(
+                 TRACE,
+                 "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
+             )
+
+         if fallback_specified:
+             logger.debug(
+                 "Encoding detection: %s will be used as a fallback match",
+                 fallback_specified.encoding,
+             )
+             results.append(fallback_specified)
+         elif (
+             (fallback_u8 and fallback_ascii is None)
+             or (
+                 fallback_u8
+                 and fallback_ascii
+                 and fallback_u8.fingerprint != fallback_ascii.fingerprint
+             )
+             or (fallback_u8 is not None)
+         ):
+             logger.debug("Encoding detection: utf_8 will be used as a fallback match")
+             results.append(fallback_u8)
+         elif fallback_ascii:
+             logger.debug("Encoding detection: ascii will be used as a fallback match")
+             results.append(fallback_ascii)
+
+     if results:
+         logger.debug(
+             "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
+             results.best().encoding,  # type: ignore
+             len(results) - 1,
+         )
+     else:
+         logger.debug("Encoding detection: Unable to determine any suitable charset.")
+
+     if explain:
+         logger.removeHandler(explain_handler)
+         logger.setLevel(previous_logger_level)
+
+     return results
+
+
+ def from_fp(
+     fp: BinaryIO,
+     steps: int = 5,
+     chunk_size: int = 512,
+     threshold: float = 0.20,
+     cp_isolation: list[str] | None = None,
+     cp_exclusion: list[str] | None = None,
+     preemptive_behaviour: bool = True,
+     explain: bool = False,
+     language_threshold: float = 0.1,
+     enable_fallback: bool = True,
+ ) -> CharsetMatches:
+     """
+     Same thing than the function from_bytes but using a file pointer that is already ready.
+     Will not close the file pointer.
+     """
+     return from_bytes(
+         fp.read(),
+         steps,
+         chunk_size,
+         threshold,
+         cp_isolation,
+         cp_exclusion,
+         preemptive_behaviour,
+         explain,
+         language_threshold,
+         enable_fallback,
+     )
+
+
+ def from_path(
+     path: str | bytes | PathLike,  # type: ignore[type-arg]
+     steps: int = 5,
+     chunk_size: int = 512,
+     threshold: float = 0.20,
+     cp_isolation: list[str] | None = None,
+     cp_exclusion: list[str] | None = None,
+     preemptive_behaviour: bool = True,
+     explain: bool = False,
+     language_threshold: float = 0.1,
+     enable_fallback: bool = True,
+ ) -> CharsetMatches:
+     """
+     Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
+     Can raise IOError.
+     """
+     with open(path, "rb") as fp:
+         return from_fp(
+             fp,
+             steps,
+             chunk_size,
+             threshold,
+             cp_isolation,
+             cp_exclusion,
+             preemptive_behaviour,
+             explain,
+             language_threshold,
+             enable_fallback,
+         )
+
+
+ def is_binary(
+     fp_or_path_or_payload: PathLike | str | BinaryIO | bytes,  # type: ignore[type-arg]
+     steps: int = 5,
+     chunk_size: int = 512,
+     threshold: float = 0.20,
+     cp_isolation: list[str] | None = None,
+     cp_exclusion: list[str] | None = None,
+     preemptive_behaviour: bool = True,
+     explain: bool = False,
+     language_threshold: float = 0.1,
+     enable_fallback: bool = False,
+ ) -> bool:
+     """
+     Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
+     Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
+     are disabled to be stricter around ASCII-compatible but unlikely to be a string.
+     """
+     if isinstance(fp_or_path_or_payload, (str, PathLike)):
+         guesses = from_path(
+             fp_or_path_or_payload,
+             steps=steps,
+             chunk_size=chunk_size,
+             threshold=threshold,
+             cp_isolation=cp_isolation,
+             cp_exclusion=cp_exclusion,
+             preemptive_behaviour=preemptive_behaviour,
+             explain=explain,
+             language_threshold=language_threshold,
+             enable_fallback=enable_fallback,
+         )
+     elif isinstance(
+         fp_or_path_or_payload,
+         (
+             bytes,
+             bytearray,
+         ),
+     ):
+         guesses = from_bytes(
+             fp_or_path_or_payload,
+             steps=steps,
+             chunk_size=chunk_size,
+             threshold=threshold,
+             cp_isolation=cp_isolation,
+             cp_exclusion=cp_exclusion,
+             preemptive_behaviour=preemptive_behaviour,
+             explain=explain,
+             language_threshold=language_threshold,
+             enable_fallback=enable_fallback,
+         )
+     else:
+         guesses = from_fp(
+             fp_or_path_or_payload,
+             steps=steps,
+             chunk_size=chunk_size,
+             threshold=threshold,
+             cp_isolation=cp_isolation,
+             cp_exclusion=cp_exclusion,
+             preemptive_behaviour=preemptive_behaviour,
+             explain=explain,
+             language_threshold=language_threshold,
+             enable_fallback=enable_fallback,
+         )
+
+     return not guesses
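For orientation, the public entry point defined above is typically used like this (a minimal sketch mirroring the package docstring; the sample text is illustrative):

    from charset_normalizer import from_bytes

    payload = "Bсеки човек има право на образование.".encode("utf_8")

    # from_bytes() runs the chunked mess/coherence probing implemented above
    # and returns a CharsetMatches collection ordered by plausibility.
    matches = from_bytes(payload)

    best = matches.best()  # a CharsetMatch, or None when the input looks binary
    if best is not None:
        print(best.encoding, str(best))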
venv/lib/python3.13/site-packages/charset_normalizer/cd.py ADDED
@@ -0,0 +1,395 @@
+ from __future__ import annotations
+
+ import importlib
+ from codecs import IncrementalDecoder
+ from collections import Counter
+ from functools import lru_cache
+ from typing import Counter as TypeCounter
+
+ from .constant import (
+     FREQUENCIES,
+     KO_NAMES,
+     LANGUAGE_SUPPORTED_COUNT,
+     TOO_SMALL_SEQUENCE,
+     ZH_NAMES,
+ )
+ from .md import is_suspiciously_successive_range
+ from .models import CoherenceMatches
+ from .utils import (
+     is_accentuated,
+     is_latin,
+     is_multi_byte_encoding,
+     is_unicode_range_secondary,
+     unicode_range,
+ )
+
+
+ def encoding_unicode_range(iana_name: str) -> list[str]:
+     """
+     Return associated unicode ranges in a single byte code page.
+     """
+     if is_multi_byte_encoding(iana_name):
+         raise OSError("Function not supported on multi-byte code page")
+
+     decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
+
+     p: IncrementalDecoder = decoder(errors="ignore")
+     seen_ranges: dict[str, int] = {}
+     character_count: int = 0
+
+     for i in range(0x40, 0xFF):
+         chunk: str = p.decode(bytes([i]))
+
+         if chunk:
+             character_range: str | None = unicode_range(chunk)
+
+             if character_range is None:
+                 continue
+
+             if is_unicode_range_secondary(character_range) is False:
+                 if character_range not in seen_ranges:
+                     seen_ranges[character_range] = 0
+                 seen_ranges[character_range] += 1
+                 character_count += 1
+
+     return sorted(
+         [
+             character_range
+             for character_range in seen_ranges
+             if seen_ranges[character_range] / character_count >= 0.15
+         ]
+     )
+
+
+ def unicode_range_languages(primary_range: str) -> list[str]:
+     """
+     Return inferred languages used with a unicode range.
+     """
+     languages: list[str] = []
+
+     for language, characters in FREQUENCIES.items():
+         for character in characters:
+             if unicode_range(character) == primary_range:
+                 languages.append(language)
+                 break
+
+     return languages
+
+
+ @lru_cache()
+ def encoding_languages(iana_name: str) -> list[str]:
+     """
+     Single-byte encoding language association. Some code page are heavily linked to particular language(s).
+     This function does the correspondence.
+     """
+     unicode_ranges: list[str] = encoding_unicode_range(iana_name)
+     primary_range: str | None = None
+
+     for specified_range in unicode_ranges:
+         if "Latin" not in specified_range:
+             primary_range = specified_range
+             break
+
+     if primary_range is None:
+         return ["Latin Based"]
+
+     return unicode_range_languages(primary_range)
+
+
+ @lru_cache()
+ def mb_encoding_languages(iana_name: str) -> list[str]:
+     """
+     Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
+     This function does the correspondence.
+     """
+     if (
+         iana_name.startswith("shift_")
+         or iana_name.startswith("iso2022_jp")
+         or iana_name.startswith("euc_j")
+         or iana_name == "cp932"
+     ):
+         return ["Japanese"]
+     if iana_name.startswith("gb") or iana_name in ZH_NAMES:
+         return ["Chinese"]
+     if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
+         return ["Korean"]
+
+     return []
+
+
+ @lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
+ def get_target_features(language: str) -> tuple[bool, bool]:
+     """
+     Determine main aspects from a supported language if it contains accents and if is pure Latin.
+     """
+     target_have_accents: bool = False
+     target_pure_latin: bool = True
+
+     for character in FREQUENCIES[language]:
+         if not target_have_accents and is_accentuated(character):
+             target_have_accents = True
+         if target_pure_latin and is_latin(character) is False:
+             target_pure_latin = False
+
+     return target_have_accents, target_pure_latin
+
+
+ def alphabet_languages(
+     characters: list[str], ignore_non_latin: bool = False
+ ) -> list[str]:
+     """
+     Return associated languages associated to given characters.
+     """
+     languages: list[tuple[str, float]] = []
+
+     source_have_accents = any(is_accentuated(character) for character in characters)
+
+     for language, language_characters in FREQUENCIES.items():
+         target_have_accents, target_pure_latin = get_target_features(language)
+
+         if ignore_non_latin and target_pure_latin is False:
+             continue
+
+         if target_have_accents is False and source_have_accents:
+             continue
+
+         character_count: int = len(language_characters)
+
+         character_match_count: int = len(
+             [c for c in language_characters if c in characters]
+         )
+
+         ratio: float = character_match_count / character_count
+
+         if ratio >= 0.2:
+             languages.append((language, ratio))
+
+     languages = sorted(languages, key=lambda x: x[1], reverse=True)
+
+     return [compatible_language[0] for compatible_language in languages]
+
+
+ def characters_popularity_compare(
+     language: str, ordered_characters: list[str]
+ ) -> float:
+     """
+     Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
+     The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
+     Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
+     """
+     if language not in FREQUENCIES:
+         raise ValueError(f"{language} not available")
+
+     character_approved_count: int = 0
+     FREQUENCIES_language_set = set(FREQUENCIES[language])
+
+     ordered_characters_count: int = len(ordered_characters)
+     target_language_characters_count: int = len(FREQUENCIES[language])
+
+     large_alphabet: bool = target_language_characters_count > 26
+
+     for character, character_rank in zip(
+         ordered_characters, range(0, ordered_characters_count)
+     ):
+         if character not in FREQUENCIES_language_set:
+             continue
+
+         character_rank_in_language: int = FREQUENCIES[language].index(character)
+         expected_projection_ratio: float = (
+             target_language_characters_count / ordered_characters_count
+         )
+         character_rank_projection: int = int(character_rank * expected_projection_ratio)
+
+         if (
+             large_alphabet is False
+             and abs(character_rank_projection - character_rank_in_language) > 4
+         ):
+             continue
+
+         if (
+             large_alphabet is True
+             and abs(character_rank_projection - character_rank_in_language)
+             < target_language_characters_count / 3
+         ):
+             character_approved_count += 1
+             continue
+
+         characters_before_source: list[str] = FREQUENCIES[language][
+             0:character_rank_in_language
+         ]
+         characters_after_source: list[str] = FREQUENCIES[language][
+             character_rank_in_language:
+         ]
+         characters_before: list[str] = ordered_characters[0:character_rank]
+         characters_after: list[str] = ordered_characters[character_rank:]
+
+         before_match_count: int = len(
+             set(characters_before) & set(characters_before_source)
+         )
+
+         after_match_count: int = len(
+             set(characters_after) & set(characters_after_source)
+         )
+
+         if len(characters_before_source) == 0 and before_match_count <= 4:
+             character_approved_count += 1
+             continue
+
+         if len(characters_after_source) == 0 and after_match_count <= 4:
+             character_approved_count += 1
+             continue
+
+         if (
+             before_match_count / len(characters_before_source) >= 0.4
+             or after_match_count / len(characters_after_source) >= 0.4
+         ):
+             character_approved_count += 1
+             continue
+
+     return character_approved_count / len(ordered_characters)
+
+
+ def alpha_unicode_split(decoded_sequence: str) -> list[str]:
+     """
+     Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
+     Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
+     One containing the latin letters and the other hebrew.
+     """
+     layers: dict[str, str] = {}
+
+     for character in decoded_sequence:
+         if character.isalpha() is False:
+             continue
+
+         character_range: str | None = unicode_range(character)
+
+         if character_range is None:
+             continue
+
+         layer_target_range: str | None = None
+
+         for discovered_range in layers:
+             if (
+                 is_suspiciously_successive_range(discovered_range, character_range)
+                 is False
+             ):
+                 layer_target_range = discovered_range
+                 break
+
+         if layer_target_range is None:
+             layer_target_range = character_range
+
+         if layer_target_range not in layers:
+             layers[layer_target_range] = character.lower()
+             continue
+
+         layers[layer_target_range] += character.lower()
+
+     return list(layers.values())
+
+
+ def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
+     """
+     This function merge results previously given by the function coherence_ratio.
+     The return type is the same as coherence_ratio.
+     """
+     per_language_ratios: dict[str, list[float]] = {}
+     for result in results:
+         for sub_result in result:
+             language, ratio = sub_result
+             if language not in per_language_ratios:
+                 per_language_ratios[language] = [ratio]
+                 continue
+             per_language_ratios[language].append(ratio)
+
+     merge = [
+         (
+             language,
+             round(
+                 sum(per_language_ratios[language]) / len(per_language_ratios[language]),
+                 4,
+             ),
+         )
+         for language in per_language_ratios
+     ]
+
+     return sorted(merge, key=lambda x: x[1], reverse=True)
+
+
+ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
+     """
+     We shall NOT return "English—" in CoherenceMatches because it is an alternative
+     of "English". This function only keeps the best match and remove the em-dash in it.
+     """
+     index_results: dict[str, list[float]] = dict()
+
+     for result in results:
+         language, ratio = result
+         no_em_name: str = language.replace("—", "")
+
+         if no_em_name not in index_results:
+             index_results[no_em_name] = []
+
+         index_results[no_em_name].append(ratio)
+
+     if any(len(index_results[e]) > 1 for e in index_results):
+         filtered_results: CoherenceMatches = []
+
+         for language in index_results:
+             filtered_results.append((language, max(index_results[language])))
+
+         return filtered_results
+
+     return results
+
+
+ @lru_cache(maxsize=2048)
+ def coherence_ratio(
+     decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
+ ) -> CoherenceMatches:
+     """
+     Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
+     A layer = Character extraction by alphabets/ranges.
+     """
+
+     results: list[tuple[str, float]] = []
+     ignore_non_latin: bool = False
+
+     sufficient_match_count: int = 0
+
+     lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
+     if "Latin Based" in lg_inclusion_list:
+         ignore_non_latin = True
+         lg_inclusion_list.remove("Latin Based")
+
+     for layer in alpha_unicode_split(decoded_sequence):
+         sequence_frequencies: TypeCounter[str] = Counter(layer)
+         most_common = sequence_frequencies.most_common()
+
+         character_count: int = sum(o for c, o in most_common)
+
+         if character_count <= TOO_SMALL_SEQUENCE:
+             continue
+
+         popular_character_ordered: list[str] = [c for c, o in most_common]
+
+         for language in lg_inclusion_list or alphabet_languages(
+             popular_character_ordered, ignore_non_latin
+         ):
+             ratio: float = characters_popularity_compare(
+                 language, popular_character_ordered
+             )
+
+             if ratio < threshold:
+                 continue
+             elif ratio >= 0.8:
+                 sufficient_match_count += 1
+
+             results.append((language, round(ratio, 4)))
+
+             if sufficient_match_count >= 3:
+                 break
+
+     return sorted(
+         filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
+     )
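A small sketch of how the coherence layer above behaves when driven directly (assuming the vendored charset_normalizer is importable; the sample strings are illustrative, and each layer must exceed TOO_SMALL_SEQUENCE alphabetical characters to be scored):

    from charset_normalizer.cd import alpha_unicode_split, coherence_ratio

    # Mixed-script input is first split into per-alphabet "layers"...
    print(alpha_unicode_split("Hello world и здравствуйте"))

    # ...and each sufficiently long layer is scored against per-language
    # character-frequency tables, yielding (language, ratio) pairs.
    print(coherence_ratio("здравствуйте, это достаточно длинный русский текст для проверки"))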
venv/lib/python3.13/site-packages/charset_normalizer/constant.py ADDED
@@ -0,0 +1,2015 @@
+ from __future__ import annotations
+
+ from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
+ from encodings.aliases import aliases
+ from re import IGNORECASE
+ from re import compile as re_compile
+
+ # Contain for each eligible encoding a list of/item bytes SIG/BOM
+ ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
+     "utf_8": BOM_UTF8,
+     "utf_7": [
+         b"\x2b\x2f\x76\x38",
+         b"\x2b\x2f\x76\x39",
+         b"\x2b\x2f\x76\x2b",
+         b"\x2b\x2f\x76\x2f",
+         b"\x2b\x2f\x76\x38\x2d",
+     ],
+     "gb18030": b"\x84\x31\x95\x33",
+     "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
+     "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
+ }
+
+ TOO_SMALL_SEQUENCE: int = 32
+ TOO_BIG_SEQUENCE: int = int(10e6)
+
+ UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
+
+ # Up-to-date Unicode ucd/15.0.0
+ UNICODE_RANGES_COMBINED: dict[str, range] = {
+     "Control character": range(32),
+     "Basic Latin": range(32, 128),
+     "Latin-1 Supplement": range(128, 256),
+     "Latin Extended-A": range(256, 384),
+     "Latin Extended-B": range(384, 592),
+     "IPA Extensions": range(592, 688),
+     "Spacing Modifier Letters": range(688, 768),
+     "Combining Diacritical Marks": range(768, 880),
+     "Greek and Coptic": range(880, 1024),
+     "Cyrillic": range(1024, 1280),
+     "Cyrillic Supplement": range(1280, 1328),
+     "Armenian": range(1328, 1424),
+     "Hebrew": range(1424, 1536),
+     "Arabic": range(1536, 1792),
+     "Syriac": range(1792, 1872),
+     "Arabic Supplement": range(1872, 1920),
+     "Thaana": range(1920, 1984),
+     "NKo": range(1984, 2048),
+     "Samaritan": range(2048, 2112),
+     "Mandaic": range(2112, 2144),
+     "Syriac Supplement": range(2144, 2160),
+     "Arabic Extended-B": range(2160, 2208),
+     "Arabic Extended-A": range(2208, 2304),
+     "Devanagari": range(2304, 2432),
+     "Bengali": range(2432, 2560),
+     "Gurmukhi": range(2560, 2688),
+     "Gujarati": range(2688, 2816),
+     "Oriya": range(2816, 2944),
+     "Tamil": range(2944, 3072),
+     "Telugu": range(3072, 3200),
+     "Kannada": range(3200, 3328),
+     "Malayalam": range(3328, 3456),
+     "Sinhala": range(3456, 3584),
+     "Thai": range(3584, 3712),
+     "Lao": range(3712, 3840),
+     "Tibetan": range(3840, 4096),
+     "Myanmar": range(4096, 4256),
+     "Georgian": range(4256, 4352),
+     "Hangul Jamo": range(4352, 4608),
+     "Ethiopic": range(4608, 4992),
+     "Ethiopic Supplement": range(4992, 5024),
+     "Cherokee": range(5024, 5120),
+     "Unified Canadian Aboriginal Syllabics": range(5120, 5760),
+     "Ogham": range(5760, 5792),
+     "Runic": range(5792, 5888),
+     "Tagalog": range(5888, 5920),
+     "Hanunoo": range(5920, 5952),
+     "Buhid": range(5952, 5984),
+     "Tagbanwa": range(5984, 6016),
+     "Khmer": range(6016, 6144),
+     "Mongolian": range(6144, 6320),
+     "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
+     "Limbu": range(6400, 6480),
+     "Tai Le": range(6480, 6528),
+     "New Tai Lue": range(6528, 6624),
+     "Khmer Symbols": range(6624, 6656),
+     "Buginese": range(6656, 6688),
+     "Tai Tham": range(6688, 6832),
+     "Combining Diacritical Marks Extended": range(6832, 6912),
+     "Balinese": range(6912, 7040),
+     "Sundanese": range(7040, 7104),
+     "Batak": range(7104, 7168),
+     "Lepcha": range(7168, 7248),
+     "Ol Chiki": range(7248, 7296),
+     "Cyrillic Extended-C": range(7296, 7312),
+     "Georgian Extended": range(7312, 7360),
+     "Sundanese Supplement": range(7360, 7376),
+     "Vedic Extensions": range(7376, 7424),
+     "Phonetic Extensions": range(7424, 7552),
+     "Phonetic Extensions Supplement": range(7552, 7616),
+     "Combining Diacritical Marks Supplement": range(7616, 7680),
+     "Latin Extended Additional": range(7680, 7936),
+     "Greek Extended": range(7936, 8192),
+     "General Punctuation": range(8192, 8304),
+     "Superscripts and Subscripts": range(8304, 8352),
+     "Currency Symbols": range(8352, 8400),
+     "Combining Diacritical Marks for Symbols": range(8400, 8448),
+     "Letterlike Symbols": range(8448, 8528),
+     "Number Forms": range(8528, 8592),
+     "Arrows": range(8592, 8704),
+     "Mathematical Operators": range(8704, 8960),
+     "Miscellaneous Technical": range(8960, 9216),
+     "Control Pictures": range(9216, 9280),
+     "Optical Character Recognition": range(9280, 9312),
+     "Enclosed Alphanumerics": range(9312, 9472),
+     "Box Drawing": range(9472, 9600),
+     "Block Elements": range(9600, 9632),
+     "Geometric Shapes": range(9632, 9728),
+     "Miscellaneous Symbols": range(9728, 9984),
+     "Dingbats": range(9984, 10176),
+     "Miscellaneous Mathematical Symbols-A": range(10176, 10224),
+     "Supplemental Arrows-A": range(10224, 10240),
+     "Braille Patterns": range(10240, 10496),
+     "Supplemental Arrows-B": range(10496, 10624),
+     "Miscellaneous Mathematical Symbols-B": range(10624, 10752),
+     "Supplemental Mathematical Operators": range(10752, 11008),
+     "Miscellaneous Symbols and Arrows": range(11008, 11264),
+     "Glagolitic": range(11264, 11360),
+     "Latin Extended-C": range(11360, 11392),
+     "Coptic": range(11392, 11520),
+     "Georgian Supplement": range(11520, 11568),
+     "Tifinagh": range(11568, 11648),
+     "Ethiopic Extended": range(11648, 11744),
+     "Cyrillic Extended-A": range(11744, 11776),
+     "Supplemental Punctuation": range(11776, 11904),
+     "CJK Radicals Supplement": range(11904, 12032),
+     "Kangxi Radicals": range(12032, 12256),
+     "Ideographic Description Characters": range(12272, 12288),
+     "CJK Symbols and Punctuation": range(12288, 12352),
+     "Hiragana": range(12352, 12448),
+     "Katakana": range(12448, 12544),
+     "Bopomofo": range(12544, 12592),
+     "Hangul Compatibility Jamo": range(12592, 12688),
+     "Kanbun": range(12688, 12704),
+     "Bopomofo Extended": range(12704, 12736),
145
+ "CJK Strokes": range(12736, 12784),
146
+ "Katakana Phonetic Extensions": range(12784, 12800),
147
+ "Enclosed CJK Letters and Months": range(12800, 13056),
148
+ "CJK Compatibility": range(13056, 13312),
149
+ "CJK Unified Ideographs Extension A": range(13312, 19904),
150
+ "Yijing Hexagram Symbols": range(19904, 19968),
151
+ "CJK Unified Ideographs": range(19968, 40960),
152
+ "Yi Syllables": range(40960, 42128),
153
+ "Yi Radicals": range(42128, 42192),
154
+ "Lisu": range(42192, 42240),
155
+ "Vai": range(42240, 42560),
156
+ "Cyrillic Extended-B": range(42560, 42656),
157
+ "Bamum": range(42656, 42752),
158
+ "Modifier Tone Letters": range(42752, 42784),
159
+ "Latin Extended-D": range(42784, 43008),
160
+ "Syloti Nagri": range(43008, 43056),
161
+ "Common Indic Number Forms": range(43056, 43072),
162
+ "Phags-pa": range(43072, 43136),
163
+ "Saurashtra": range(43136, 43232),
164
+ "Devanagari Extended": range(43232, 43264),
165
+ "Kayah Li": range(43264, 43312),
166
+ "Rejang": range(43312, 43360),
167
+ "Hangul Jamo Extended-A": range(43360, 43392),
168
+ "Javanese": range(43392, 43488),
169
+ "Myanmar Extended-B": range(43488, 43520),
170
+ "Cham": range(43520, 43616),
171
+ "Myanmar Extended-A": range(43616, 43648),
172
+ "Tai Viet": range(43648, 43744),
173
+ "Meetei Mayek Extensions": range(43744, 43776),
174
+ "Ethiopic Extended-A": range(43776, 43824),
175
+ "Latin Extended-E": range(43824, 43888),
176
+ "Cherokee Supplement": range(43888, 43968),
177
+ "Meetei Mayek": range(43968, 44032),
178
+ "Hangul Syllables": range(44032, 55216),
179
+ "Hangul Jamo Extended-B": range(55216, 55296),
180
+ "High Surrogates": range(55296, 56192),
181
+ "High Private Use Surrogates": range(56192, 56320),
182
+ "Low Surrogates": range(56320, 57344),
183
+ "Private Use Area": range(57344, 63744),
184
+ "CJK Compatibility Ideographs": range(63744, 64256),
185
+ "Alphabetic Presentation Forms": range(64256, 64336),
186
+ "Arabic Presentation Forms-A": range(64336, 65024),
187
+ "Variation Selectors": range(65024, 65040),
188
+ "Vertical Forms": range(65040, 65056),
189
+ "Combining Half Marks": range(65056, 65072),
190
+ "CJK Compatibility Forms": range(65072, 65104),
191
+ "Small Form Variants": range(65104, 65136),
192
+ "Arabic Presentation Forms-B": range(65136, 65280),
193
+ "Halfwidth and Fullwidth Forms": range(65280, 65520),
194
+ "Specials": range(65520, 65536),
195
+ "Linear B Syllabary": range(65536, 65664),
196
+ "Linear B Ideograms": range(65664, 65792),
197
+ "Aegean Numbers": range(65792, 65856),
198
+ "Ancient Greek Numbers": range(65856, 65936),
199
+ "Ancient Symbols": range(65936, 66000),
200
+ "Phaistos Disc": range(66000, 66048),
201
+ "Lycian": range(66176, 66208),
202
+ "Carian": range(66208, 66272),
203
+ "Coptic Epact Numbers": range(66272, 66304),
204
+ "Old Italic": range(66304, 66352),
205
+ "Gothic": range(66352, 66384),
206
+ "Old Permic": range(66384, 66432),
207
+ "Ugaritic": range(66432, 66464),
208
+ "Old Persian": range(66464, 66528),
209
+ "Deseret": range(66560, 66640),
210
+ "Shavian": range(66640, 66688),
211
+ "Osmanya": range(66688, 66736),
212
+ "Osage": range(66736, 66816),
213
+ "Elbasan": range(66816, 66864),
214
+ "Caucasian Albanian": range(66864, 66928),
215
+ "Vithkuqi": range(66928, 67008),
216
+ "Linear A": range(67072, 67456),
217
+ "Latin Extended-F": range(67456, 67520),
218
+ "Cypriot Syllabary": range(67584, 67648),
219
+ "Imperial Aramaic": range(67648, 67680),
220
+ "Palmyrene": range(67680, 67712),
221
+ "Nabataean": range(67712, 67760),
222
+ "Hatran": range(67808, 67840),
223
+ "Phoenician": range(67840, 67872),
224
+ "Lydian": range(67872, 67904),
225
+ "Meroitic Hieroglyphs": range(67968, 68000),
226
+ "Meroitic Cursive": range(68000, 68096),
227
+ "Kharoshthi": range(68096, 68192),
228
+ "Old South Arabian": range(68192, 68224),
229
+ "Old North Arabian": range(68224, 68256),
230
+ "Manichaean": range(68288, 68352),
231
+ "Avestan": range(68352, 68416),
232
+ "Inscriptional Parthian": range(68416, 68448),
233
+ "Inscriptional Pahlavi": range(68448, 68480),
234
+ "Psalter Pahlavi": range(68480, 68528),
235
+ "Old Turkic": range(68608, 68688),
236
+ "Old Hungarian": range(68736, 68864),
237
+ "Hanifi Rohingya": range(68864, 68928),
238
+ "Rumi Numeral Symbols": range(69216, 69248),
239
+ "Yezidi": range(69248, 69312),
240
+ "Arabic Extended-C": range(69312, 69376),
241
+ "Old Sogdian": range(69376, 69424),
242
+ "Sogdian": range(69424, 69488),
243
+ "Old Uyghur": range(69488, 69552),
244
+ "Chorasmian": range(69552, 69600),
245
+ "Elymaic": range(69600, 69632),
246
+ "Brahmi": range(69632, 69760),
247
+ "Kaithi": range(69760, 69840),
248
+ "Sora Sompeng": range(69840, 69888),
249
+ "Chakma": range(69888, 69968),
250
+ "Mahajani": range(69968, 70016),
251
+ "Sharada": range(70016, 70112),
252
+ "Sinhala Archaic Numbers": range(70112, 70144),
253
+ "Khojki": range(70144, 70224),
254
+ "Multani": range(70272, 70320),
255
+ "Khudawadi": range(70320, 70400),
256
+ "Grantha": range(70400, 70528),
257
+ "Newa": range(70656, 70784),
258
+ "Tirhuta": range(70784, 70880),
259
+ "Siddham": range(71040, 71168),
260
+ "Modi": range(71168, 71264),
261
+ "Mongolian Supplement": range(71264, 71296),
262
+ "Takri": range(71296, 71376),
263
+ "Ahom": range(71424, 71504),
264
+ "Dogra": range(71680, 71760),
265
+ "Warang Citi": range(71840, 71936),
266
+ "Dives Akuru": range(71936, 72032),
267
+ "Nandinagari": range(72096, 72192),
268
+ "Zanabazar Square": range(72192, 72272),
269
+ "Soyombo": range(72272, 72368),
270
+ "Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
271
+ "Pau Cin Hau": range(72384, 72448),
272
+ "Devanagari Extended-A": range(72448, 72544),
273
+ "Bhaiksuki": range(72704, 72816),
274
+ "Marchen": range(72816, 72896),
275
+ "Masaram Gondi": range(72960, 73056),
276
+ "Gunjala Gondi": range(73056, 73136),
277
+ "Makasar": range(73440, 73472),
278
+ "Kawi": range(73472, 73568),
279
+ "Lisu Supplement": range(73648, 73664),
280
+ "Tamil Supplement": range(73664, 73728),
281
+ "Cuneiform": range(73728, 74752),
282
+ "Cuneiform Numbers and Punctuation": range(74752, 74880),
283
+ "Early Dynastic Cuneiform": range(74880, 75088),
284
+ "Cypro-Minoan": range(77712, 77824),
285
+ "Egyptian Hieroglyphs": range(77824, 78896),
286
+ "Egyptian Hieroglyph Format Controls": range(78896, 78944),
287
+ "Anatolian Hieroglyphs": range(82944, 83584),
288
+ "Bamum Supplement": range(92160, 92736),
289
+ "Mro": range(92736, 92784),
290
+ "Tangsa": range(92784, 92880),
291
+ "Bassa Vah": range(92880, 92928),
292
+ "Pahawh Hmong": range(92928, 93072),
293
+ "Medefaidrin": range(93760, 93856),
294
+ "Miao": range(93952, 94112),
295
+ "Ideographic Symbols and Punctuation": range(94176, 94208),
296
+ "Tangut": range(94208, 100352),
297
+ "Tangut Components": range(100352, 101120),
298
+ "Khitan Small Script": range(101120, 101632),
299
+ "Tangut Supplement": range(101632, 101760),
300
+ "Kana Extended-B": range(110576, 110592),
301
+ "Kana Supplement": range(110592, 110848),
302
+ "Kana Extended-A": range(110848, 110896),
303
+ "Small Kana Extension": range(110896, 110960),
304
+ "Nushu": range(110960, 111360),
305
+ "Duployan": range(113664, 113824),
306
+ "Shorthand Format Controls": range(113824, 113840),
307
+ "Znamenny Musical Notation": range(118528, 118736),
308
+ "Byzantine Musical Symbols": range(118784, 119040),
309
+ "Musical Symbols": range(119040, 119296),
310
+ "Ancient Greek Musical Notation": range(119296, 119376),
311
+ "Kaktovik Numerals": range(119488, 119520),
312
+ "Mayan Numerals": range(119520, 119552),
313
+ "Tai Xuan Jing Symbols": range(119552, 119648),
314
+ "Counting Rod Numerals": range(119648, 119680),
315
+ "Mathematical Alphanumeric Symbols": range(119808, 120832),
316
+ "Sutton SignWriting": range(120832, 121520),
317
+ "Latin Extended-G": range(122624, 122880),
318
+ "Glagolitic Supplement": range(122880, 122928),
319
+ "Cyrillic Extended-D": range(122928, 123024),
320
+ "Nyiakeng Puachue Hmong": range(123136, 123216),
321
+ "Toto": range(123536, 123584),
322
+ "Wancho": range(123584, 123648),
323
+ "Nag Mundari": range(124112, 124160),
324
+ "Ethiopic Extended-B": range(124896, 124928),
325
+ "Mende Kikakui": range(124928, 125152),
326
+ "Adlam": range(125184, 125280),
327
+ "Indic Siyaq Numbers": range(126064, 126144),
328
+ "Ottoman Siyaq Numbers": range(126208, 126288),
329
+ "Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
330
+ "Mahjong Tiles": range(126976, 127024),
331
+ "Domino Tiles": range(127024, 127136),
332
+ "Playing Cards": range(127136, 127232),
333
+ "Enclosed Alphanumeric Supplement": range(127232, 127488),
334
+ "Enclosed Ideographic Supplement": range(127488, 127744),
335
+ "Miscellaneous Symbols and Pictographs": range(127744, 128512),
336
+ "Emoticons range(Emoji)": range(128512, 128592),
337
+ "Ornamental Dingbats": range(128592, 128640),
338
+ "Transport and Map Symbols": range(128640, 128768),
339
+ "Alchemical Symbols": range(128768, 128896),
340
+ "Geometric Shapes Extended": range(128896, 129024),
341
+ "Supplemental Arrows-C": range(129024, 129280),
342
+ "Supplemental Symbols and Pictographs": range(129280, 129536),
343
+ "Chess Symbols": range(129536, 129648),
344
+ "Symbols and Pictographs Extended-A": range(129648, 129792),
345
+ "Symbols for Legacy Computing": range(129792, 130048),
346
+ "CJK Unified Ideographs Extension B": range(131072, 173792),
347
+ "CJK Unified Ideographs Extension C": range(173824, 177984),
348
+ "CJK Unified Ideographs Extension D": range(177984, 178208),
349
+ "CJK Unified Ideographs Extension E": range(178208, 183984),
350
+ "CJK Unified Ideographs Extension F": range(183984, 191472),
351
+ "CJK Compatibility Ideographs Supplement": range(194560, 195104),
352
+ "CJK Unified Ideographs Extension G": range(196608, 201552),
353
+ "CJK Unified Ideographs Extension H": range(201552, 205744),
354
+ "Tags": range(917504, 917632),
355
+ "Variation Selectors Supplement": range(917760, 918000),
356
+ "Supplementary Private Use Area-A": range(983040, 1048576),
357
+ "Supplementary Private Use Area-B": range(1048576, 1114112),
358
+ }
359
+
360
+
361
+ UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [
362
+ "Supplement",
363
+ "Extended",
364
+ "Extensions",
365
+ "Modifier",
366
+ "Marks",
367
+ "Punctuation",
368
+ "Symbols",
369
+ "Forms",
370
+ "Operators",
371
+ "Miscellaneous",
372
+ "Drawing",
373
+ "Block",
374
+ "Shapes",
375
+ "Supplemental",
376
+ "Tags",
377
+ ]
378
+
379
+ RE_POSSIBLE_ENCODING_INDICATION = re_compile(
380
+ r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
381
+ IGNORECASE,
382
+ )
383
+
384
+ IANA_NO_ALIASES = [
385
+ "cp720",
386
+ "cp737",
387
+ "cp856",
388
+ "cp874",
389
+ "cp875",
390
+ "cp1006",
391
+ "koi8_r",
392
+ "koi8_t",
393
+ "koi8_u",
394
+ ]
395
+
396
+ IANA_SUPPORTED: list[str] = sorted(
397
+ filter(
398
+ lambda x: x.endswith("_codec") is False
399
+ and x not in {"rot_13", "tactis", "mbcs"},
400
+ list(set(aliases.values())) + IANA_NO_ALIASES,
401
+ )
402
+ )
403
+
404
+ IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
405
+
406
+ # Pre-computed code pages that are similar, as measured by the cp_similarity function.
407
+ IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
408
+ "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
409
+ "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
410
+ "cp1125": ["cp866"],
411
+ "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
412
+ "cp1250": ["iso8859_2"],
413
+ "cp1251": ["kz1048", "ptcp154"],
414
+ "cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
415
+ "cp1253": ["iso8859_7"],
416
+ "cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
417
+ "cp1257": ["iso8859_13"],
418
+ "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
419
+ "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
420
+ "cp500": ["cp037", "cp1026", "cp1140", "cp273"],
421
+ "cp850": ["cp437", "cp857", "cp858", "cp865"],
422
+ "cp857": ["cp850", "cp858", "cp865"],
423
+ "cp858": ["cp437", "cp850", "cp857", "cp865"],
424
+ "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
425
+ "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
426
+ "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
427
+ "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
428
+ "cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
429
+ "cp866": ["cp1125"],
430
+ "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
431
+ "iso8859_11": ["tis_620"],
432
+ "iso8859_13": ["cp1257"],
433
+ "iso8859_14": [
434
+ "iso8859_10",
435
+ "iso8859_15",
436
+ "iso8859_16",
437
+ "iso8859_3",
438
+ "iso8859_9",
439
+ "latin_1",
440
+ ],
441
+ "iso8859_15": [
442
+ "cp1252",
443
+ "cp1254",
444
+ "iso8859_10",
445
+ "iso8859_14",
446
+ "iso8859_16",
447
+ "iso8859_3",
448
+ "iso8859_9",
449
+ "latin_1",
450
+ ],
451
+ "iso8859_16": [
452
+ "iso8859_14",
453
+ "iso8859_15",
454
+ "iso8859_2",
455
+ "iso8859_3",
456
+ "iso8859_9",
457
+ "latin_1",
458
+ ],
459
+ "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
460
+ "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
461
+ "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
462
+ "iso8859_7": ["cp1253"],
463
+ "iso8859_9": [
464
+ "cp1252",
465
+ "cp1254",
466
+ "cp1258",
467
+ "iso8859_10",
468
+ "iso8859_14",
469
+ "iso8859_15",
470
+ "iso8859_16",
471
+ "iso8859_3",
472
+ "iso8859_4",
473
+ "latin_1",
474
+ ],
475
+ "kz1048": ["cp1251", "ptcp154"],
476
+ "latin_1": [
477
+ "cp1252",
478
+ "cp1254",
479
+ "cp1258",
480
+ "iso8859_10",
481
+ "iso8859_14",
482
+ "iso8859_15",
483
+ "iso8859_16",
484
+ "iso8859_3",
485
+ "iso8859_4",
486
+ "iso8859_9",
487
+ ],
488
+ "mac_iceland": ["mac_roman", "mac_turkish"],
489
+ "mac_roman": ["mac_iceland", "mac_turkish"],
490
+ "mac_turkish": ["mac_iceland", "mac_roman"],
491
+ "ptcp154": ["cp1251", "kz1048"],
492
+ "tis_620": ["iso8859_11"],
493
+ }
494
+
495
+
496
+ CHARDET_CORRESPONDENCE: dict[str, str] = {
497
+ "iso2022_kr": "ISO-2022-KR",
498
+ "iso2022_jp": "ISO-2022-JP",
499
+ "euc_kr": "EUC-KR",
500
+ "tis_620": "TIS-620",
501
+ "utf_32": "UTF-32",
502
+ "euc_jp": "EUC-JP",
503
+ "koi8_r": "KOI8-R",
504
+ "iso8859_1": "ISO-8859-1",
505
+ "iso8859_2": "ISO-8859-2",
506
+ "iso8859_5": "ISO-8859-5",
507
+ "iso8859_6": "ISO-8859-6",
508
+ "iso8859_7": "ISO-8859-7",
509
+ "iso8859_8": "ISO-8859-8",
510
+ "utf_16": "UTF-16",
511
+ "cp855": "IBM855",
512
+ "mac_cyrillic": "MacCyrillic",
513
+ "gb2312": "GB2312",
514
+ "gb18030": "GB18030",
515
+ "cp932": "CP932",
516
+ "cp866": "IBM866",
517
+ "utf_8": "utf-8",
518
+ "utf_8_sig": "UTF-8-SIG",
519
+ "shift_jis": "SHIFT_JIS",
520
+ "big5": "Big5",
521
+ "cp1250": "windows-1250",
522
+ "cp1251": "windows-1251",
523
+ "cp1252": "Windows-1252",
524
+ "cp1253": "windows-1253",
525
+ "cp1255": "windows-1255",
526
+ "cp1256": "windows-1256",
527
+ "cp1254": "Windows-1254",
528
+ "cp949": "CP949",
529
+ }
530
+
531
+
532
+ COMMON_SAFE_ASCII_CHARACTERS: set[str] = {
533
+ "<",
534
+ ">",
535
+ "=",
536
+ ":",
537
+ "/",
538
+ "&",
539
+ ";",
540
+ "{",
541
+ "}",
542
+ "[",
543
+ "]",
544
+ ",",
545
+ "|",
546
+ '"',
547
+ "-",
548
+ "(",
549
+ ")",
550
+ }
551
+
552
+ # Sample character sets — replace with full lists if needed
553
+ COMMON_CHINESE_CHARACTERS = "的一是在不了有和人这中大为上个国我以要他时来用们生到作地于出就分对成会可主发年动同工也能下过子说产种面而方后多定行学法所民得经十三之进着等部度家电力里如水化高自二理起小物现实加量都两体制机当使点从业本去把性好应开它合还因由其些然前外天政四日那社义事平形相全表间样与关各重新线内数正心反你明看原又么利比或但质气第向道命此变条只没结解问意建月公无系军很情者最立代想已通并提直题党程展五果料象员革位入常文总次品式活设及管特件长求老头基资边流路级少图山统接知较将组见计别她手角期根论运农指几九区强放决西被干做必战先回则任取据处队南给色光门即保治北造百规热领七海口东导器压志世金增争济阶油思术极交受联什认六共权收证改清己美再采转更单风切打白教速花带安场身车例真务具万每目至达走积示议声报斗完类八离华名确才科张信马节话米整空元况今集温传土许步群广石记需段研界拉林律叫且究观越织装影算低持音众书布复容儿须际商非验连断深难近矿千周委素技备半办青省列习响约支般史感劳便团往酸历市克何除消构府太准精值号率族维划选标写存候毛亲快效斯院查江型眼王按格养易置派层片始却专状育厂京识适属圆包火住调满县局照参红细引听该铁价严龙飞"
554
+
555
+ COMMON_JAPANESE_CHARACTERS = "日一国年大十二本中長出三時行見月分後前生五間上東四今金九入学高円子外八六下来気小七山話女北午百書先名川千水半男西電校語土木聞食車何南万毎白天母火右読友左休父雨"
556
+
557
+ COMMON_KOREAN_CHARACTERS = "一二三四五六七八九十百千萬上下左右中人女子大小山川日月火水木金土父母天地國名年時文校學生"
558
+
559
+ # Combine all into a set
560
+ COMMON_CJK_CHARACTERS = set(
561
+ "".join(
562
+ [
563
+ COMMON_CHINESE_CHARACTERS,
564
+ COMMON_JAPANESE_CHARACTERS,
565
+ COMMON_KOREAN_CHARACTERS,
566
+ ]
567
+ )
568
+ )
569
+
570
+ KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"}
571
+ ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"}
572
+
573
+ # Custom logging LEVEL below DEBUG
574
+ TRACE: int = 5
575
+
576
+
577
+ # Language labels that contain the em dash "—"
578
+ # character are treated as alternative sequences of the original label
579
+ FREQUENCIES: dict[str, list[str]] = {
580
+ "English": [
581
+ "e",
582
+ "a",
583
+ "t",
584
+ "i",
585
+ "o",
586
+ "n",
587
+ "s",
588
+ "r",
589
+ "h",
590
+ "l",
591
+ "d",
592
+ "c",
593
+ "u",
594
+ "m",
595
+ "f",
596
+ "p",
597
+ "g",
598
+ "w",
599
+ "y",
600
+ "b",
601
+ "v",
602
+ "k",
603
+ "x",
604
+ "j",
605
+ "z",
606
+ "q",
607
+ ],
608
+ "English—": [
609
+ "e",
610
+ "a",
611
+ "t",
612
+ "i",
613
+ "o",
614
+ "n",
615
+ "s",
616
+ "r",
617
+ "h",
618
+ "l",
619
+ "d",
620
+ "c",
621
+ "m",
622
+ "u",
623
+ "f",
624
+ "p",
625
+ "g",
626
+ "w",
627
+ "b",
628
+ "y",
629
+ "v",
630
+ "k",
631
+ "j",
632
+ "x",
633
+ "z",
634
+ "q",
635
+ ],
636
+ "German": [
637
+ "e",
638
+ "n",
639
+ "i",
640
+ "r",
641
+ "s",
642
+ "t",
643
+ "a",
644
+ "d",
645
+ "h",
646
+ "u",
647
+ "l",
648
+ "g",
649
+ "o",
650
+ "c",
651
+ "m",
652
+ "b",
653
+ "f",
654
+ "k",
655
+ "w",
656
+ "z",
657
+ "p",
658
+ "v",
659
+ "ü",
660
+ "ä",
661
+ "ö",
662
+ "j",
663
+ ],
664
+ "French": [
665
+ "e",
666
+ "a",
667
+ "s",
668
+ "n",
669
+ "i",
670
+ "t",
671
+ "r",
672
+ "l",
673
+ "u",
674
+ "o",
675
+ "d",
676
+ "c",
677
+ "p",
678
+ "m",
679
+ "é",
680
+ "v",
681
+ "g",
682
+ "f",
683
+ "b",
684
+ "h",
685
+ "q",
686
+ "à",
687
+ "x",
688
+ "è",
689
+ "y",
690
+ "j",
691
+ ],
692
+ "Dutch": [
693
+ "e",
694
+ "n",
695
+ "a",
696
+ "i",
697
+ "r",
698
+ "t",
699
+ "o",
700
+ "d",
701
+ "s",
702
+ "l",
703
+ "g",
704
+ "h",
705
+ "v",
706
+ "m",
707
+ "u",
708
+ "k",
709
+ "c",
710
+ "p",
711
+ "b",
712
+ "w",
713
+ "j",
714
+ "z",
715
+ "f",
716
+ "y",
717
+ "x",
718
+ "ë",
719
+ ],
720
+ "Italian": [
721
+ "e",
722
+ "i",
723
+ "a",
724
+ "o",
725
+ "n",
726
+ "l",
727
+ "t",
728
+ "r",
729
+ "s",
730
+ "c",
731
+ "d",
732
+ "u",
733
+ "p",
734
+ "m",
735
+ "g",
736
+ "v",
737
+ "f",
738
+ "b",
739
+ "z",
740
+ "h",
741
+ "q",
742
+ "è",
743
+ "à",
744
+ "k",
745
+ "y",
746
+ "ò",
747
+ ],
748
+ "Polish": [
749
+ "a",
750
+ "i",
751
+ "o",
752
+ "e",
753
+ "n",
754
+ "r",
755
+ "z",
756
+ "w",
757
+ "s",
758
+ "c",
759
+ "t",
760
+ "k",
761
+ "y",
762
+ "d",
763
+ "p",
764
+ "m",
765
+ "u",
766
+ "l",
767
+ "j",
768
+ "ł",
769
+ "g",
770
+ "b",
771
+ "h",
772
+ "ą",
773
+ "ę",
774
+ "ó",
775
+ ],
776
+ "Spanish": [
777
+ "e",
778
+ "a",
779
+ "o",
780
+ "n",
781
+ "s",
782
+ "r",
783
+ "i",
784
+ "l",
785
+ "d",
786
+ "t",
787
+ "c",
788
+ "u",
789
+ "m",
790
+ "p",
791
+ "b",
792
+ "g",
793
+ "v",
794
+ "f",
795
+ "y",
796
+ "ó",
797
+ "h",
798
+ "q",
799
+ "í",
800
+ "j",
801
+ "z",
802
+ "á",
803
+ ],
804
+ "Russian": [
805
+ "о",
806
+ "а",
807
+ "е",
808
+ "и",
809
+ "н",
810
+ "с",
811
+ "т",
812
+ "р",
813
+ "в",
814
+ "л",
815
+ "к",
816
+ "м",
817
+ "д",
818
+ "п",
819
+ "у",
820
+ "г",
821
+ "я",
822
+ "ы",
823
+ "з",
824
+ "б",
825
+ "й",
826
+ "ь",
827
+ "ч",
828
+ "х",
829
+ "ж",
830
+ "ц",
831
+ ],
832
+ # Jap-Kanji
833
+ "Japanese": [
834
+ "人",
835
+ "一",
836
+ "大",
837
+ "亅",
838
+ "丁",
839
+ "丨",
840
+ "竹",
841
+ "笑",
842
+ "口",
843
+ "日",
844
+ "今",
845
+ "二",
846
+ "彳",
847
+ "行",
848
+ "十",
849
+ "土",
850
+ "丶",
851
+ "寸",
852
+ "寺",
853
+ "時",
854
+ "乙",
855
+ "丿",
856
+ "乂",
857
+ "气",
858
+ "気",
859
+ "冂",
860
+ "巾",
861
+ "亠",
862
+ "市",
863
+ "目",
864
+ "儿",
865
+ "見",
866
+ "八",
867
+ "小",
868
+ "凵",
869
+ "県",
870
+ "月",
871
+ "彐",
872
+ "門",
873
+ "間",
874
+ "木",
875
+ "東",
876
+ "山",
877
+ "出",
878
+ "本",
879
+ "中",
880
+ "刀",
881
+ "分",
882
+ "耳",
883
+ "又",
884
+ "取",
885
+ "最",
886
+ "言",
887
+ "田",
888
+ "心",
889
+ "思",
890
+ "刂",
891
+ "前",
892
+ "京",
893
+ "尹",
894
+ "事",
895
+ "生",
896
+ "厶",
897
+ "云",
898
+ "会",
899
+ "未",
900
+ "来",
901
+ "白",
902
+ "冫",
903
+ "楽",
904
+ "灬",
905
+ "馬",
906
+ "尸",
907
+ "尺",
908
+ "駅",
909
+ "明",
910
+ "耂",
911
+ "者",
912
+ "了",
913
+ "阝",
914
+ "都",
915
+ "高",
916
+ "卜",
917
+ "占",
918
+ "厂",
919
+ "广",
920
+ "店",
921
+ "子",
922
+ "申",
923
+ "奄",
924
+ "亻",
925
+ "俺",
926
+ "上",
927
+ "方",
928
+ "冖",
929
+ "学",
930
+ "衣",
931
+ "艮",
932
+ "食",
933
+ "自",
934
+ ],
935
+ # Jap-Katakana
936
+ "Japanese—": [
937
+ "ー",
938
+ "ン",
939
+ "ス",
940
+ "・",
941
+ "ル",
942
+ "ト",
943
+ "リ",
944
+ "イ",
945
+ "ア",
946
+ "ラ",
947
+ "ッ",
948
+ "ク",
949
+ "ド",
950
+ "シ",
951
+ "レ",
952
+ "ジ",
953
+ "タ",
954
+ "フ",
955
+ "ロ",
956
+ "カ",
957
+ "テ",
958
+ "マ",
959
+ "ィ",
960
+ "グ",
961
+ "バ",
962
+ "ム",
963
+ "プ",
964
+ "オ",
965
+ "コ",
966
+ "デ",
967
+ "ニ",
968
+ "ウ",
969
+ "メ",
970
+ "サ",
971
+ "ビ",
972
+ "ナ",
973
+ "ブ",
974
+ "ャ",
975
+ "エ",
976
+ "ュ",
977
+ "チ",
978
+ "キ",
979
+ "ズ",
980
+ "ダ",
981
+ "パ",
982
+ "ミ",
983
+ "ェ",
984
+ "ョ",
985
+ "ハ",
986
+ "セ",
987
+ "ベ",
988
+ "ガ",
989
+ "モ",
990
+ "ツ",
991
+ "ネ",
992
+ "ボ",
993
+ "ソ",
994
+ "ノ",
995
+ "ァ",
996
+ "ヴ",
997
+ "ワ",
998
+ "ポ",
999
+ "ペ",
1000
+ "ピ",
1001
+ "ケ",
1002
+ "ゴ",
1003
+ "ギ",
1004
+ "ザ",
1005
+ "ホ",
1006
+ "ゲ",
1007
+ "ォ",
1008
+ "ヤ",
1009
+ "ヒ",
1010
+ "ユ",
1011
+ "ヨ",
1012
+ "ヘ",
1013
+ "ゼ",
1014
+ "ヌ",
1015
+ "ゥ",
1016
+ "ゾ",
1017
+ "ヶ",
1018
+ "ヂ",
1019
+ "ヲ",
1020
+ "ヅ",
1021
+ "ヵ",
1022
+ "ヱ",
1023
+ "ヰ",
1024
+ "ヮ",
1025
+ "ヽ",
1026
+ "゠",
1027
+ "ヾ",
1028
+ "ヷ",
1029
+ "ヿ",
1030
+ "ヸ",
1031
+ "ヹ",
1032
+ "ヺ",
1033
+ ],
1034
+ # Jap-Hiragana
1035
+ "Japanese——": [
1036
+ "の",
1037
+ "に",
1038
+ "る",
1039
+ "た",
1040
+ "と",
1041
+ "は",
1042
+ "し",
1043
+ "い",
1044
+ "を",
1045
+ "で",
1046
+ "て",
1047
+ "が",
1048
+ "な",
1049
+ "れ",
1050
+ "か",
1051
+ "ら",
1052
+ "さ",
1053
+ "っ",
1054
+ "り",
1055
+ "す",
1056
+ "あ",
1057
+ "も",
1058
+ "こ",
1059
+ "ま",
1060
+ "う",
1061
+ "く",
1062
+ "よ",
1063
+ "き",
1064
+ "ん",
1065
+ "め",
1066
+ "お",
1067
+ "け",
1068
+ "そ",
1069
+ "つ",
1070
+ "だ",
1071
+ "や",
1072
+ "え",
1073
+ "ど",
1074
+ "わ",
1075
+ "ち",
1076
+ "み",
1077
+ "せ",
1078
+ "じ",
1079
+ "ば",
1080
+ "へ",
1081
+ "び",
1082
+ "ず",
1083
+ "ろ",
1084
+ "ほ",
1085
+ "げ",
1086
+ "む",
1087
+ "べ",
1088
+ "ひ",
1089
+ "ょ",
1090
+ "ゆ",
1091
+ "ぶ",
1092
+ "ご",
1093
+ "ゃ",
1094
+ "ね",
1095
+ "ふ",
1096
+ "ぐ",
1097
+ "ぎ",
1098
+ "ぼ",
1099
+ "ゅ",
1100
+ "づ",
1101
+ "ざ",
1102
+ "ぞ",
1103
+ "ぬ",
1104
+ "ぜ",
1105
+ "ぱ",
1106
+ "ぽ",
1107
+ "ぷ",
1108
+ "ぴ",
1109
+ "ぃ",
1110
+ "ぁ",
1111
+ "ぇ",
1112
+ "ぺ",
1113
+ "ゞ",
1114
+ "ぢ",
1115
+ "ぉ",
1116
+ "ぅ",
1117
+ "ゐ",
1118
+ "ゝ",
1119
+ "ゑ",
1120
+ "゛",
1121
+ "゜",
1122
+ "ゎ",
1123
+ "ゔ",
1124
+ "゚",
1125
+ "ゟ",
1126
+ "゙",
1127
+ "ゕ",
1128
+ "ゖ",
1129
+ ],
1130
+ "Portuguese": [
1131
+ "a",
1132
+ "e",
1133
+ "o",
1134
+ "s",
1135
+ "i",
1136
+ "r",
1137
+ "d",
1138
+ "n",
1139
+ "t",
1140
+ "m",
1141
+ "u",
1142
+ "c",
1143
+ "l",
1144
+ "p",
1145
+ "g",
1146
+ "v",
1147
+ "b",
1148
+ "f",
1149
+ "h",
1150
+ "ã",
1151
+ "q",
1152
+ "é",
1153
+ "ç",
1154
+ "á",
1155
+ "z",
1156
+ "í",
1157
+ ],
1158
+ "Swedish": [
1159
+ "e",
1160
+ "a",
1161
+ "n",
1162
+ "r",
1163
+ "t",
1164
+ "s",
1165
+ "i",
1166
+ "l",
1167
+ "d",
1168
+ "o",
1169
+ "m",
1170
+ "k",
1171
+ "g",
1172
+ "v",
1173
+ "h",
1174
+ "f",
1175
+ "u",
1176
+ "p",
1177
+ "ä",
1178
+ "c",
1179
+ "b",
1180
+ "ö",
1181
+ "å",
1182
+ "y",
1183
+ "j",
1184
+ "x",
1185
+ ],
1186
+ "Chinese": [
1187
+ "的",
1188
+ "一",
1189
+ "是",
1190
+ "不",
1191
+ "了",
1192
+ "在",
1193
+ "人",
1194
+ "有",
1195
+ "我",
1196
+ "他",
1197
+ "这",
1198
+ "个",
1199
+ "们",
1200
+ "中",
1201
+ "来",
1202
+ "上",
1203
+ "大",
1204
+ "为",
1205
+ "和",
1206
+ "国",
1207
+ "地",
1208
+ "到",
1209
+ "以",
1210
+ "说",
1211
+ "时",
1212
+ "要",
1213
+ "就",
1214
+ "出",
1215
+ "会",
1216
+ "可",
1217
+ "也",
1218
+ "你",
1219
+ "对",
1220
+ "生",
1221
+ "能",
1222
+ "而",
1223
+ "子",
1224
+ "那",
1225
+ "得",
1226
+ "于",
1227
+ "着",
1228
+ "下",
1229
+ "自",
1230
+ "之",
1231
+ "年",
1232
+ "过",
1233
+ "发",
1234
+ "后",
1235
+ "作",
1236
+ "里",
1237
+ "用",
1238
+ "道",
1239
+ "行",
1240
+ "所",
1241
+ "然",
1242
+ "家",
1243
+ "种",
1244
+ "事",
1245
+ "成",
1246
+ "方",
1247
+ "多",
1248
+ "经",
1249
+ "么",
1250
+ "去",
1251
+ "法",
1252
+ "学",
1253
+ "如",
1254
+ "都",
1255
+ "同",
1256
+ "现",
1257
+ "当",
1258
+ "没",
1259
+ "动",
1260
+ "面",
1261
+ "起",
1262
+ "看",
1263
+ "定",
1264
+ "天",
1265
+ "分",
1266
+ "还",
1267
+ "进",
1268
+ "好",
1269
+ "小",
1270
+ "部",
1271
+ "其",
1272
+ "些",
1273
+ "主",
1274
+ "样",
1275
+ "理",
1276
+ "心",
1277
+ "她",
1278
+ "本",
1279
+ "前",
1280
+ "开",
1281
+ "但",
1282
+ "因",
1283
+ "只",
1284
+ "从",
1285
+ "想",
1286
+ "实",
1287
+ ],
1288
+ "Ukrainian": [
1289
+ "о",
1290
+ "а",
1291
+ "н",
1292
+ "і",
1293
+ "и",
1294
+ "р",
1295
+ "в",
1296
+ "т",
1297
+ "е",
1298
+ "с",
1299
+ "к",
1300
+ "л",
1301
+ "у",
1302
+ "д",
1303
+ "м",
1304
+ "п",
1305
+ "з",
1306
+ "я",
1307
+ "ь",
1308
+ "б",
1309
+ "г",
1310
+ "й",
1311
+ "ч",
1312
+ "х",
1313
+ "ц",
1314
+ "ї",
1315
+ ],
1316
+ "Norwegian": [
1317
+ "e",
1318
+ "r",
1319
+ "n",
1320
+ "t",
1321
+ "a",
1322
+ "s",
1323
+ "i",
1324
+ "o",
1325
+ "l",
1326
+ "d",
1327
+ "g",
1328
+ "k",
1329
+ "m",
1330
+ "v",
1331
+ "f",
1332
+ "p",
1333
+ "u",
1334
+ "b",
1335
+ "h",
1336
+ "å",
1337
+ "y",
1338
+ "j",
1339
+ "ø",
1340
+ "c",
1341
+ "æ",
1342
+ "w",
1343
+ ],
1344
+ "Finnish": [
1345
+ "a",
1346
+ "i",
1347
+ "n",
1348
+ "t",
1349
+ "e",
1350
+ "s",
1351
+ "l",
1352
+ "o",
1353
+ "u",
1354
+ "k",
1355
+ "ä",
1356
+ "m",
1357
+ "r",
1358
+ "v",
1359
+ "j",
1360
+ "h",
1361
+ "p",
1362
+ "y",
1363
+ "d",
1364
+ "ö",
1365
+ "g",
1366
+ "c",
1367
+ "b",
1368
+ "f",
1369
+ "w",
1370
+ "z",
1371
+ ],
1372
+ "Vietnamese": [
1373
+ "n",
1374
+ "h",
1375
+ "t",
1376
+ "i",
1377
+ "c",
1378
+ "g",
1379
+ "a",
1380
+ "o",
1381
+ "u",
1382
+ "m",
1383
+ "l",
1384
+ "r",
1385
+ "à",
1386
+ "đ",
1387
+ "s",
1388
+ "e",
1389
+ "v",
1390
+ "p",
1391
+ "b",
1392
+ "y",
1393
+ "ư",
1394
+ "d",
1395
+ "á",
1396
+ "k",
1397
+ "ộ",
1398
+ "ế",
1399
+ ],
1400
+ "Czech": [
1401
+ "o",
1402
+ "e",
1403
+ "a",
1404
+ "n",
1405
+ "t",
1406
+ "s",
1407
+ "i",
1408
+ "l",
1409
+ "v",
1410
+ "r",
1411
+ "k",
1412
+ "d",
1413
+ "u",
1414
+ "m",
1415
+ "p",
1416
+ "í",
1417
+ "c",
1418
+ "h",
1419
+ "z",
1420
+ "á",
1421
+ "y",
1422
+ "j",
1423
+ "b",
1424
+ "ě",
1425
+ "é",
1426
+ "ř",
1427
+ ],
1428
+ "Hungarian": [
1429
+ "e",
1430
+ "a",
1431
+ "t",
1432
+ "l",
1433
+ "s",
1434
+ "n",
1435
+ "k",
1436
+ "r",
1437
+ "i",
1438
+ "o",
1439
+ "z",
1440
+ "á",
1441
+ "é",
1442
+ "g",
1443
+ "m",
1444
+ "b",
1445
+ "y",
1446
+ "v",
1447
+ "d",
1448
+ "h",
1449
+ "u",
1450
+ "p",
1451
+ "j",
1452
+ "ö",
1453
+ "f",
1454
+ "c",
1455
+ ],
1456
+ "Korean": [
1457
+ "이",
1458
+ "다",
1459
+ "에",
1460
+ "의",
1461
+ "는",
1462
+ "로",
1463
+ "하",
1464
+ "을",
1465
+ "가",
1466
+ "고",
1467
+ "지",
1468
+ "서",
1469
+ "한",
1470
+ "은",
1471
+ "기",
1472
+ "으",
1473
+ "년",
1474
+ "대",
1475
+ "사",
1476
+ "시",
1477
+ "를",
1478
+ "리",
1479
+ "도",
1480
+ "인",
1481
+ "스",
1482
+ "일",
1483
+ ],
1484
+ "Indonesian": [
1485
+ "a",
1486
+ "n",
1487
+ "e",
1488
+ "i",
1489
+ "r",
1490
+ "t",
1491
+ "u",
1492
+ "s",
1493
+ "d",
1494
+ "k",
1495
+ "m",
1496
+ "l",
1497
+ "g",
1498
+ "p",
1499
+ "b",
1500
+ "o",
1501
+ "h",
1502
+ "y",
1503
+ "j",
1504
+ "c",
1505
+ "w",
1506
+ "f",
1507
+ "v",
1508
+ "z",
1509
+ "x",
1510
+ "q",
1511
+ ],
1512
+ "Turkish": [
1513
+ "a",
1514
+ "e",
1515
+ "i",
1516
+ "n",
1517
+ "r",
1518
+ "l",
1519
+ "ı",
1520
+ "k",
1521
+ "d",
1522
+ "t",
1523
+ "s",
1524
+ "m",
1525
+ "y",
1526
+ "u",
1527
+ "o",
1528
+ "b",
1529
+ "ü",
1530
+ "ş",
1531
+ "v",
1532
+ "g",
1533
+ "z",
1534
+ "h",
1535
+ "c",
1536
+ "p",
1537
+ "ç",
1538
+ "ğ",
1539
+ ],
1540
+ "Romanian": [
1541
+ "e",
1542
+ "i",
1543
+ "a",
1544
+ "r",
1545
+ "n",
1546
+ "t",
1547
+ "u",
1548
+ "l",
1549
+ "o",
1550
+ "c",
1551
+ "s",
1552
+ "d",
1553
+ "p",
1554
+ "m",
1555
+ "ă",
1556
+ "f",
1557
+ "v",
1558
+ "î",
1559
+ "g",
1560
+ "b",
1561
+ "ș",
1562
+ "ț",
1563
+ "z",
1564
+ "h",
1565
+ "â",
1566
+ "j",
1567
+ ],
1568
+ "Farsi": [
1569
+ "ا",
1570
+ "ی",
1571
+ "ر",
1572
+ "د",
1573
+ "ن",
1574
+ "ه",
1575
+ "و",
1576
+ "م",
1577
+ "ت",
1578
+ "ب",
1579
+ "س",
1580
+ "ل",
1581
+ "ک",
1582
+ "ش",
1583
+ "ز",
1584
+ "ف",
1585
+ "گ",
1586
+ "ع",
1587
+ "خ",
1588
+ "ق",
1589
+ "ج",
1590
+ "آ",
1591
+ "پ",
1592
+ "ح",
1593
+ "ط",
1594
+ "ص",
1595
+ ],
1596
+ "Arabic": [
1597
+ "ا",
1598
+ "ل",
1599
+ "ي",
1600
+ "م",
1601
+ "و",
1602
+ "ن",
1603
+ "ر",
1604
+ "ت",
1605
+ "ب",
1606
+ "ة",
1607
+ "ع",
1608
+ "د",
1609
+ "س",
1610
+ "ف",
1611
+ "ه",
1612
+ "ك",
1613
+ "ق",
1614
+ "أ",
1615
+ "ح",
1616
+ "ج",
1617
+ "ش",
1618
+ "ط",
1619
+ "ص",
1620
+ "ى",
1621
+ "خ",
1622
+ "إ",
1623
+ ],
1624
+ "Danish": [
1625
+ "e",
1626
+ "r",
1627
+ "n",
1628
+ "t",
1629
+ "a",
1630
+ "i",
1631
+ "s",
1632
+ "d",
1633
+ "l",
1634
+ "o",
1635
+ "g",
1636
+ "m",
1637
+ "k",
1638
+ "f",
1639
+ "v",
1640
+ "u",
1641
+ "b",
1642
+ "h",
1643
+ "p",
1644
+ "å",
1645
+ "y",
1646
+ "ø",
1647
+ "æ",
1648
+ "c",
1649
+ "j",
1650
+ "w",
1651
+ ],
1652
+ "Serbian": [
1653
+ "а",
1654
+ "и",
1655
+ "о",
1656
+ "е",
1657
+ "н",
1658
+ "р",
1659
+ "с",
1660
+ "у",
1661
+ "т",
1662
+ "к",
1663
+ "ј",
1664
+ "в",
1665
+ "д",
1666
+ "м",
1667
+ "п",
1668
+ "л",
1669
+ "г",
1670
+ "з",
1671
+ "б",
1672
+ "a",
1673
+ "i",
1674
+ "e",
1675
+ "o",
1676
+ "n",
1677
+ "ц",
1678
+ "ш",
1679
+ ],
1680
+ "Lithuanian": [
1681
+ "i",
1682
+ "a",
1683
+ "s",
1684
+ "o",
1685
+ "r",
1686
+ "e",
1687
+ "t",
1688
+ "n",
1689
+ "u",
1690
+ "k",
1691
+ "m",
1692
+ "l",
1693
+ "p",
1694
+ "v",
1695
+ "d",
1696
+ "j",
1697
+ "g",
1698
+ "ė",
1699
+ "b",
1700
+ "y",
1701
+ "ų",
1702
+ "š",
1703
+ "ž",
1704
+ "c",
1705
+ "ą",
1706
+ "į",
1707
+ ],
1708
+ "Slovene": [
1709
+ "e",
1710
+ "a",
1711
+ "i",
1712
+ "o",
1713
+ "n",
1714
+ "r",
1715
+ "s",
1716
+ "l",
1717
+ "t",
1718
+ "j",
1719
+ "v",
1720
+ "k",
1721
+ "d",
1722
+ "p",
1723
+ "m",
1724
+ "u",
1725
+ "z",
1726
+ "b",
1727
+ "g",
1728
+ "h",
1729
+ "č",
1730
+ "c",
1731
+ "š",
1732
+ "ž",
1733
+ "f",
1734
+ "y",
1735
+ ],
1736
+ "Slovak": [
1737
+ "o",
1738
+ "a",
1739
+ "e",
1740
+ "n",
1741
+ "i",
1742
+ "r",
1743
+ "v",
1744
+ "t",
1745
+ "s",
1746
+ "l",
1747
+ "k",
1748
+ "d",
1749
+ "m",
1750
+ "p",
1751
+ "u",
1752
+ "c",
1753
+ "h",
1754
+ "j",
1755
+ "b",
1756
+ "z",
1757
+ "á",
1758
+ "y",
1759
+ "ý",
1760
+ "í",
1761
+ "č",
1762
+ "é",
1763
+ ],
1764
+ "Hebrew": [
1765
+ "י",
1766
+ "ו",
1767
+ "ה",
1768
+ "ל",
1769
+ "ר",
1770
+ "ב",
1771
+ "ת",
1772
+ "מ",
1773
+ "א",
1774
+ "ש",
1775
+ "נ",
1776
+ "ע",
1777
+ "ם",
1778
+ "ד",
1779
+ "ק",
1780
+ "ח",
1781
+ "פ",
1782
+ "ס",
1783
+ "כ",
1784
+ "ג",
1785
+ "ט",
1786
+ "צ",
1787
+ "ן",
1788
+ "ז",
1789
+ "ך",
1790
+ ],
1791
+ "Bulgarian": [
1792
+ "а",
1793
+ "и",
1794
+ "о",
1795
+ "е",
1796
+ "н",
1797
+ "т",
1798
+ "р",
1799
+ "с",
1800
+ "в",
1801
+ "л",
1802
+ "к",
1803
+ "д",
1804
+ "п",
1805
+ "м",
1806
+ "з",
1807
+ "г",
1808
+ "я",
1809
+ "ъ",
1810
+ "у",
1811
+ "б",
1812
+ "ч",
1813
+ "ц",
1814
+ "й",
1815
+ "ж",
1816
+ "щ",
1817
+ "х",
1818
+ ],
1819
+ "Croatian": [
1820
+ "a",
1821
+ "i",
1822
+ "o",
1823
+ "e",
1824
+ "n",
1825
+ "r",
1826
+ "j",
1827
+ "s",
1828
+ "t",
1829
+ "u",
1830
+ "k",
1831
+ "l",
1832
+ "v",
1833
+ "d",
1834
+ "m",
1835
+ "p",
1836
+ "g",
1837
+ "z",
1838
+ "b",
1839
+ "c",
1840
+ "č",
1841
+ "h",
1842
+ "š",
1843
+ "ž",
1844
+ "ć",
1845
+ "f",
1846
+ ],
1847
+ "Hindi": [
1848
+ "क",
1849
+ "र",
1850
+ "स",
1851
+ "न",
1852
+ "त",
1853
+ "म",
1854
+ "ह",
1855
+ "प",
1856
+ "य",
1857
+ "ल",
1858
+ "व",
1859
+ "ज",
1860
+ "द",
1861
+ "ग",
1862
+ "ब",
1863
+ "श",
1864
+ "ट",
1865
+ "अ",
1866
+ "ए",
1867
+ "थ",
1868
+ "भ",
1869
+ "ड",
1870
+ "च",
1871
+ "ध",
1872
+ "ष",
1873
+ "इ",
1874
+ ],
1875
+ "Estonian": [
1876
+ "a",
1877
+ "i",
1878
+ "e",
1879
+ "s",
1880
+ "t",
1881
+ "l",
1882
+ "u",
1883
+ "n",
1884
+ "o",
1885
+ "k",
1886
+ "r",
1887
+ "d",
1888
+ "m",
1889
+ "v",
1890
+ "g",
1891
+ "p",
1892
+ "j",
1893
+ "h",
1894
+ "ä",
1895
+ "b",
1896
+ "õ",
1897
+ "ü",
1898
+ "f",
1899
+ "c",
1900
+ "ö",
1901
+ "y",
1902
+ ],
1903
+ "Thai": [
1904
+ "า",
1905
+ "น",
1906
+ "ร",
1907
+ "อ",
1908
+ "ก",
1909
+ "เ",
1910
+ "ง",
1911
+ "ม",
1912
+ "ย",
1913
+ "ล",
1914
+ "ว",
1915
+ "ด",
1916
+ "ท",
1917
+ "ส",
1918
+ "ต",
1919
+ "ะ",
1920
+ "ป",
1921
+ "บ",
1922
+ "ค",
1923
+ "ห",
1924
+ "แ",
1925
+ "จ",
1926
+ "พ",
1927
+ "ช",
1928
+ "ข",
1929
+ "ใ",
1930
+ ],
1931
+ "Greek": [
1932
+ "α",
1933
+ "τ",
1934
+ "ο",
1935
+ "ι",
1936
+ "ε",
1937
+ "ν",
1938
+ "ρ",
1939
+ "σ",
1940
+ "κ",
1941
+ "η",
1942
+ "π",
1943
+ "ς",
1944
+ "υ",
1945
+ "μ",
1946
+ "λ",
1947
+ "ί",
1948
+ "ό",
1949
+ "ά",
1950
+ "γ",
1951
+ "έ",
1952
+ "δ",
1953
+ "ή",
1954
+ "ω",
1955
+ "χ",
1956
+ "θ",
1957
+ "ύ",
1958
+ ],
1959
+ "Tamil": [
1960
+ "க",
1961
+ "த",
1962
+ "ப",
1963
+ "ட",
1964
+ "ர",
1965
+ "ம",
1966
+ "ல",
1967
+ "ன",
1968
+ "வ",
1969
+ "ற",
1970
+ "ய",
1971
+ "ள",
1972
+ "ச",
1973
+ "ந",
1974
+ "இ",
1975
+ "ண",
1976
+ "அ",
1977
+ "ஆ",
1978
+ "ழ",
1979
+ "ங",
1980
+ "எ",
1981
+ "உ",
1982
+ "ஒ",
1983
+ "ஸ",
1984
+ ],
1985
+ "Kazakh": [
1986
+ "а",
1987
+ "ы",
1988
+ "е",
1989
+ "н",
1990
+ "т",
1991
+ "р",
1992
+ "л",
1993
+ "і",
1994
+ "д",
1995
+ "с",
1996
+ "м",
1997
+ "қ",
1998
+ "к",
1999
+ "о",
2000
+ "б",
2001
+ "и",
2002
+ "у",
2003
+ "ғ",
2004
+ "ж",
2005
+ "ң",
2006
+ "з",
2007
+ "ш",
2008
+ "й",
2009
+ "п",
2010
+ "г",
2011
+ "ө",
2012
+ ],
2013
+ }
2014
+
2015
+ LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
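The constants above are consumed elsewhere in the package; as one illustration, here is a minimal sketch of how ENCODING_MARKS can drive BOM/SIG sniffing. The helper name sniff_bom is hypothetical and not part of charset_normalizer's public API; the real package performs this step internally during detection.

from __future__ import annotations

from charset_normalizer.constant import ENCODING_MARKS


def sniff_bom(sequence: bytes) -> tuple[str | None, bytes]:
    """Return (encoding, mark) when the payload starts with a known BOM/SIG."""
    # Dict order matters: the utf_32 marks are listed before utf_16 so that the
    # longer b"\xff\xfe\x00\x00" mark wins over its b"\xff\xfe" prefix.
    for encoding, marks in ENCODING_MARKS.items():
        # Values are either a single bytes mark or a list of candidate marks.
        candidates = marks if isinstance(marks, list) else [marks]
        for mark in candidates:
            if sequence.startswith(mark):
                return encoding, mark
    return None, b""


print(sniff_bom(b"\xef\xbb\xbfhello"))  # ('utf_8', b'\xef\xbb\xbf')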
venv/lib/python3.13/site-packages/charset_normalizer/legacy.py ADDED
@@ -0,0 +1,80 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+ from warnings import warn
5
+
6
+ from .api import from_bytes
7
+ from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE
8
+
9
+ # TODO: remove this check when dropping Python 3.7 support
10
+ if TYPE_CHECKING:
11
+ from typing_extensions import TypedDict
12
+
13
+ class ResultDict(TypedDict):
14
+ encoding: str | None
15
+ language: str
16
+ confidence: float | None
17
+
18
+
19
+ def detect(
20
+ byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
21
+ ) -> ResultDict:
22
+ """
23
+ chardet legacy method
24
+ Detect the encoding of the given byte string. It should be mostly backward-compatible.
25
+ Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
26
+ This function is deprecated and should be used to migrate your project easily, consult the documentation for
27
+ further information. Not planned for removal.
28
+
29
+ :param byte_str: The byte sequence to examine.
30
+ :param should_rename_legacy: Should we rename legacy encodings
31
+ to their more modern equivalents?
32
+ """
33
+ if len(kwargs):
34
+ warn(
35
+ f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
36
+ )
37
+
38
+ if not isinstance(byte_str, (bytearray, bytes)):
39
+ raise TypeError( # pragma: nocover
40
+ f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
41
+ )
42
+
43
+ if isinstance(byte_str, bytearray):
44
+ byte_str = bytes(byte_str)
45
+
46
+ r = from_bytes(byte_str).best()
47
+
48
+ encoding = r.encoding if r is not None else None
49
+ language = r.language if r is not None and r.language != "Unknown" else ""
50
+ confidence = 1.0 - r.chaos if r is not None else None
51
+
52
+ # automatically lower confidence
53
+ # on small byte samples.
54
+ # https://github.com/jawah/charset_normalizer/issues/391
55
+ if (
56
+ confidence is not None
57
+ and confidence >= 0.9
58
+ and encoding
59
+ not in {
60
+ "utf_8",
61
+ "ascii",
62
+ }
63
+ and r.bom is False # type: ignore[union-attr]
64
+ and len(byte_str) < TOO_SMALL_SEQUENCE
65
+ ):
66
+ confidence -= 0.2
67
+
68
+ # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
69
+ # but chardet does return 'utf-8-sig' and it is a valid codec name.
70
+ if r is not None and encoding == "utf_8" and r.bom:
71
+ encoding += "_sig"
72
+
73
+ if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
74
+ encoding = CHARDET_CORRESPONDENCE[encoding]
75
+
76
+ return {
77
+ "encoding": encoding,
78
+ "language": language,
79
+ "confidence": confidence,
80
+ }
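For reference, a short usage sketch of the shim above, assuming charset_normalizer is installed (detect is re-exported at the package top level). The exact encoding, language, and confidence returned depend on the payload and the detector's heuristics, so the values in the comment are indicative only.

from charset_normalizer import detect

# A short cp1252 payload; the drop-in API mirrors chardet.detect().
payload = "Bonjour, où êtes-vous ?".encode("cp1252")
result = detect(payload)
# Indicative shape: {'encoding': 'windows-1252', 'language': 'French', 'confidence': 0.8}
print(result["encoding"], result["language"], result["confidence"])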
venv/lib/python3.13/site-packages/charset_normalizer/md.cpython-313-x86_64-linux-gnu.so ADDED
Binary file (15.9 kB).
 
venv/lib/python3.13/site-packages/charset_normalizer/md.py ADDED
@@ -0,0 +1,635 @@
1
+ from __future__ import annotations
2
+
3
+ from functools import lru_cache
4
+ from logging import getLogger
5
+
6
+ from .constant import (
7
+ COMMON_SAFE_ASCII_CHARACTERS,
8
+ TRACE,
9
+ UNICODE_SECONDARY_RANGE_KEYWORD,
10
+ )
11
+ from .utils import (
12
+ is_accentuated,
13
+ is_arabic,
14
+ is_arabic_isolated_form,
15
+ is_case_variable,
16
+ is_cjk,
17
+ is_emoticon,
18
+ is_hangul,
19
+ is_hiragana,
20
+ is_katakana,
21
+ is_latin,
22
+ is_punctuation,
23
+ is_separator,
24
+ is_symbol,
25
+ is_thai,
26
+ is_unprintable,
27
+ remove_accent,
28
+ unicode_range,
29
+ is_cjk_uncommon,
30
+ )
31
+
32
+
33
+ class MessDetectorPlugin:
34
+ """
35
+ Base abstract class used for mess detection plugins.
36
+ All detectors MUST extend this class and implement the given methods.
37
+ """
38
+
39
+ def eligible(self, character: str) -> bool:
40
+ """
41
+ Determine whether the given character should be fed to this detector.
42
+ """
43
+ raise NotImplementedError # pragma: nocover
44
+
45
+ def feed(self, character: str) -> None:
46
+ """
47
+ The main routine, executed for each eligible character.
48
+ Insert here the logic by which the text would be considered chaotic.
49
+ """
50
+ raise NotImplementedError # pragma: nocover
51
+
52
+ def reset(self) -> None: # pragma: no cover
53
+ """
54
+ Reset the plugin to its initial state.
55
+ """
56
+ raise NotImplementedError
57
+
58
+ @property
59
+ def ratio(self) -> float:
60
+ """
61
+ Compute the chaos ratio based on what your feed() has seen.
62
+ Must NOT be lower than 0.0; there is no upper bound.
63
+ """
64
+ raise NotImplementedError # pragma: nocover
65
+
66
+
67
+ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
68
+ def __init__(self) -> None:
69
+ self._punctuation_count: int = 0
70
+ self._symbol_count: int = 0
71
+ self._character_count: int = 0
72
+
73
+ self._last_printable_char: str | None = None
74
+ self._frenzy_symbol_in_word: bool = False
75
+
76
+ def eligible(self, character: str) -> bool:
77
+ return character.isprintable()
78
+
79
+ def feed(self, character: str) -> None:
80
+ self._character_count += 1
81
+
82
+ if (
83
+ character != self._last_printable_char
84
+ and character not in COMMON_SAFE_ASCII_CHARACTERS
85
+ ):
86
+ if is_punctuation(character):
87
+ self._punctuation_count += 1
88
+ elif (
89
+ character.isdigit() is False
90
+ and is_symbol(character)
91
+ and is_emoticon(character) is False
92
+ ):
93
+ self._symbol_count += 2
94
+
95
+ self._last_printable_char = character
96
+
97
+ def reset(self) -> None: # Abstract
98
+ self._punctuation_count = 0
99
+ self._character_count = 0
100
+ self._symbol_count = 0
101
+
102
+ @property
103
+ def ratio(self) -> float:
104
+ if self._character_count == 0:
105
+ return 0.0
106
+
107
+ ratio_of_punctuation: float = (
108
+ self._punctuation_count + self._symbol_count
109
+ ) / self._character_count
110
+
111
+ return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
112
+
113
+
114
+ class TooManyAccentuatedPlugin(MessDetectorPlugin):
115
+ def __init__(self) -> None:
116
+ self._character_count: int = 0
117
+ self._accentuated_count: int = 0
118
+
119
+ def eligible(self, character: str) -> bool:
120
+ return character.isalpha()
121
+
122
+ def feed(self, character: str) -> None:
123
+ self._character_count += 1
124
+
125
+ if is_accentuated(character):
126
+ self._accentuated_count += 1
127
+
128
+ def reset(self) -> None: # Abstract
129
+ self._character_count = 0
130
+ self._accentuated_count = 0
131
+
132
+ @property
133
+ def ratio(self) -> float:
134
+ if self._character_count < 8:
135
+ return 0.0
136
+
137
+ ratio_of_accentuation: float = self._accentuated_count / self._character_count
138
+ return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
139
+
140
+
141
+ class UnprintablePlugin(MessDetectorPlugin):
142
+ def __init__(self) -> None:
143
+ self._unprintable_count: int = 0
144
+ self._character_count: int = 0
145
+
146
+ def eligible(self, character: str) -> bool:
147
+ return True
148
+
149
+ def feed(self, character: str) -> None:
150
+ if is_unprintable(character):
151
+ self._unprintable_count += 1
152
+ self._character_count += 1
153
+
154
+ def reset(self) -> None: # Abstract
155
+ self._unprintable_count = 0
156
+
157
+ @property
158
+ def ratio(self) -> float:
159
+ if self._character_count == 0:
160
+ return 0.0
161
+
162
+ return (self._unprintable_count * 8) / self._character_count
163
+
164
+
165
+ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
166
+ def __init__(self) -> None:
167
+ self._successive_count: int = 0
168
+ self._character_count: int = 0
169
+
170
+ self._last_latin_character: str | None = None
171
+
172
+ def eligible(self, character: str) -> bool:
173
+ return character.isalpha() and is_latin(character)
174
+
175
+ def feed(self, character: str) -> None:
176
+ self._character_count += 1
177
+ if (
178
+ self._last_latin_character is not None
179
+ and is_accentuated(character)
180
+ and is_accentuated(self._last_latin_character)
181
+ ):
182
+ if character.isupper() and self._last_latin_character.isupper():
183
+ self._successive_count += 1
184
+ # Worse if it's the same char duplicated with a different accent.
185
+ if remove_accent(character) == remove_accent(self._last_latin_character):
186
+ self._successive_count += 1
187
+ self._last_latin_character = character
188
+
189
+ def reset(self) -> None: # Abstract
190
+ self._successive_count = 0
191
+ self._character_count = 0
192
+ self._last_latin_character = None
193
+
194
+ @property
195
+ def ratio(self) -> float:
196
+ if self._character_count == 0:
197
+ return 0.0
198
+
199
+ return (self._successive_count * 2) / self._character_count
200
+
201
+
202
+ class SuspiciousRange(MessDetectorPlugin):
203
+ def __init__(self) -> None:
204
+ self._suspicious_successive_range_count: int = 0
205
+ self._character_count: int = 0
206
+ self._last_printable_seen: str | None = None
207
+
208
+ def eligible(self, character: str) -> bool:
209
+ return character.isprintable()
210
+
211
+ def feed(self, character: str) -> None:
212
+ self._character_count += 1
213
+
214
+ if (
215
+ character.isspace()
216
+ or is_punctuation(character)
217
+ or character in COMMON_SAFE_ASCII_CHARACTERS
218
+ ):
219
+ self._last_printable_seen = None
220
+ return
221
+
222
+ if self._last_printable_seen is None:
223
+ self._last_printable_seen = character
224
+ return
225
+
226
+ unicode_range_a: str | None = unicode_range(self._last_printable_seen)
227
+ unicode_range_b: str | None = unicode_range(character)
228
+
229
+ if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
230
+ self._suspicious_successive_range_count += 1
231
+
232
+ self._last_printable_seen = character
233
+
234
+ def reset(self) -> None: # Abstract
235
+ self._character_count = 0
236
+ self._suspicious_successive_range_count = 0
237
+ self._last_printable_seen = None
238
+
239
+ @property
240
+ def ratio(self) -> float:
241
+ if self._character_count <= 13:
242
+ return 0.0
243
+
244
+ ratio_of_suspicious_range_usage: float = (
245
+ self._suspicious_successive_range_count * 2
246
+ ) / self._character_count
247
+
248
+ return ratio_of_suspicious_range_usage
249
+
250
+
251
+ class SuperWeirdWordPlugin(MessDetectorPlugin):
252
+ def __init__(self) -> None:
253
+ self._word_count: int = 0
254
+ self._bad_word_count: int = 0
255
+ self._foreign_long_count: int = 0
256
+
257
+ self._is_current_word_bad: bool = False
258
+ self._foreign_long_watch: bool = False
259
+
260
+ self._character_count: int = 0
261
+ self._bad_character_count: int = 0
262
+
263
+ self._buffer: str = ""
264
+ self._buffer_accent_count: int = 0
265
+ self._buffer_glyph_count: int = 0
266
+
267
+ def eligible(self, character: str) -> bool:
268
+ return True
269
+
270
+ def feed(self, character: str) -> None:
271
+ if character.isalpha():
272
+ self._buffer += character
273
+ if is_accentuated(character):
274
+ self._buffer_accent_count += 1
275
+ if (
276
+ self._foreign_long_watch is False
277
+ and (is_latin(character) is False or is_accentuated(character))
278
+ and is_cjk(character) is False
279
+ and is_hangul(character) is False
280
+ and is_katakana(character) is False
281
+ and is_hiragana(character) is False
282
+ and is_thai(character) is False
283
+ ):
284
+ self._foreign_long_watch = True
285
+ if (
286
+ is_cjk(character)
287
+ or is_hangul(character)
288
+ or is_katakana(character)
289
+ or is_hiragana(character)
290
+ or is_thai(character)
291
+ ):
292
+ self._buffer_glyph_count += 1
293
+ return
294
+ if not self._buffer:
295
+ return
296
+ if (
297
+ character.isspace() or is_punctuation(character) or is_separator(character)
298
+ ) and self._buffer:
299
+ self._word_count += 1
300
+ buffer_length: int = len(self._buffer)
301
+
302
+ self._character_count += buffer_length
303
+
304
+ if buffer_length >= 4:
305
+ if self._buffer_accent_count / buffer_length >= 0.5:
306
+ self._is_current_word_bad = True
307
+ # Words/buffers ending with an uppercase accentuated letter are so rare,
308
+ # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
309
+ elif (
310
+ is_accentuated(self._buffer[-1])
311
+ and self._buffer[-1].isupper()
312
+ and all(_.isupper() for _ in self._buffer) is False
313
+ ):
314
+ self._foreign_long_count += 1
315
+ self._is_current_word_bad = True
316
+ elif self._buffer_glyph_count == 1:
317
+ self._is_current_word_bad = True
318
+ self._foreign_long_count += 1
319
+ if buffer_length >= 24 and self._foreign_long_watch:
320
+ camel_case_dst = [
321
+ i
322
+ for c, i in zip(self._buffer, range(0, buffer_length))
323
+ if c.isupper()
324
+ ]
325
+ probable_camel_cased: bool = False
326
+
327
+ if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
328
+ probable_camel_cased = True
329
+
330
+ if not probable_camel_cased:
331
+ self._foreign_long_count += 1
332
+ self._is_current_word_bad = True
333
+
334
+ if self._is_current_word_bad:
335
+ self._bad_word_count += 1
336
+ self._bad_character_count += len(self._buffer)
337
+ self._is_current_word_bad = False
338
+
339
+ self._foreign_long_watch = False
340
+ self._buffer = ""
341
+ self._buffer_accent_count = 0
342
+ self._buffer_glyph_count = 0
343
+ elif (
344
+ character not in {"<", ">", "-", "=", "~", "|", "_"}
345
+ and character.isdigit() is False
346
+ and is_symbol(character)
347
+ ):
348
+ self._is_current_word_bad = True
349
+ self._buffer += character
350
+
351
+ def reset(self) -> None: # Abstract
352
+ self._buffer = ""
353
+ self._is_current_word_bad = False
354
+ self._foreign_long_watch = False
355
+ self._bad_word_count = 0
356
+ self._word_count = 0
357
+ self._character_count = 0
358
+ self._bad_character_count = 0
359
+ self._foreign_long_count = 0
360
+
361
+ @property
362
+ def ratio(self) -> float:
363
+ if self._word_count <= 10 and self._foreign_long_count == 0:
364
+ return 0.0
365
+
366
+ return self._bad_character_count / self._character_count
367
+
368
+
369
+ class CjkUncommonPlugin(MessDetectorPlugin):
370
+ """
371
+ Detect messy CJK text that probably means nothing.
372
+ """
373
+
374
+ def __init__(self) -> None:
375
+ self._character_count: int = 0
376
+ self._uncommon_count: int = 0
377
+
378
+ def eligible(self, character: str) -> bool:
379
+ return is_cjk(character)
380
+
381
+ def feed(self, character: str) -> None:
382
+ self._character_count += 1
383
+
384
+ if is_cjk_uncommon(character):
385
+ self._uncommon_count += 1
386
+ return
387
+
388
+ def reset(self) -> None: # Abstract
389
+ self._character_count = 0
390
+ self._uncommon_count = 0
391
+
392
+ @property
393
+ def ratio(self) -> float:
394
+ if self._character_count < 8:
395
+ return 0.0
396
+
397
+ uncommon_form_usage: float = self._uncommon_count / self._character_count
398
+
399
+ # We can be pretty sure it's garbage when uncommon characters are widely
400
+ # used. Otherwise it could just be Traditional Chinese, for example.
401
+ return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0
402
+
403
+
404
+ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
405
+ def __init__(self) -> None:
406
+ self._buf: bool = False
407
+
408
+ self._character_count_since_last_sep: int = 0
409
+
410
+ self._successive_upper_lower_count: int = 0
411
+ self._successive_upper_lower_count_final: int = 0
412
+
413
+ self._character_count: int = 0
414
+
415
+ self._last_alpha_seen: str | None = None
416
+ self._current_ascii_only: bool = True
417
+
418
+ def eligible(self, character: str) -> bool:
419
+ return True
420
+
421
+ def feed(self, character: str) -> None:
422
+ is_concerned = character.isalpha() and is_case_variable(character)
423
+ chunk_sep = is_concerned is False
424
+
425
+ if chunk_sep and self._character_count_since_last_sep > 0:
426
+ if (
427
+ self._character_count_since_last_sep <= 64
428
+ and character.isdigit() is False
429
+ and self._current_ascii_only is False
430
+ ):
431
+ self._successive_upper_lower_count_final += (
432
+ self._successive_upper_lower_count
433
+ )
434
+
435
+ self._successive_upper_lower_count = 0
436
+ self._character_count_since_last_sep = 0
437
+ self._last_alpha_seen = None
438
+ self._buf = False
439
+ self._character_count += 1
440
+ self._current_ascii_only = True
441
+
442
+ return
443
+
444
+ if self._current_ascii_only is True and character.isascii() is False:
445
+ self._current_ascii_only = False
446
+
447
+ if self._last_alpha_seen is not None:
448
+ if (character.isupper() and self._last_alpha_seen.islower()) or (
449
+ character.islower() and self._last_alpha_seen.isupper()
450
+ ):
451
+ if self._buf is True:
452
+ self._successive_upper_lower_count += 2
453
+ self._buf = False
454
+ else:
455
+ self._buf = True
456
+ else:
457
+ self._buf = False
458
+
459
+ self._character_count += 1
460
+ self._character_count_since_last_sep += 1
461
+ self._last_alpha_seen = character
462
+
463
+ def reset(self) -> None: # Abstract
464
+ self._character_count = 0
465
+ self._character_count_since_last_sep = 0
466
+ self._successive_upper_lower_count = 0
467
+ self._successive_upper_lower_count_final = 0
468
+ self._last_alpha_seen = None
469
+ self._buf = False
470
+ self._current_ascii_only = True
471
+
472
+ @property
473
+ def ratio(self) -> float:
474
+ if self._character_count == 0:
475
+ return 0.0
476
+
477
+ return self._successive_upper_lower_count_final / self._character_count
478
+
479
+
480
+ class ArabicIsolatedFormPlugin(MessDetectorPlugin):
481
+ def __init__(self) -> None:
482
+ self._character_count: int = 0
483
+ self._isolated_form_count: int = 0
484
+
485
+ def reset(self) -> None: # Abstract
486
+ self._character_count = 0
487
+ self._isolated_form_count = 0
488
+
489
+ def eligible(self, character: str) -> bool:
490
+ return is_arabic(character)
491
+
492
+ def feed(self, character: str) -> None:
493
+ self._character_count += 1
494
+
495
+ if is_arabic_isolated_form(character):
496
+ self._isolated_form_count += 1
497
+
498
+ @property
499
+ def ratio(self) -> float:
500
+ if self._character_count < 8:
501
+ return 0.0
502
+
503
+ isolated_form_usage: float = self._isolated_form_count / self._character_count
504
+
505
+ return isolated_form_usage
506
+
507
+
508
+ @lru_cache(maxsize=1024)
509
+ def is_suspiciously_successive_range(
510
+ unicode_range_a: str | None, unicode_range_b: str | None
511
+ ) -> bool:
512
+ """
513
+ Determine if two Unicode range seen next to each other can be considered as suspicious.
514
+ """
515
+ if unicode_range_a is None or unicode_range_b is None:
516
+ return True
517
+
518
+ if unicode_range_a == unicode_range_b:
519
+ return False
520
+
521
+ if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
522
+ return False
523
+
524
+ if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
525
+ return False
526
+
527
+ # Latin characters can be accompanied with a combining diacritical mark
528
+ # eg. Vietnamese.
529
+ if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
530
+ "Combining" in unicode_range_a or "Combining" in unicode_range_b
531
+ ):
532
+ return False
533
+
534
+ keywords_range_a, keywords_range_b = (
535
+ unicode_range_a.split(" "),
536
+ unicode_range_b.split(" "),
537
+ )
538
+
539
+ for el in keywords_range_a:
540
+ if el in UNICODE_SECONDARY_RANGE_KEYWORD:
541
+ continue
542
+ if el in keywords_range_b:
543
+ return False
544
+
545
+ # Japanese Exception
546
+ range_a_jp_chars, range_b_jp_chars = (
547
+ unicode_range_a
548
+ in (
549
+ "Hiragana",
550
+ "Katakana",
551
+ ),
552
+ unicode_range_b in ("Hiragana", "Katakana"),
553
+ )
554
+ if (range_a_jp_chars or range_b_jp_chars) and (
555
+ "CJK" in unicode_range_a or "CJK" in unicode_range_b
556
+ ):
557
+ return False
558
+ if range_a_jp_chars and range_b_jp_chars:
559
+ return False
560
+
561
+ if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
562
+ if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
563
+ return False
564
+ if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
565
+ return False
566
+
567
+ # Chinese/Japanese use dedicated range for punctuation and/or separators.
568
+ if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
569
+ unicode_range_a in ["Katakana", "Hiragana"]
570
+ and unicode_range_b in ["Katakana", "Hiragana"]
571
+ ):
572
+ if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
573
+ return False
574
+ if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
575
+ return False
576
+ if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
577
+ return False
578
+
579
+ return True
580
+
581
+
582
+ @lru_cache(maxsize=2048)
583
+ def mess_ratio(
584
+ decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
585
+ ) -> float:
586
+ """
587
+ Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
588
+ """
589
+
590
+ detectors: list[MessDetectorPlugin] = [
591
+ md_class() for md_class in MessDetectorPlugin.__subclasses__()
592
+ ]
593
+
594
+ length: int = len(decoded_sequence) + 1
595
+
596
+ mean_mess_ratio: float = 0.0
597
+
598
+ if length < 512:
599
+ intermediary_mean_mess_ratio_calc: int = 32
600
+ elif length <= 1024:
601
+ intermediary_mean_mess_ratio_calc = 64
602
+ else:
603
+ intermediary_mean_mess_ratio_calc = 128
604
+
605
+ for character, index in zip(decoded_sequence + "\n", range(length)):
606
+ for detector in detectors:
607
+ if detector.eligible(character):
608
+ detector.feed(character)
609
+
610
+ if (
611
+ index > 0 and index % intermediary_mean_mess_ratio_calc == 0
612
+ ) or index == length - 1:
613
+ mean_mess_ratio = sum(dt.ratio for dt in detectors)
614
+
615
+ if mean_mess_ratio >= maximum_threshold:
616
+ break
617
+
618
+ if debug:
619
+ logger = getLogger("charset_normalizer")
620
+
621
+ logger.log(
622
+ TRACE,
623
+ "Mess-detector extended-analysis start. "
624
+ f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
625
+ f"maximum_threshold={maximum_threshold}",
626
+ )
627
+
628
+ if len(decoded_sequence) > 16:
629
+ logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
630
+ logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
631
+
632
+ for dt in detectors:
633
+ logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
634
+
635
+ return round(mean_mess_ratio, 3)
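Taken together, the plugins above drive the two public helpers of this module. As a quick, hedged illustration (an editor's sketch, not part of the committed file), here is how `mess_ratio` and `is_suspiciously_successive_range` behave on ordinary versus damaged input; exact scores depend on the full plugin set and are indicative only:

```python
# Minimal sketch using the helpers defined above (charset_normalizer.md).
# Printed values are indicative only; they depend on the full plugin set.
from charset_normalizer.md import is_suspiciously_successive_range, mess_ratio

print(mess_ratio("This is a clean, ordinary sentence."))  # ~0.0
print(mess_ratio("ÐŸÑ€Ð¸Ð²ÐµÑ‚, Ð¼Ð¸Ñ€!"))               # mojibake typically scores higher

# Adjacent "Basic Latin" + "Cyrillic" is suspicious; Hiragana next to
# Katakana is expected in Japanese text and therefore is not.
print(is_suspiciously_successive_range("Basic Latin", "Cyrillic"))  # True
print(is_suspiciously_successive_range("Hiragana", "Katakana"))     # False
```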
venv/lib/python3.13/site-packages/charset_normalizer/models.py ADDED
@@ -0,0 +1,360 @@
+ from __future__ import annotations
+
+ from encodings.aliases import aliases
+ from hashlib import sha256
+ from json import dumps
+ from re import sub
+ from typing import Any, Iterator, List, Tuple
+
+ from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
+ from .utils import iana_name, is_multi_byte_encoding, unicode_range
+
+
+ class CharsetMatch:
+     def __init__(
+         self,
+         payload: bytes,
+         guessed_encoding: str,
+         mean_mess_ratio: float,
+         has_sig_or_bom: bool,
+         languages: CoherenceMatches,
+         decoded_payload: str | None = None,
+         preemptive_declaration: str | None = None,
+     ):
+         self._payload: bytes = payload
+
+         self._encoding: str = guessed_encoding
+         self._mean_mess_ratio: float = mean_mess_ratio
+         self._languages: CoherenceMatches = languages
+         self._has_sig_or_bom: bool = has_sig_or_bom
+         self._unicode_ranges: list[str] | None = None
+
+         self._leaves: list[CharsetMatch] = []
+         self._mean_coherence_ratio: float = 0.0
+
+         self._output_payload: bytes | None = None
+         self._output_encoding: str | None = None
+
+         self._string: str | None = decoded_payload
+
+         self._preemptive_declaration: str | None = preemptive_declaration
+
+     def __eq__(self, other: object) -> bool:
+         if not isinstance(other, CharsetMatch):
+             if isinstance(other, str):
+                 return iana_name(other) == self.encoding
+             return False
+         return self.encoding == other.encoding and self.fingerprint == other.fingerprint
+
+     def __lt__(self, other: object) -> bool:
+         """
+         Implemented to make sorted() available upon CharsetMatch items.
+         """
+         if not isinstance(other, CharsetMatch):
+             raise ValueError
+
+         chaos_difference: float = abs(self.chaos - other.chaos)
+         coherence_difference: float = abs(self.coherence - other.coherence)
+
+         # Below 1% difference --> Use Coherence
+         if chaos_difference < 0.01 and coherence_difference > 0.02:
+             return self.coherence > other.coherence
+         elif chaos_difference < 0.01 and coherence_difference <= 0.02:
+             # When having a difficult decision, use the result that decoded as many multi-byte as possible.
+             # preserve RAM usage!
+             if len(self._payload) >= TOO_BIG_SEQUENCE:
+                 return self.chaos < other.chaos
+             return self.multi_byte_usage > other.multi_byte_usage
+
+         return self.chaos < other.chaos
+
+     @property
+     def multi_byte_usage(self) -> float:
+         return 1.0 - (len(str(self)) / len(self.raw))
+
+     def __str__(self) -> str:
+         # Lazy Str Loading
+         if self._string is None:
+             self._string = str(self._payload, self._encoding, "strict")
+         return self._string
+
+     def __repr__(self) -> str:
+         return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"
+
+     def add_submatch(self, other: CharsetMatch) -> None:
+         if not isinstance(other, CharsetMatch) or other == self:
+             raise ValueError(
+                 "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
+                     other.__class__
+                 )
+             )
+
+         other._string = None  # Unload RAM usage; dirty trick.
+         self._leaves.append(other)
+
+     @property
+     def encoding(self) -> str:
+         return self._encoding
+
+     @property
+     def encoding_aliases(self) -> list[str]:
+         """
+         Encodings are known by many names; this can help when searching for IBM855 while it is listed as CP855.
+         """
+         also_known_as: list[str] = []
+         for u, p in aliases.items():
+             if self.encoding == u:
+                 also_known_as.append(p)
+             elif self.encoding == p:
+                 also_known_as.append(u)
+         return also_known_as
+
+     @property
+     def bom(self) -> bool:
+         return self._has_sig_or_bom
+
+     @property
+     def byte_order_mark(self) -> bool:
+         return self._has_sig_or_bom
+
+     @property
+     def languages(self) -> list[str]:
+         """
+         Return the complete list of possible languages found in the decoded sequence.
+         Usually not really useful. The returned list may be empty even if the 'language' property returns something != 'Unknown'.
+         """
+         return [e[0] for e in self._languages]
+
+     @property
+     def language(self) -> str:
+         """
+         Most probable language found in the decoded sequence. If none was detected or inferred, the property returns
+         "Unknown".
+         """
+         if not self._languages:
+             # Trying to infer the language based on the given encoding.
+             # It's either English or we should not pronounce ourselves in certain cases.
+             if "ascii" in self.could_be_from_charset:
+                 return "English"
+
+             # doing it there to avoid circular import
+             from charset_normalizer.cd import encoding_languages, mb_encoding_languages
+
+             languages = (
+                 mb_encoding_languages(self.encoding)
+                 if is_multi_byte_encoding(self.encoding)
+                 else encoding_languages(self.encoding)
+             )
+
+             if len(languages) == 0 or "Latin Based" in languages:
+                 return "Unknown"
+
+             return languages[0]
+
+         return self._languages[0][0]
+
+     @property
+     def chaos(self) -> float:
+         return self._mean_mess_ratio
+
+     @property
+     def coherence(self) -> float:
+         if not self._languages:
+             return 0.0
+         return self._languages[0][1]
+
+     @property
+     def percent_chaos(self) -> float:
+         return round(self.chaos * 100, ndigits=3)
+
+     @property
+     def percent_coherence(self) -> float:
+         return round(self.coherence * 100, ndigits=3)
+
+     @property
+     def raw(self) -> bytes:
+         """
+         Original untouched bytes.
+         """
+         return self._payload
+
+     @property
+     def submatch(self) -> list[CharsetMatch]:
+         return self._leaves
+
+     @property
+     def has_submatch(self) -> bool:
+         return len(self._leaves) > 0
+
+     @property
+     def alphabets(self) -> list[str]:
+         if self._unicode_ranges is not None:
+             return self._unicode_ranges
+         # list detected ranges
+         detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
+         # filter and sort
+         self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
+         return self._unicode_ranges
+
+     @property
+     def could_be_from_charset(self) -> list[str]:
+         """
+         The complete list of encodings that output the exact SAME str result and therefore could be the originating
+         encoding.
+         This list does include the encoding available in the 'encoding' property.
+         """
+         return [self._encoding] + [m.encoding for m in self._leaves]
+
+     def output(self, encoding: str = "utf_8") -> bytes:
+         """
+         Method to get the re-encoded bytes payload using the given target encoding. Defaults to UTF-8.
+         Any encoding errors are handled with the encoder's "replace" strategy.
+         """
+         if self._output_encoding is None or self._output_encoding != encoding:
+             self._output_encoding = encoding
+             decoded_string = str(self)
+             if (
+                 self._preemptive_declaration is not None
+                 and self._preemptive_declaration.lower()
+                 not in ["utf-8", "utf8", "utf_8"]
+             ):
+                 patched_header = sub(
+                     RE_POSSIBLE_ENCODING_INDICATION,
+                     lambda m: m.string[m.span()[0] : m.span()[1]].replace(
+                         m.groups()[0],
+                         iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type]
+                     ),
+                     decoded_string[:8192],
+                     count=1,
+                 )
+
+                 decoded_string = patched_header + decoded_string[8192:]
+
+             self._output_payload = decoded_string.encode(encoding, "replace")
+
+         return self._output_payload  # type: ignore
+
+     @property
+     def fingerprint(self) -> str:
+         """
+         Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
+         """
+         return sha256(self.output()).hexdigest()
+
+
+ class CharsetMatches:
+     """
+     Container with every CharsetMatch item, ordered by default from the most probable to the least.
+     Acts like a list(iterable) but does not implement all related methods.
+     """
+
+     def __init__(self, results: list[CharsetMatch] | None = None):
+         self._results: list[CharsetMatch] = sorted(results) if results else []
+
+     def __iter__(self) -> Iterator[CharsetMatch]:
+         yield from self._results
+
+     def __getitem__(self, item: int | str) -> CharsetMatch:
+         """
+         Retrieve a single item either by its position or encoding name (an alias may be used here).
+         Raise KeyError upon invalid index or encoding not present in results.
+         """
+         if isinstance(item, int):
+             return self._results[item]
+         if isinstance(item, str):
+             item = iana_name(item, False)
+             for result in self._results:
+                 if item in result.could_be_from_charset:
+                     return result
+         raise KeyError
+
+     def __len__(self) -> int:
+         return len(self._results)
+
+     def __bool__(self) -> bool:
+         return len(self._results) > 0
+
+     def append(self, item: CharsetMatch) -> None:
+         """
+         Insert a single match. Will be inserted accordingly to preserve sort.
+         Can be inserted as a submatch.
+         """
+         if not isinstance(item, CharsetMatch):
+             raise ValueError(
+                 "Cannot append instance '{}' to CharsetMatches".format(
+                     str(item.__class__)
+                 )
+             )
+         # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
+         if len(item.raw) < TOO_BIG_SEQUENCE:
+             for match in self._results:
+                 if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
+                     match.add_submatch(item)
+                     return
+         self._results.append(item)
+         self._results = sorted(self._results)
+
+     def best(self) -> CharsetMatch | None:
+         """
+         Simply return the first match. Strictly equivalent to matches[0].
+         """
+         if not self._results:
+             return None
+         return self._results[0]
+
+     def first(self) -> CharsetMatch | None:
+         """
+         Redundant method, calls best(). Kept for BC reasons.
+         """
+         return self.best()
+
+
+ CoherenceMatch = Tuple[str, float]
+ CoherenceMatches = List[CoherenceMatch]
+
+
+ class CliDetectionResult:
+     def __init__(
+         self,
+         path: str,
+         encoding: str | None,
+         encoding_aliases: list[str],
+         alternative_encodings: list[str],
+         language: str,
+         alphabets: list[str],
+         has_sig_or_bom: bool,
+         chaos: float,
+         coherence: float,
+         unicode_path: str | None,
+         is_preferred: bool,
+     ):
+         self.path: str = path
+         self.unicode_path: str | None = unicode_path
+         self.encoding: str | None = encoding
+         self.encoding_aliases: list[str] = encoding_aliases
+         self.alternative_encodings: list[str] = alternative_encodings
+         self.language: str = language
+         self.alphabets: list[str] = alphabets
+         self.has_sig_or_bom: bool = has_sig_or_bom
+         self.chaos: float = chaos
+         self.coherence: float = coherence
+         self.is_preferred: bool = is_preferred
+
+     @property
+     def __dict__(self) -> dict[str, Any]:  # type: ignore
+         return {
+             "path": self.path,
+             "encoding": self.encoding,
+             "encoding_aliases": self.encoding_aliases,
+             "alternative_encodings": self.alternative_encodings,
+             "language": self.language,
+             "alphabets": self.alphabets,
+             "has_sig_or_bom": self.has_sig_or_bom,
+             "chaos": self.chaos,
+             "coherence": self.coherence,
+             "unicode_path": self.unicode_path,
+             "is_preferred": self.is_preferred,
+         }
+
+     def to_json(self) -> str:
+         return dumps(self.__dict__, ensure_ascii=True, indent=4)
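A short usage sketch (an editor's illustration, not part of the diff): `from_bytes()` in `api.py`, added earlier in this commit, returns the `CharsetMatches` container above, sorted best-first. The encoding printed below is the expected guess under the stated input, not a guarantee:

```python
# Hedged sketch: consuming CharsetMatch / CharsetMatches via the public API.
from charset_normalizer import from_bytes

payload = "Комар не заборонить пити воду".encode("cp1251")
matches = from_bytes(payload)          # CharsetMatches, sorted best-first

best = matches.best()                  # same as matches[0]; None when empty
if best is not None:
    print(best.encoding)               # likely "cp1251" (or an equivalent alias)
    print(best.could_be_from_charset)  # every encoding yielding the same str
    print(str(best))                   # lazily decoded text
    print(best.output())               # payload re-encoded as UTF-8 bytes
```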
venv/lib/python3.13/site-packages/charset_normalizer/py.typed ADDED
File without changes
venv/lib/python3.13/site-packages/charset_normalizer/utils.py ADDED
@@ -0,0 +1,414 @@
+ from __future__ import annotations
+
+ import importlib
+ import logging
+ import unicodedata
+ from codecs import IncrementalDecoder
+ from encodings.aliases import aliases
+ from functools import lru_cache
+ from re import findall
+ from typing import Generator
+
+ from _multibytecodec import (  # type: ignore[import-not-found,import]
+     MultibyteIncrementalDecoder,
+ )
+
+ from .constant import (
+     ENCODING_MARKS,
+     IANA_SUPPORTED_SIMILAR,
+     RE_POSSIBLE_ENCODING_INDICATION,
+     UNICODE_RANGES_COMBINED,
+     UNICODE_SECONDARY_RANGE_KEYWORD,
+     UTF8_MAXIMAL_ALLOCATION,
+     COMMON_CJK_CHARACTERS,
+ )
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_accentuated(character: str) -> bool:
+     try:
+         description: str = unicodedata.name(character)
+     except ValueError:  # Defensive: unicode database outdated?
+         return False
+     return (
+         "WITH GRAVE" in description
+         or "WITH ACUTE" in description
+         or "WITH CEDILLA" in description
+         or "WITH DIAERESIS" in description
+         or "WITH CIRCUMFLEX" in description
+         or "WITH TILDE" in description
+         or "WITH MACRON" in description
+         or "WITH RING ABOVE" in description
+     )
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def remove_accent(character: str) -> str:
+     decomposed: str = unicodedata.decomposition(character)
+     if not decomposed:
+         return character
+
+     codes: list[str] = decomposed.split(" ")
+
+     return chr(int(codes[0], 16))
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def unicode_range(character: str) -> str | None:
+     """
+     Retrieve the official Unicode range name from a single character.
+     """
+     character_ord: int = ord(character)
+
+     for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
+         if character_ord in ord_range:
+             return range_name
+
+     return None
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_latin(character: str) -> bool:
+     try:
+         description: str = unicodedata.name(character)
+     except ValueError:  # Defensive: unicode database outdated?
+         return False
+     return "LATIN" in description
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_punctuation(character: str) -> bool:
+     character_category: str = unicodedata.category(character)
+
+     if "P" in character_category:
+         return True
+
+     character_range: str | None = unicode_range(character)
+
+     if character_range is None:
+         return False
+
+     return "Punctuation" in character_range
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_symbol(character: str) -> bool:
+     character_category: str = unicodedata.category(character)
+
+     if "S" in character_category or "N" in character_category:
+         return True
+
+     character_range: str | None = unicode_range(character)
+
+     if character_range is None:
+         return False
+
+     return "Forms" in character_range and character_category != "Lo"
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_emoticon(character: str) -> bool:
+     character_range: str | None = unicode_range(character)
+
+     if character_range is None:
+         return False
+
+     return "Emoticons" in character_range or "Pictographs" in character_range
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_separator(character: str) -> bool:
+     if character.isspace() or character in {"|", "+", "<", ">"}:
+         return True
+
+     character_category: str = unicodedata.category(character)
+
+     return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_case_variable(character: str) -> bool:
+     return character.islower() != character.isupper()
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_cjk(character: str) -> bool:
+     try:
+         character_name = unicodedata.name(character)
+     except ValueError:  # Defensive: unicode database outdated?
+         return False
+
+     return "CJK" in character_name
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_hiragana(character: str) -> bool:
+     try:
+         character_name = unicodedata.name(character)
+     except ValueError:  # Defensive: unicode database outdated?
+         return False
+
+     return "HIRAGANA" in character_name
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_katakana(character: str) -> bool:
+     try:
+         character_name = unicodedata.name(character)
+     except ValueError:  # Defensive: unicode database outdated?
+         return False
+
+     return "KATAKANA" in character_name
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_hangul(character: str) -> bool:
+     try:
+         character_name = unicodedata.name(character)
+     except ValueError:  # Defensive: unicode database outdated?
+         return False
+
+     return "HANGUL" in character_name
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_thai(character: str) -> bool:
+     try:
+         character_name = unicodedata.name(character)
+     except ValueError:  # Defensive: unicode database outdated?
+         return False
+
+     return "THAI" in character_name
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_arabic(character: str) -> bool:
+     try:
+         character_name = unicodedata.name(character)
+     except ValueError:  # Defensive: unicode database outdated?
+         return False
+
+     return "ARABIC" in character_name
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_arabic_isolated_form(character: str) -> bool:
+     try:
+         character_name = unicodedata.name(character)
+     except ValueError:  # Defensive: unicode database outdated?
+         return False
+
+     return "ARABIC" in character_name and "ISOLATED FORM" in character_name
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_cjk_uncommon(character: str) -> bool:
+     return character not in COMMON_CJK_CHARACTERS
+
+
+ @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
+ def is_unicode_range_secondary(range_name: str) -> bool:
+     return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
+
+
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+ def is_unprintable(character: str) -> bool:
+     return (
+         character.isspace() is False  # includes \n \t \r \v
+         and character.isprintable() is False
+         and character != "\x1a"  # Why? It's the ASCII substitute character.
+         and character != "\ufeff"  # bug discovered in Python,
+         # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
+     )
+
+
+ def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
+     """
+     Extract, using an ASCII-only decoder, any specified encoding in the first n bytes.
+     """
+     if not isinstance(sequence, bytes):
+         raise TypeError
+
+     seq_len: int = len(sequence)
+
+     results: list[str] = findall(
+         RE_POSSIBLE_ENCODING_INDICATION,
+         sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
+     )
+
+     if len(results) == 0:
+         return None
+
+     for specified_encoding in results:
+         specified_encoding = specified_encoding.lower().replace("-", "_")
+
+         encoding_alias: str
+         encoding_iana: str
+
+         for encoding_alias, encoding_iana in aliases.items():
+             if encoding_alias == specified_encoding:
+                 return encoding_iana
+             if encoding_iana == specified_encoding:
+                 return encoding_iana
+
+     return None
+
+
+ @lru_cache(maxsize=128)
+ def is_multi_byte_encoding(name: str) -> bool:
+     """
+     Verify whether a specific encoding is a multi-byte one, based on its IANA name.
+     """
+     return name in {
+         "utf_8",
+         "utf_8_sig",
+         "utf_16",
+         "utf_16_be",
+         "utf_16_le",
+         "utf_32",
+         "utf_32_le",
+         "utf_32_be",
+         "utf_7",
+     } or issubclass(
+         importlib.import_module(f"encodings.{name}").IncrementalDecoder,
+         MultibyteIncrementalDecoder,
+     )
+
+
+ def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
+     """
+     Identify and extract a SIG/BOM in a given sequence.
+     """
+
+     for iana_encoding in ENCODING_MARKS:
+         marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
+
+         if isinstance(marks, bytes):
+             marks = [marks]
+
+         for mark in marks:
+             if sequence.startswith(mark):
+                 return iana_encoding, mark
+
+     return None, b""
+
+
+ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
+     return iana_encoding not in {"utf_16", "utf_32"}
+
+
+ def iana_name(cp_name: str, strict: bool = True) -> str:
+     """Returns the Python-normalized encoding name (not the official IANA name)."""
+     cp_name = cp_name.lower().replace("-", "_")
+
+     encoding_alias: str
+     encoding_iana: str
+
+     for encoding_alias, encoding_iana in aliases.items():
+         if cp_name in [encoding_alias, encoding_iana]:
+             return encoding_iana
+
+     if strict:
+         raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
+
+     return cp_name
+
+
+ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
+     if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
+         return 0.0
+
+     decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
+     decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
+
+     id_a: IncrementalDecoder = decoder_a(errors="ignore")
+     id_b: IncrementalDecoder = decoder_b(errors="ignore")
+
+     character_match_count: int = 0
+
+     for i in range(255):
+         to_be_decoded: bytes = bytes([i])
+         if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
+             character_match_count += 1
+
+     return character_match_count / 254
+
+
+ def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
+     """
+     Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR dict was generated using
+     the function cp_similarity.
+     """
+     return (
+         iana_name_a in IANA_SUPPORTED_SIMILAR
+         and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
+     )
+
+
+ def set_logging_handler(
+     name: str = "charset_normalizer",
+     level: int = logging.INFO,
+     format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
+ ) -> None:
+     logger = logging.getLogger(name)
+     logger.setLevel(level)
+
+     handler = logging.StreamHandler()
+     handler.setFormatter(logging.Formatter(format_string))
+     logger.addHandler(handler)
+
+
+ def cut_sequence_chunks(
+     sequences: bytes,
+     encoding_iana: str,
+     offsets: range,
+     chunk_size: int,
+     bom_or_sig_available: bool,
+     strip_sig_or_bom: bool,
+     sig_payload: bytes,
+     is_multi_byte_decoder: bool,
+     decoded_payload: str | None = None,
+ ) -> Generator[str, None, None]:
+     if decoded_payload and is_multi_byte_decoder is False:
+         for i in offsets:
+             chunk = decoded_payload[i : i + chunk_size]
+             if not chunk:
+                 break
+             yield chunk
+     else:
+         for i in offsets:
+             chunk_end = i + chunk_size
+             if chunk_end > len(sequences) + 8:
+                 continue
+
+             cut_sequence = sequences[i : i + chunk_size]
+
+             if bom_or_sig_available and strip_sig_or_bom is False:
+                 cut_sequence = sig_payload + cut_sequence
+
+             chunk = cut_sequence.decode(
+                 encoding_iana,
+                 errors="ignore" if is_multi_byte_decoder else "strict",
+             )
+
+             # multi-byte bad cutting detector and adjustment
+             # not the cleanest way to perform that fix but clever enough for now.
+             if is_multi_byte_decoder and i > 0:
+                 chunk_partial_size_chk: int = min(chunk_size, 16)
+
+                 if (
+                     decoded_payload
+                     and chunk[:chunk_partial_size_chk] not in decoded_payload
+                 ):
+                     for j in range(i, i - 4, -1):
+                         cut_sequence = sequences[j:chunk_end]
+
+                         if bom_or_sig_available and strip_sig_or_bom is False:
+                             cut_sequence = sig_payload + cut_sequence
+
+                         chunk = cut_sequence.decode(encoding_iana, errors="ignore")
+
+                         if chunk[:chunk_partial_size_chk] in decoded_payload:
+                             break
+
+             yield chunk
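Several of these helpers are directly exercisable; a minimal sketch (an editor's illustration, not part of the committed file) showing a few of them on concrete inputs:

```python
# Hedged sketch exercising helpers defined above (charset_normalizer.utils).
from charset_normalizer.utils import (
    iana_name,
    identify_sig_or_bom,
    is_accentuated,
    is_multi_byte_encoding,
)

print(identify_sig_or_bom(b"\xef\xbb\xbfhello"))  # ('utf_8', b'\xef\xbb\xbf')
print(iana_name("UTF-8"))                         # 'utf_8' (Python-normalized)
print(is_multi_byte_encoding("utf_8"))            # True
print(is_accentuated("é"))                        # True ("WITH ACUTE" in its name)
```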
venv/lib/python3.13/site-packages/charset_normalizer/version.py ADDED
@@ -0,0 +1,8 @@
+ """
+ Expose version
+ """
+
+ from __future__ import annotations
+
+ __version__ = "3.4.4"
+ VERSION = __version__.split(".")
venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
+ pip
venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/METADATA ADDED
@@ -0,0 +1,42 @@
+ Metadata-Version: 2.4
+ Name: filelock
+ Version: 3.20.0
+ Summary: A platform independent file lock.
+ Project-URL: Documentation, https://py-filelock.readthedocs.io
+ Project-URL: Homepage, https://github.com/tox-dev/py-filelock
+ Project-URL: Source, https://github.com/tox-dev/py-filelock
+ Project-URL: Tracker, https://github.com/tox-dev/py-filelock/issues
+ Maintainer-email: Bernát Gábor <gaborjbernat@gmail.com>
+ License-Expression: Unlicense
+ License-File: LICENSE
+ Keywords: application,cache,directory,log,user
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: The Unlicense (Unlicense)
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Classifier: Topic :: Internet
+ Classifier: Topic :: Software Development :: Libraries
+ Classifier: Topic :: System
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+
+ # filelock
+
+ [![PyPI](https://img.shields.io/pypi/v/filelock)](https://pypi.org/project/filelock/)
+ [![Supported Python
+ versions](https://img.shields.io/pypi/pyversions/filelock.svg)](https://pypi.org/project/filelock/)
+ [![Documentation
+ status](https://readthedocs.org/projects/py-filelock/badge/?version=latest)](https://py-filelock.readthedocs.io/en/latest/?badge=latest)
+ [![Code style:
+ black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+ [![Downloads](https://static.pepy.tech/badge/filelock/month)](https://pepy.tech/project/filelock)
+ [![check](https://github.com/tox-dev/py-filelock/actions/workflows/check.yaml/badge.svg)](https://github.com/tox-dev/py-filelock/actions/workflows/check.yaml)
+
+ For more information, check out the [official documentation](https://py-filelock.readthedocs.io/en/latest/index.html).
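The badge-only README gives little away; per the package summary above, filelock's core export is a single `FileLock` class used as a context manager. A minimal hedged sketch (the file names are illustrative, not from the package):

```python
# Hedged sketch of filelock's main export; "app.lock"/"app.log" are illustrative.
from filelock import FileLock

lock = FileLock("app.lock", timeout=10)  # raises filelock.Timeout after 10s
with lock:                               # exclusive across processes
    with open("app.log", "a") as f:
        f.write("only one process writes at a time\n")
```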
venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/RECORD ADDED
@@ -0,0 +1,24 @@
+ filelock-3.20.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+ filelock-3.20.0.dist-info/METADATA,sha256=gIghqdcbGNywxw52pN02_a9OxFqzhjA8v-9GsDWtNog,2110
+ filelock-3.20.0.dist-info/RECORD,,
+ filelock-3.20.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ filelock-3.20.0.dist-info/licenses/LICENSE,sha256=iNm062BXnBkew5HKBMFhMFctfu3EqG2qWL8oxuFMm80,1210
+ filelock/__init__.py,sha256=_t_-OAGXo_qyPa9lNQ1YnzVYEvSW3I0onPqzpomsVVg,1769
+ filelock/__pycache__/__init__.cpython-313.pyc,,
+ filelock/__pycache__/_api.cpython-313.pyc,,
+ filelock/__pycache__/_error.cpython-313.pyc,,
+ filelock/__pycache__/_soft.cpython-313.pyc,,
+ filelock/__pycache__/_unix.cpython-313.pyc,,
+ filelock/__pycache__/_util.cpython-313.pyc,,
+ filelock/__pycache__/_windows.cpython-313.pyc,,
+ filelock/__pycache__/asyncio.cpython-313.pyc,,
+ filelock/__pycache__/version.cpython-313.pyc,,
+ filelock/_api.py,sha256=2aATBeJ3-jtMj5OSm7EE539iNaTBsf13KXtcBMoi8oM,14545
+ filelock/_error.py,sha256=-5jMcjTu60YAvAO1UbqDD1GIEjVkwr8xCFwDBtMeYDg,787
+ filelock/_soft.py,sha256=haqtc_TB_KJbYv2a8iuEAclKuM4fMG1vTcp28sK919c,1711
+ filelock/_unix.py,sha256=eGOs4gDgZ-5fGnJUz-OkJDeZkAMzgvYcD8hVD6XH7e4,2351
+ filelock/_util.py,sha256=QHBoNFIYfbAThhotH3Q8E2acFc84wpG49-T-uu017ZE,1715
+ filelock/_windows.py,sha256=8k4XIBl_zZVfGC2gz0kEr8DZBvpNa8wdU9qeM1YrBb8,2179
+ filelock/asyncio.py,sha256=dSLe6XZSECFOgsVpcQUSh5Y5zAHxHGPu_tfpPX9I45k,12514
+ filelock/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ filelock/version.py,sha256=AW5MeEjK4TaQWWJrGb_AlBw8PlmFoIcn7GodG_AVSOM,706
venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
+ pip
venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/METADATA ADDED
@@ -0,0 +1,87 @@
+ Metadata-Version: 2.4
+ Name: hf-xet
+ Version: 1.2.0
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Programming Language :: Rust
+ Classifier: Programming Language :: Python :: Implementation :: CPython
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Classifier: Programming Language :: Python :: Free Threading
+ Classifier: Programming Language :: Python :: Free Threading :: 2 - Beta
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Requires-Dist: pytest ; extra == 'tests'
+ Provides-Extra: tests
+ License-File: LICENSE
+ Summary: Fast transfer of large files with the Hugging Face Hub.
+ Maintainer-email: Rajat Arya <rajat@rajatarya.com>, Jared Sulzdorf <j.sulzdorf@gmail.com>, Di Xiao <di@huggingface.co>, Assaf Vayner <assaf@huggingface.co>, Hoyt Koepke <hoytak@gmail.com>
+ License-Expression: Apache-2.0
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
+ Project-URL: Homepage, https://github.com/huggingface/xet-core
+ Project-URL: Documentation, https://huggingface.co/docs/hub/en/storage-backends#using-xet-storage
+ Project-URL: Issues, https://github.com/huggingface/xet-core/issues
+ Project-URL: Repository, https://github.com/huggingface/xet-core.git
+
+ <!---
+ Copyright 2024 The HuggingFace Team. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+ <p align="center">
+ <a href="https://github.com/huggingface/xet-core/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/github/license/huggingface/xet-core.svg?color=blue"></a>
+ <a href="https://github.com/huggingface/xet-core/releases"><img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/xet-core.svg"></a>
+ <a href="https://github.com/huggingface/xet-core/blob/main/CODE_OF_CONDUCT.md"><img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg"></a>
+ </p>
+
+ <h3 align="center">
+ <p>🤗 hf-xet - xet client tech, used in <a target="_blank" href="https://github.com/huggingface/huggingface_hub/">huggingface_hub</a></p>
+ </h3>
+
+ ## Welcome
+
+ `hf-xet` enables `huggingface_hub` to utilize xet storage for uploading and downloading to HF Hub. Xet storage provides chunk-based deduplication, efficient storage/retrieval with local disk caching, and backwards compatibility with Git LFS. This library is not meant to be used directly, and is instead intended to be used from [huggingface_hub](https://pypi.org/project/huggingface-hub).
+
+ ## Key features
+
+ ♻ **chunk-based deduplication implementation**: avoid transferring and storing chunks that are shared across binary files (models, datasets, etc).
+
+ 🤗 **Python bindings**: bindings for the [huggingface_hub](https://github.com/huggingface/huggingface_hub/) package.
+
+ ↔ **network communications**: concurrent communication to HF Hub Xet backend services (CAS).
+
+ 🔖 **local disk caching**: chunk-based cache that sits alongside the existing [huggingface_hub disk cache](https://huggingface.co/docs/huggingface_hub/guides/manage-cache).
+
+ ## Installation
+
+ Install the `hf_xet` package with [pip](https://pypi.org/project/hf-xet/):
+
+ ```bash
+ pip install hf_xet
+ ```
+
+ ## Quick Start
+
+ `hf_xet` is not intended to be run independently, as it is expected to be used from `huggingface_hub`; to get started with `huggingface_hub`, check out the documentation [here](https://hf.co/docs/huggingface_hub).
+
+ ## Contributions (feature requests, bugs, etc.) are encouraged & appreciated 💙💚💛💜🧡❤️
+
+ Please join us in making hf-xet better. We value everyone's contributions. Code is not the only way to help. Answering questions, helping each other, improving documentation, and filing issues all help immensely. If you are interested in contributing (please do!), check out the [contribution guide](https://github.com/huggingface/xet-core/blob/main/CONTRIBUTING.md) for this repository.
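Since `hf-xet` is consumed through `huggingface_hub` rather than directly, the practical entry point looks like any ordinary Hub download. A hedged sketch (the repo id is illustrative; Xet-backed repositories use this package transparently):

```python
# Hedged sketch: hf-xet is engaged automatically by huggingface_hub when a
# repository is Xet-backed; nothing hf_xet-specific appears in user code.
from huggingface_hub import hf_hub_download

path = hf_hub_download(repo_id="gpt2", filename="config.json")
print(path)  # local path inside the huggingface_hub cache
```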
venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+ hf_xet-1.2.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+ hf_xet-1.2.0.dist-info/METADATA,sha256=U-3J7DnI-UycsH-OPV_q2_s3jhtJSkQYifQ03yS9ie8,4910
+ hf_xet-1.2.0.dist-info/RECORD,,
+ hf_xet-1.2.0.dist-info/WHEEL,sha256=W1f4mZCUZH4n5LoWwHgwGsB1zJCLLADdZ7x6Gd7Z8X8,127
+ hf_xet-1.2.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ hf_xet/__init__.py,sha256=E8UDdyQ8glZ_nve9hHEf22bPang8-RKx4VuApXYeQUo,107
+ hf_xet/__pycache__/__init__.cpython-313.pyc,,
+ hf_xet/hf_xet.abi3.so,sha256=vddURwHuQEUiJXuQlm1NZ47kOW5ck53KFdD32s2mDmY,8310504
venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: maturin (1.9.6)
+ Root-Is-Purelib: false
+ Tag: cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64
venv/lib/python3.13/site-packages/huggingface_hub/__init__.py ADDED
@@ -0,0 +1,1554 @@
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # ***********
+ # `huggingface_hub` init has 2 modes:
+ # - Normal usage:
+ # If imported to use it, all modules and functions are lazy-loaded. This means
+ # they exist at top level in module but are imported only the first time they are
+ # used. This way, `from huggingface_hub import something` will import `something`
+ # quickly without the hassle of importing all the features from `huggingface_hub`.
+ # - Static check:
+ # If statically analyzed, all modules and functions are loaded normally. This way
+ # static typing check works properly as well as autocomplete in text editors and
+ # IDEs.
+ #
+ # The static model imports are done inside the `if TYPE_CHECKING:` statement at
+ # the bottom of this file. Since module/functions imports are duplicated, it is
+ # mandatory to make sure to add them twice when adding one. This is checked in the
+ # `make quality` command.
+ #
+ # To update the static imports, please run the following command and commit the changes.
+ # ```
+ # # Use script
+ # python utils/check_static_imports.py --update-file
+ #
+ # # Or run style on codebase
+ # make style
+ # ```
+ #
+ # ***********
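The lazy mode described in the comment above boils down to PEP 562's module-level `__getattr__`: attributes are resolved to real imports on first access. A minimal, hedged sketch (an editor's illustration using a truncated excerpt of the mapping, not the file's actual vendored loader):

```python
# Editor's sketch (not part of the file): lazy attribute resolution via PEP 562.
import importlib
from typing import Any

_SUBMOD_ATTRS = {"_login": ["login", "logout"]}  # excerpt for illustration
_ATTR_TO_MODULE = {attr: mod for mod, attrs in _SUBMOD_ATTRS.items() for attr in attrs}

def __getattr__(name: str) -> Any:
    # Import the owning submodule only when the attribute is first requested.
    if name in _ATTR_TO_MODULE:
        module = importlib.import_module(f"huggingface_hub.{_ATTR_TO_MODULE[name]}")
        return getattr(module, name)
    raise AttributeError(f"module 'huggingface_hub' has no attribute {name!r}")
```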
+ # Lazy loader vendored from https://github.com/scientific-python/lazy_loader
+ import importlib
+ import os
+ import sys
+ from typing import TYPE_CHECKING
+
+
+ __version__ = "0.36.0"
+
+ # Alphabetical order of definitions is ensured in tests
+ # WARNING: any comment added in this dictionary definition will be lost when
+ # re-generating the file !
+ _SUBMOD_ATTRS = {
+     "_commit_scheduler": [
+         "CommitScheduler",
+     ],
+     "_inference_endpoints": [
+         "InferenceEndpoint",
+         "InferenceEndpointError",
+         "InferenceEndpointStatus",
+         "InferenceEndpointTimeoutError",
+         "InferenceEndpointType",
+     ],
+     "_jobs_api": [
+         "JobInfo",
+         "JobOwner",
+         "JobStage",
+         "JobStatus",
+     ],
+     "_login": [
+         "auth_list",
+         "auth_switch",
+         "interpreter_login",
+         "login",
+         "logout",
+         "notebook_login",
+     ],
+     "_oauth": [
+         "OAuthInfo",
+         "OAuthOrgInfo",
+         "OAuthUserInfo",
+         "attach_huggingface_oauth",
+         "parse_huggingface_oauth",
+     ],
+     "_snapshot_download": [
+         "snapshot_download",
+     ],
+     "_space_api": [
+         "SpaceHardware",
+         "SpaceRuntime",
+         "SpaceStage",
+         "SpaceStorage",
+         "SpaceVariable",
+     ],
+     "_tensorboard_logger": [
+         "HFSummaryWriter",
+     ],
+     "_webhooks_payload": [
+         "WebhookPayload",
+         "WebhookPayloadComment",
+         "WebhookPayloadDiscussion",
+         "WebhookPayloadDiscussionChanges",
+         "WebhookPayloadEvent",
+         "WebhookPayloadMovedTo",
+         "WebhookPayloadRepo",
+         "WebhookPayloadUrl",
+         "WebhookPayloadWebhook",
+     ],
+     "_webhooks_server": [
+         "WebhooksServer",
+         "webhook_endpoint",
+     ],
+     "community": [
+         "Discussion",
+         "DiscussionComment",
+         "DiscussionCommit",
+         "DiscussionEvent",
+         "DiscussionStatusChange",
+         "DiscussionTitleChange",
+         "DiscussionWithDetails",
+     ],
+     "constants": [
+         "CONFIG_NAME",
+         "FLAX_WEIGHTS_NAME",
+         "HUGGINGFACE_CO_URL_HOME",
+         "HUGGINGFACE_CO_URL_TEMPLATE",
+         "PYTORCH_WEIGHTS_NAME",
+         "REPO_TYPE_DATASET",
+         "REPO_TYPE_MODEL",
+         "REPO_TYPE_SPACE",
+         "TF2_WEIGHTS_NAME",
+         "TF_WEIGHTS_NAME",
+     ],
+     "fastai_utils": [
+         "_save_pretrained_fastai",
+         "from_pretrained_fastai",
+         "push_to_hub_fastai",
+     ],
+     "file_download": [
+         "HfFileMetadata",
+         "_CACHED_NO_EXIST",
+         "get_hf_file_metadata",
+         "hf_hub_download",
+         "hf_hub_url",
+         "try_to_load_from_cache",
+     ],
+     "hf_api": [
+         "Collection",
+         "CollectionItem",
+         "CommitInfo",
+         "CommitOperation",
+         "CommitOperationAdd",
+         "CommitOperationCopy",
+         "CommitOperationDelete",
+         "DatasetInfo",
+         "GitCommitInfo",
+         "GitRefInfo",
+         "GitRefs",
+         "HfApi",
+         "ModelInfo",
+         "Organization",
+         "RepoUrl",
+         "SpaceInfo",
+         "User",
+         "UserLikes",
+         "WebhookInfo",
+         "WebhookWatchedItem",
+         "accept_access_request",
+         "add_collection_item",
+         "add_space_secret",
+         "add_space_variable",
+         "auth_check",
+         "cancel_access_request",
+         "cancel_job",
+         "change_discussion_status",
+         "comment_discussion",
+         "create_branch",
+         "create_collection",
+         "create_commit",
+         "create_discussion",
+         "create_inference_endpoint",
+         "create_inference_endpoint_from_catalog",
+         "create_pull_request",
+         "create_repo",
+         "create_scheduled_job",
+         "create_scheduled_uv_job",
+         "create_tag",
+         "create_webhook",
+         "dataset_info",
+         "delete_branch",
+         "delete_collection",
+         "delete_collection_item",
+         "delete_file",
+         "delete_folder",
+         "delete_inference_endpoint",
+         "delete_repo",
+         "delete_scheduled_job",
+         "delete_space_secret",
+         "delete_space_storage",
+         "delete_space_variable",
+         "delete_tag",
+         "delete_webhook",
+         "disable_webhook",
+         "duplicate_space",
+         "edit_discussion_comment",
+         "enable_webhook",
+         "fetch_job_logs",
+         "file_exists",
+         "get_collection",
+         "get_dataset_tags",
+         "get_discussion_details",
+         "get_full_repo_name",
+         "get_inference_endpoint",
+         "get_model_tags",
+         "get_organization_overview",
+         "get_paths_info",
+         "get_repo_discussions",
+         "get_safetensors_metadata",
+         "get_space_runtime",
+         "get_space_variables",
+         "get_token_permission",
+         "get_user_overview",
+         "get_webhook",
+         "grant_access",
+         "inspect_job",
+         "inspect_scheduled_job",
+         "list_accepted_access_requests",
+         "list_collections",
+         "list_datasets",
+         "list_inference_catalog",
+         "list_inference_endpoints",
+         "list_jobs",
+         "list_lfs_files",
+         "list_liked_repos",
+         "list_models",
+         "list_organization_members",
+         "list_papers",
+         "list_pending_access_requests",
+         "list_rejected_access_requests",
+         "list_repo_commits",
+         "list_repo_files",
+         "list_repo_likers",
+         "list_repo_refs",
+         "list_repo_tree",
+         "list_spaces",
+         "list_user_followers",
+         "list_user_following",
+         "list_webhooks",
+         "merge_pull_request",
+         "model_info",
+         "move_repo",
+         "paper_info",
+         "parse_safetensors_file_metadata",
+         "pause_inference_endpoint",
+         "pause_space",
+         "permanently_delete_lfs_files",
+         "preupload_lfs_files",
+         "reject_access_request",
+         "rename_discussion",
+         "repo_exists",
+         "repo_info",
+         "repo_type_and_id_from_hf_id",
+         "request_space_hardware",
+         "request_space_storage",
+         "restart_space",
+         "resume_inference_endpoint",
+         "resume_scheduled_job",
+         "revision_exists",
+         "run_as_future",
+         "run_job",
+         "run_uv_job",
+         "scale_to_zero_inference_endpoint",
+         "set_space_sleep_time",
+         "space_info",
+         "super_squash_history",
+         "suspend_scheduled_job",
+         "unlike",
+         "update_collection_item",
+         "update_collection_metadata",
+         "update_inference_endpoint",
+         "update_repo_settings",
+         "update_repo_visibility",
+         "update_webhook",
+         "upload_file",
+         "upload_folder",
+         "upload_large_folder",
+         "whoami",
+     ],
+     "hf_file_system": [
+         "HfFileSystem",
+         "HfFileSystemFile",
+         "HfFileSystemResolvedPath",
+         "HfFileSystemStreamFile",
+     ],
+     "hub_mixin": [
+         "ModelHubMixin",
+         "PyTorchModelHubMixin",
+     ],
+     "inference._client": [
+         "InferenceClient",
+         "InferenceTimeoutError",
+     ],
+     "inference._generated._async_client": [
+         "AsyncInferenceClient",
+     ],
+     "inference._generated.types": [
+         "AudioClassificationInput",
+         "AudioClassificationOutputElement",
+         "AudioClassificationOutputTransform",
+         "AudioClassificationParameters",
+         "AudioToAudioInput",
+         "AudioToAudioOutputElement",
+         "AutomaticSpeechRecognitionEarlyStoppingEnum",
+         "AutomaticSpeechRecognitionGenerationParameters",
+         "AutomaticSpeechRecognitionInput",
+         "AutomaticSpeechRecognitionOutput",
+         "AutomaticSpeechRecognitionOutputChunk",
+         "AutomaticSpeechRecognitionParameters",
+         "ChatCompletionInput",
+         "ChatCompletionInputFunctionDefinition",
+         "ChatCompletionInputFunctionName",
+         "ChatCompletionInputGrammarType",
+         "ChatCompletionInputJSONSchema",
+         "ChatCompletionInputMessage",
+         "ChatCompletionInputMessageChunk",
+         "ChatCompletionInputMessageChunkType",
+         "ChatCompletionInputResponseFormatJSONObject",
+         "ChatCompletionInputResponseFormatJSONSchema",
+         "ChatCompletionInputResponseFormatText",
+         "ChatCompletionInputStreamOptions",
+         "ChatCompletionInputTool",
+         "ChatCompletionInputToolCall",
+         "ChatCompletionInputToolChoiceClass",
+         "ChatCompletionInputToolChoiceEnum",
+         "ChatCompletionInputURL",
+         "ChatCompletionOutput",
+         "ChatCompletionOutputComplete",
+         "ChatCompletionOutputFunctionDefinition",
+         "ChatCompletionOutputLogprob",
+         "ChatCompletionOutputLogprobs",
+         "ChatCompletionOutputMessage",
+         "ChatCompletionOutputToolCall",
+         "ChatCompletionOutputTopLogprob",
+         "ChatCompletionOutputUsage",
+         "ChatCompletionStreamOutput",
+         "ChatCompletionStreamOutputChoice",
+         "ChatCompletionStreamOutputDelta",
+         "ChatCompletionStreamOutputDeltaToolCall",
+         "ChatCompletionStreamOutputFunction",
+         "ChatCompletionStreamOutputLogprob",
+         "ChatCompletionStreamOutputLogprobs",
+         "ChatCompletionStreamOutputTopLogprob",
+         "ChatCompletionStreamOutputUsage",
+         "DepthEstimationInput",
+         "DepthEstimationOutput",
+         "DocumentQuestionAnsweringInput",
+         "DocumentQuestionAnsweringInputData",
+         "DocumentQuestionAnsweringOutputElement",
+         "DocumentQuestionAnsweringParameters",
+         "FeatureExtractionInput",
+         "FeatureExtractionInputTruncationDirection",
+         "FillMaskInput",
+         "FillMaskOutputElement",
+         "FillMaskParameters",
+         "ImageClassificationInput",
+         "ImageClassificationOutputElement",
+         "ImageClassificationOutputTransform",
+         "ImageClassificationParameters",
+         "ImageSegmentationInput",
+         "ImageSegmentationOutputElement",
+         "ImageSegmentationParameters",
+         "ImageSegmentationSubtask",
+         "ImageToImageInput",
+         "ImageToImageOutput",
+         "ImageToImageParameters",
+         "ImageToImageTargetSize",
+         "ImageToTextEarlyStoppingEnum",
+         "ImageToTextGenerationParameters",
+         "ImageToTextInput",
+         "ImageToTextOutput",
+         "ImageToTextParameters",
+         "ImageToVideoInput",
+         "ImageToVideoOutput",
+         "ImageToVideoParameters",
+         "ImageToVideoTargetSize",
+         "ObjectDetectionBoundingBox",
+         "ObjectDetectionInput",
+         "ObjectDetectionOutputElement",
+         "ObjectDetectionParameters",
+         "Padding",
+         "QuestionAnsweringInput",
+         "QuestionAnsweringInputData",
+         "QuestionAnsweringOutputElement",
+         "QuestionAnsweringParameters",
+         "SentenceSimilarityInput",
+         "SentenceSimilarityInputData",
+         "SummarizationInput",
+         "SummarizationOutput",
+         "SummarizationParameters",
+         "SummarizationTruncationStrategy",
+         "TableQuestionAnsweringInput",
+         "TableQuestionAnsweringInputData",
+         "TableQuestionAnsweringOutputElement",
+         "TableQuestionAnsweringParameters",
+         "Text2TextGenerationInput",
+         "Text2TextGenerationOutput",
+         "Text2TextGenerationParameters",
+         "Text2TextGenerationTruncationStrategy",
+         "TextClassificationInput",
+         "TextClassificationOutputElement",
+         "TextClassificationOutputTransform",
+         "TextClassificationParameters",
+         "TextGenerationInput",
+         "TextGenerationInputGenerateParameters",
+         "TextGenerationInputGrammarType",
+         "TextGenerationOutput",
+         "TextGenerationOutputBestOfSequence",
+         "TextGenerationOutputDetails",
+         "TextGenerationOutputFinishReason",
+         "TextGenerationOutputPrefillToken",
+         "TextGenerationOutputToken",
+         "TextGenerationStreamOutput",
+         "TextGenerationStreamOutputStreamDetails",
+         "TextGenerationStreamOutputToken",
+         "TextToAudioEarlyStoppingEnum",
+         "TextToAudioGenerationParameters",
+         "TextToAudioInput",
+         "TextToAudioOutput",
+         "TextToAudioParameters",
+         "TextToImageInput",
+         "TextToImageOutput",
+         "TextToImageParameters",
+         "TextToSpeechEarlyStoppingEnum",
+         "TextToSpeechGenerationParameters",
+         "TextToSpeechInput",
+         "TextToSpeechOutput",
+         "TextToSpeechParameters",
+         "TextToVideoInput",
+         "TextToVideoOutput",
+         "TextToVideoParameters",
+         "TokenClassificationAggregationStrategy",
+         "TokenClassificationInput",
+         "TokenClassificationOutputElement",
+         "TokenClassificationParameters",
+         "TranslationInput",
+         "TranslationOutput",
+         "TranslationParameters",
+         "TranslationTruncationStrategy",
+         "TypeEnum",
+         "VideoClassificationInput",
+         "VideoClassificationOutputElement",
+         "VideoClassificationOutputTransform",
+         "VideoClassificationParameters",
+         "VisualQuestionAnsweringInput",
+         "VisualQuestionAnsweringInputData",
+         "VisualQuestionAnsweringOutputElement",
+         "VisualQuestionAnsweringParameters",
+         "ZeroShotClassificationInput",
+         "ZeroShotClassificationOutputElement",
+         "ZeroShotClassificationParameters",
+         "ZeroShotImageClassificationInput",
+         "ZeroShotImageClassificationOutputElement",
+         "ZeroShotImageClassificationParameters",
+         "ZeroShotObjectDetectionBoundingBox",
+         "ZeroShotObjectDetectionInput",
+         "ZeroShotObjectDetectionOutputElement",
+         "ZeroShotObjectDetectionParameters",
+     ],
+     "inference._mcp.agent": [
+         "Agent",
+     ],
+     "inference._mcp.mcp_client": [
+         "MCPClient",
+     ],
+     "inference_api": [
+         "InferenceApi",
+     ],
+     "keras_mixin": [
+         "KerasModelHubMixin",
+         "from_pretrained_keras",
+         "push_to_hub_keras",
+         "save_pretrained_keras",
+     ],
+     "repocard": [
+         "DatasetCard",
+         "ModelCard",
+         "RepoCard",
+         "SpaceCard",
490
+ "metadata_eval_result",
491
+ "metadata_load",
492
+ "metadata_save",
493
+ "metadata_update",
494
+ ],
495
+ "repocard_data": [
496
+ "CardData",
497
+ "DatasetCardData",
498
+ "EvalResult",
499
+ "ModelCardData",
500
+ "SpaceCardData",
501
+ ],
502
+ "repository": [
503
+ "Repository",
504
+ ],
505
+ "serialization": [
506
+ "StateDictSplit",
507
+ "get_tf_storage_size",
508
+ "get_torch_storage_id",
509
+ "get_torch_storage_size",
510
+ "load_state_dict_from_file",
511
+ "load_torch_model",
512
+ "save_torch_model",
513
+ "save_torch_state_dict",
514
+ "split_state_dict_into_shards_factory",
515
+ "split_tf_state_dict_into_shards",
516
+ "split_torch_state_dict_into_shards",
517
+ ],
518
+ "serialization._dduf": [
519
+ "DDUFEntry",
520
+ "export_entries_as_dduf",
521
+ "export_folder_as_dduf",
522
+ "read_dduf_file",
523
+ ],
524
+ "utils": [
525
+ "CacheNotFound",
526
+ "CachedFileInfo",
527
+ "CachedRepoInfo",
528
+ "CachedRevisionInfo",
529
+ "CorruptedCacheException",
530
+ "DeleteCacheStrategy",
531
+ "HFCacheInfo",
532
+ "HfFolder",
533
+ "cached_assets_path",
534
+ "configure_http_backend",
535
+ "dump_environment_info",
536
+ "get_session",
537
+ "get_token",
538
+ "logging",
539
+ "scan_cache_dir",
540
+ ],
541
+ }
542
+
543
+ # WARNING: __all__ is generated automatically. Any manual edit will be lost when re-generating this file!
544
+ #
545
+ # To update the static imports, please run the following command and commit the changes.
546
+ # ```
547
+ # # Use script
548
+ # python utils/check_all_variable.py --update
549
+ #
550
+ # # Or run style on codebase
551
+ # make style
552
+ # ```
553
+
554
+ __all__ = [
555
+ "Agent",
556
+ "AsyncInferenceClient",
557
+ "AudioClassificationInput",
558
+ "AudioClassificationOutputElement",
559
+ "AudioClassificationOutputTransform",
560
+ "AudioClassificationParameters",
561
+ "AudioToAudioInput",
562
+ "AudioToAudioOutputElement",
563
+ "AutomaticSpeechRecognitionEarlyStoppingEnum",
564
+ "AutomaticSpeechRecognitionGenerationParameters",
565
+ "AutomaticSpeechRecognitionInput",
566
+ "AutomaticSpeechRecognitionOutput",
567
+ "AutomaticSpeechRecognitionOutputChunk",
568
+ "AutomaticSpeechRecognitionParameters",
569
+ "CONFIG_NAME",
570
+ "CacheNotFound",
571
+ "CachedFileInfo",
572
+ "CachedRepoInfo",
573
+ "CachedRevisionInfo",
574
+ "CardData",
575
+ "ChatCompletionInput",
576
+ "ChatCompletionInputFunctionDefinition",
577
+ "ChatCompletionInputFunctionName",
578
+ "ChatCompletionInputGrammarType",
579
+ "ChatCompletionInputJSONSchema",
580
+ "ChatCompletionInputMessage",
581
+ "ChatCompletionInputMessageChunk",
582
+ "ChatCompletionInputMessageChunkType",
583
+ "ChatCompletionInputResponseFormatJSONObject",
584
+ "ChatCompletionInputResponseFormatJSONSchema",
585
+ "ChatCompletionInputResponseFormatText",
586
+ "ChatCompletionInputStreamOptions",
587
+ "ChatCompletionInputTool",
588
+ "ChatCompletionInputToolCall",
589
+ "ChatCompletionInputToolChoiceClass",
590
+ "ChatCompletionInputToolChoiceEnum",
591
+ "ChatCompletionInputURL",
592
+ "ChatCompletionOutput",
593
+ "ChatCompletionOutputComplete",
594
+ "ChatCompletionOutputFunctionDefinition",
595
+ "ChatCompletionOutputLogprob",
596
+ "ChatCompletionOutputLogprobs",
597
+ "ChatCompletionOutputMessage",
598
+ "ChatCompletionOutputToolCall",
599
+ "ChatCompletionOutputTopLogprob",
600
+ "ChatCompletionOutputUsage",
601
+ "ChatCompletionStreamOutput",
602
+ "ChatCompletionStreamOutputChoice",
603
+ "ChatCompletionStreamOutputDelta",
604
+ "ChatCompletionStreamOutputDeltaToolCall",
605
+ "ChatCompletionStreamOutputFunction",
606
+ "ChatCompletionStreamOutputLogprob",
607
+ "ChatCompletionStreamOutputLogprobs",
608
+ "ChatCompletionStreamOutputTopLogprob",
609
+ "ChatCompletionStreamOutputUsage",
610
+ "Collection",
611
+ "CollectionItem",
612
+ "CommitInfo",
613
+ "CommitOperation",
614
+ "CommitOperationAdd",
615
+ "CommitOperationCopy",
616
+ "CommitOperationDelete",
617
+ "CommitScheduler",
618
+ "CorruptedCacheException",
619
+ "DDUFEntry",
620
+ "DatasetCard",
621
+ "DatasetCardData",
622
+ "DatasetInfo",
623
+ "DeleteCacheStrategy",
624
+ "DepthEstimationInput",
625
+ "DepthEstimationOutput",
626
+ "Discussion",
627
+ "DiscussionComment",
628
+ "DiscussionCommit",
629
+ "DiscussionEvent",
630
+ "DiscussionStatusChange",
631
+ "DiscussionTitleChange",
632
+ "DiscussionWithDetails",
633
+ "DocumentQuestionAnsweringInput",
634
+ "DocumentQuestionAnsweringInputData",
635
+ "DocumentQuestionAnsweringOutputElement",
636
+ "DocumentQuestionAnsweringParameters",
637
+ "EvalResult",
638
+ "FLAX_WEIGHTS_NAME",
639
+ "FeatureExtractionInput",
640
+ "FeatureExtractionInputTruncationDirection",
641
+ "FillMaskInput",
642
+ "FillMaskOutputElement",
643
+ "FillMaskParameters",
644
+ "GitCommitInfo",
645
+ "GitRefInfo",
646
+ "GitRefs",
647
+ "HFCacheInfo",
648
+ "HFSummaryWriter",
649
+ "HUGGINGFACE_CO_URL_HOME",
650
+ "HUGGINGFACE_CO_URL_TEMPLATE",
651
+ "HfApi",
652
+ "HfFileMetadata",
653
+ "HfFileSystem",
654
+ "HfFileSystemFile",
655
+ "HfFileSystemResolvedPath",
656
+ "HfFileSystemStreamFile",
657
+ "HfFolder",
658
+ "ImageClassificationInput",
659
+ "ImageClassificationOutputElement",
660
+ "ImageClassificationOutputTransform",
661
+ "ImageClassificationParameters",
662
+ "ImageSegmentationInput",
663
+ "ImageSegmentationOutputElement",
664
+ "ImageSegmentationParameters",
665
+ "ImageSegmentationSubtask",
666
+ "ImageToImageInput",
667
+ "ImageToImageOutput",
668
+ "ImageToImageParameters",
669
+ "ImageToImageTargetSize",
670
+ "ImageToTextEarlyStoppingEnum",
671
+ "ImageToTextGenerationParameters",
672
+ "ImageToTextInput",
673
+ "ImageToTextOutput",
674
+ "ImageToTextParameters",
675
+ "ImageToVideoInput",
676
+ "ImageToVideoOutput",
677
+ "ImageToVideoParameters",
678
+ "ImageToVideoTargetSize",
679
+ "InferenceApi",
680
+ "InferenceClient",
681
+ "InferenceEndpoint",
682
+ "InferenceEndpointError",
683
+ "InferenceEndpointStatus",
684
+ "InferenceEndpointTimeoutError",
685
+ "InferenceEndpointType",
686
+ "InferenceTimeoutError",
687
+ "JobInfo",
688
+ "JobOwner",
689
+ "JobStage",
690
+ "JobStatus",
691
+ "KerasModelHubMixin",
692
+ "MCPClient",
693
+ "ModelCard",
694
+ "ModelCardData",
695
+ "ModelHubMixin",
696
+ "ModelInfo",
697
+ "OAuthInfo",
698
+ "OAuthOrgInfo",
699
+ "OAuthUserInfo",
700
+ "ObjectDetectionBoundingBox",
701
+ "ObjectDetectionInput",
702
+ "ObjectDetectionOutputElement",
703
+ "ObjectDetectionParameters",
704
+ "Organization",
705
+ "PYTORCH_WEIGHTS_NAME",
706
+ "Padding",
707
+ "PyTorchModelHubMixin",
708
+ "QuestionAnsweringInput",
709
+ "QuestionAnsweringInputData",
710
+ "QuestionAnsweringOutputElement",
711
+ "QuestionAnsweringParameters",
712
+ "REPO_TYPE_DATASET",
713
+ "REPO_TYPE_MODEL",
714
+ "REPO_TYPE_SPACE",
715
+ "RepoCard",
716
+ "RepoUrl",
717
+ "Repository",
718
+ "SentenceSimilarityInput",
719
+ "SentenceSimilarityInputData",
720
+ "SpaceCard",
721
+ "SpaceCardData",
722
+ "SpaceHardware",
723
+ "SpaceInfo",
724
+ "SpaceRuntime",
725
+ "SpaceStage",
726
+ "SpaceStorage",
727
+ "SpaceVariable",
728
+ "StateDictSplit",
729
+ "SummarizationInput",
730
+ "SummarizationOutput",
731
+ "SummarizationParameters",
732
+ "SummarizationTruncationStrategy",
733
+ "TF2_WEIGHTS_NAME",
734
+ "TF_WEIGHTS_NAME",
735
+ "TableQuestionAnsweringInput",
736
+ "TableQuestionAnsweringInputData",
737
+ "TableQuestionAnsweringOutputElement",
738
+ "TableQuestionAnsweringParameters",
739
+ "Text2TextGenerationInput",
740
+ "Text2TextGenerationOutput",
741
+ "Text2TextGenerationParameters",
742
+ "Text2TextGenerationTruncationStrategy",
743
+ "TextClassificationInput",
744
+ "TextClassificationOutputElement",
745
+ "TextClassificationOutputTransform",
746
+ "TextClassificationParameters",
747
+ "TextGenerationInput",
748
+ "TextGenerationInputGenerateParameters",
749
+ "TextGenerationInputGrammarType",
750
+ "TextGenerationOutput",
751
+ "TextGenerationOutputBestOfSequence",
752
+ "TextGenerationOutputDetails",
753
+ "TextGenerationOutputFinishReason",
754
+ "TextGenerationOutputPrefillToken",
755
+ "TextGenerationOutputToken",
756
+ "TextGenerationStreamOutput",
757
+ "TextGenerationStreamOutputStreamDetails",
758
+ "TextGenerationStreamOutputToken",
759
+ "TextToAudioEarlyStoppingEnum",
760
+ "TextToAudioGenerationParameters",
761
+ "TextToAudioInput",
762
+ "TextToAudioOutput",
763
+ "TextToAudioParameters",
764
+ "TextToImageInput",
765
+ "TextToImageOutput",
766
+ "TextToImageParameters",
767
+ "TextToSpeechEarlyStoppingEnum",
768
+ "TextToSpeechGenerationParameters",
769
+ "TextToSpeechInput",
770
+ "TextToSpeechOutput",
771
+ "TextToSpeechParameters",
772
+ "TextToVideoInput",
773
+ "TextToVideoOutput",
774
+ "TextToVideoParameters",
775
+ "TokenClassificationAggregationStrategy",
776
+ "TokenClassificationInput",
777
+ "TokenClassificationOutputElement",
778
+ "TokenClassificationParameters",
779
+ "TranslationInput",
780
+ "TranslationOutput",
781
+ "TranslationParameters",
782
+ "TranslationTruncationStrategy",
783
+ "TypeEnum",
784
+ "User",
785
+ "UserLikes",
786
+ "VideoClassificationInput",
787
+ "VideoClassificationOutputElement",
788
+ "VideoClassificationOutputTransform",
789
+ "VideoClassificationParameters",
790
+ "VisualQuestionAnsweringInput",
791
+ "VisualQuestionAnsweringInputData",
792
+ "VisualQuestionAnsweringOutputElement",
793
+ "VisualQuestionAnsweringParameters",
794
+ "WebhookInfo",
795
+ "WebhookPayload",
796
+ "WebhookPayloadComment",
797
+ "WebhookPayloadDiscussion",
798
+ "WebhookPayloadDiscussionChanges",
799
+ "WebhookPayloadEvent",
800
+ "WebhookPayloadMovedTo",
801
+ "WebhookPayloadRepo",
802
+ "WebhookPayloadUrl",
803
+ "WebhookPayloadWebhook",
804
+ "WebhookWatchedItem",
805
+ "WebhooksServer",
806
+ "ZeroShotClassificationInput",
807
+ "ZeroShotClassificationOutputElement",
808
+ "ZeroShotClassificationParameters",
809
+ "ZeroShotImageClassificationInput",
810
+ "ZeroShotImageClassificationOutputElement",
811
+ "ZeroShotImageClassificationParameters",
812
+ "ZeroShotObjectDetectionBoundingBox",
813
+ "ZeroShotObjectDetectionInput",
814
+ "ZeroShotObjectDetectionOutputElement",
815
+ "ZeroShotObjectDetectionParameters",
816
+ "_CACHED_NO_EXIST",
817
+ "_save_pretrained_fastai",
818
+ "accept_access_request",
819
+ "add_collection_item",
820
+ "add_space_secret",
821
+ "add_space_variable",
822
+ "attach_huggingface_oauth",
823
+ "auth_check",
824
+ "auth_list",
825
+ "auth_switch",
826
+ "cached_assets_path",
827
+ "cancel_access_request",
828
+ "cancel_job",
829
+ "change_discussion_status",
830
+ "comment_discussion",
831
+ "configure_http_backend",
832
+ "create_branch",
833
+ "create_collection",
834
+ "create_commit",
835
+ "create_discussion",
836
+ "create_inference_endpoint",
837
+ "create_inference_endpoint_from_catalog",
838
+ "create_pull_request",
839
+ "create_repo",
840
+ "create_scheduled_job",
841
+ "create_scheduled_uv_job",
842
+ "create_tag",
843
+ "create_webhook",
844
+ "dataset_info",
845
+ "delete_branch",
846
+ "delete_collection",
847
+ "delete_collection_item",
848
+ "delete_file",
849
+ "delete_folder",
850
+ "delete_inference_endpoint",
851
+ "delete_repo",
852
+ "delete_scheduled_job",
853
+ "delete_space_secret",
854
+ "delete_space_storage",
855
+ "delete_space_variable",
856
+ "delete_tag",
857
+ "delete_webhook",
858
+ "disable_webhook",
859
+ "dump_environment_info",
860
+ "duplicate_space",
861
+ "edit_discussion_comment",
862
+ "enable_webhook",
863
+ "export_entries_as_dduf",
864
+ "export_folder_as_dduf",
865
+ "fetch_job_logs",
866
+ "file_exists",
867
+ "from_pretrained_fastai",
868
+ "from_pretrained_keras",
869
+ "get_collection",
870
+ "get_dataset_tags",
871
+ "get_discussion_details",
872
+ "get_full_repo_name",
873
+ "get_hf_file_metadata",
874
+ "get_inference_endpoint",
875
+ "get_model_tags",
876
+ "get_organization_overview",
877
+ "get_paths_info",
878
+ "get_repo_discussions",
879
+ "get_safetensors_metadata",
880
+ "get_session",
881
+ "get_space_runtime",
882
+ "get_space_variables",
883
+ "get_tf_storage_size",
884
+ "get_token",
885
+ "get_token_permission",
886
+ "get_torch_storage_id",
887
+ "get_torch_storage_size",
888
+ "get_user_overview",
889
+ "get_webhook",
890
+ "grant_access",
891
+ "hf_hub_download",
892
+ "hf_hub_url",
893
+ "inspect_job",
894
+ "inspect_scheduled_job",
895
+ "interpreter_login",
896
+ "list_accepted_access_requests",
897
+ "list_collections",
898
+ "list_datasets",
899
+ "list_inference_catalog",
900
+ "list_inference_endpoints",
901
+ "list_jobs",
902
+ "list_lfs_files",
903
+ "list_liked_repos",
904
+ "list_models",
905
+ "list_organization_members",
906
+ "list_papers",
907
+ "list_pending_access_requests",
908
+ "list_rejected_access_requests",
909
+ "list_repo_commits",
910
+ "list_repo_files",
911
+ "list_repo_likers",
912
+ "list_repo_refs",
913
+ "list_repo_tree",
914
+ "list_spaces",
915
+ "list_user_followers",
916
+ "list_user_following",
917
+ "list_webhooks",
918
+ "load_state_dict_from_file",
919
+ "load_torch_model",
920
+ "logging",
921
+ "login",
922
+ "logout",
923
+ "merge_pull_request",
924
+ "metadata_eval_result",
925
+ "metadata_load",
926
+ "metadata_save",
927
+ "metadata_update",
928
+ "model_info",
929
+ "move_repo",
930
+ "notebook_login",
931
+ "paper_info",
932
+ "parse_huggingface_oauth",
933
+ "parse_safetensors_file_metadata",
934
+ "pause_inference_endpoint",
935
+ "pause_space",
936
+ "permanently_delete_lfs_files",
937
+ "preupload_lfs_files",
938
+ "push_to_hub_fastai",
939
+ "push_to_hub_keras",
940
+ "read_dduf_file",
941
+ "reject_access_request",
942
+ "rename_discussion",
943
+ "repo_exists",
944
+ "repo_info",
945
+ "repo_type_and_id_from_hf_id",
946
+ "request_space_hardware",
947
+ "request_space_storage",
948
+ "restart_space",
949
+ "resume_inference_endpoint",
950
+ "resume_scheduled_job",
951
+ "revision_exists",
952
+ "run_as_future",
953
+ "run_job",
954
+ "run_uv_job",
955
+ "save_pretrained_keras",
956
+ "save_torch_model",
957
+ "save_torch_state_dict",
958
+ "scale_to_zero_inference_endpoint",
959
+ "scan_cache_dir",
960
+ "set_space_sleep_time",
961
+ "snapshot_download",
962
+ "space_info",
963
+ "split_state_dict_into_shards_factory",
964
+ "split_tf_state_dict_into_shards",
965
+ "split_torch_state_dict_into_shards",
966
+ "super_squash_history",
967
+ "suspend_scheduled_job",
968
+ "try_to_load_from_cache",
969
+ "unlike",
970
+ "update_collection_item",
971
+ "update_collection_metadata",
972
+ "update_inference_endpoint",
973
+ "update_repo_settings",
974
+ "update_repo_visibility",
975
+ "update_webhook",
976
+ "upload_file",
977
+ "upload_folder",
978
+ "upload_large_folder",
979
+ "webhook_endpoint",
980
+ "whoami",
981
+ ]
982
+
983
+
984
+ def _attach(package_name, submodules=None, submod_attrs=None):
985
+ """Attach lazily loaded submodules, functions, or other attributes.
986
+
987
+ Typically, modules import submodules and attributes as follows:
988
+
989
+ ```py
990
+ import mysubmodule
991
+ import anothersubmodule
992
+
993
+ from .foo import someattr
994
+ ```
995
+
996
+ The idea is to replace a package's `__getattr__` and `__dir__` such that all imports
997
+ work exactly the way they would with normal imports, except that the import occurs
998
+ upon first use.
999
+
1000
+ The typical way to call this function, replacing the above imports, is:
1001
+
1002
+ ```python
1003
+ __getattr__, __dir__ = _attach(
1004
+ __name__,
1005
+ ['mysubmodule', 'anothersubmodule'],
1006
+ {'foo': ['someattr']}
1007
+ )
1008
+ ```
1009
+ This functionality requires Python 3.7 or higher.
1010
+
1011
+ Args:
1012
+ package_name (`str`):
1013
+ Typically use `__name__`.
1014
+ submodules (`set`):
1015
+ Set of submodule names to attach (any iterable is accepted and converted to a set).
1016
+ submod_attrs (`dict`):
1017
+ Dictionary of submodule -> list of attributes / functions.
1018
+ These attributes are imported as they are used.
1019
+
1020
+ Returns:
1021
+ __getattr__, __dir__
1022
+
1023
+ """
1024
+ if submod_attrs is None:
1025
+ submod_attrs = {}
1026
+
1027
+ if submodules is None:
1028
+ submodules = set()
1029
+ else:
1030
+ submodules = set(submodules)
1031
+
1032
+ attr_to_modules = {attr: mod for mod, attrs in submod_attrs.items() for attr in attrs}
1033
+
1034
+ def __getattr__(name):
1035
+ if name in submodules:
1036
+ try:
1037
+ return importlib.import_module(f"{package_name}.{name}")
1038
+ except Exception as e:
1039
+ print(f"Error importing {package_name}.{name}: {e}")
1040
+ raise
1041
+ elif name in attr_to_modules:
1042
+ submod_path = f"{package_name}.{attr_to_modules[name]}"
1043
+ try:
1044
+ submod = importlib.import_module(submod_path)
1045
+ except Exception as e:
1046
+ print(f"Error importing {submod_path}: {e}")
1047
+ raise
1048
+ attr = getattr(submod, name)
1049
+
1050
+ # If the attribute lives in a file (module) with the same
1051
+ # name as the attribute, ensure that the attribute and *not*
1052
+ # the module is accessible on the package.
1053
+ if name == attr_to_modules[name]:
1054
+ pkg = sys.modules[package_name]
1055
+ pkg.__dict__[name] = attr
1056
+
1057
+ return attr
1058
+ else:
1059
+ raise AttributeError(f"No {package_name} attribute {name}")
1060
+
1061
+ def __dir__():
1062
+ return __all__
1063
+
1064
+ return __getattr__, __dir__
1065
+
1066
+
1067
+ __getattr__, __dir__ = _attach(__name__, submodules=[], submod_attrs=_SUBMOD_ATTRS)
1068
+
1069
+ if os.environ.get("EAGER_IMPORT", ""):
1070
+ for attr in __all__:
1071
+ __getattr__(attr)
1072
+
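+ # A minimal usage sketch (illustrative, not part of this file) of the lazy wiring
+ # above, relying on module-level __getattr__/__dir__ (PEP 562):
+ #
+ #   import huggingface_hub
+ #   # `huggingface_hub.file_download` is not imported yet; the first attribute
+ #   # access below routes through __getattr__, which imports the submodule and
+ #   # returns the attribute from it.
+ #   download = huggingface_hub.hf_hub_download
+ #   assert "hf_hub_download" in dir(huggingface_hub)  # __dir__ simply returns __all__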
1073
+ # WARNING: any content below this statement is generated automatically. Any manual edit
1074
+ # will be lost when re-generating this file!
1075
+ #
1076
+ # To update the static imports, please run the following command and commit the changes.
1077
+ # ```
1078
+ # # Use script
1079
+ # python utils/check_static_imports.py --update
1080
+ #
1081
+ # # Or run style on codebase
1082
+ # make style
1083
+ # ```
1084
+ if TYPE_CHECKING: # pragma: no cover
1085
+ from ._commit_scheduler import CommitScheduler # noqa: F401
1086
+ from ._inference_endpoints import (
1087
+ InferenceEndpoint, # noqa: F401
1088
+ InferenceEndpointError, # noqa: F401
1089
+ InferenceEndpointStatus, # noqa: F401
1090
+ InferenceEndpointTimeoutError, # noqa: F401
1091
+ InferenceEndpointType, # noqa: F401
1092
+ )
1093
+ from ._jobs_api import (
1094
+ JobInfo, # noqa: F401
1095
+ JobOwner, # noqa: F401
1096
+ JobStage, # noqa: F401
1097
+ JobStatus, # noqa: F401
1098
+ )
1099
+ from ._login import (
1100
+ auth_list, # noqa: F401
1101
+ auth_switch, # noqa: F401
1102
+ interpreter_login, # noqa: F401
1103
+ login, # noqa: F401
1104
+ logout, # noqa: F401
1105
+ notebook_login, # noqa: F401
1106
+ )
1107
+ from ._oauth import (
1108
+ OAuthInfo, # noqa: F401
1109
+ OAuthOrgInfo, # noqa: F401
1110
+ OAuthUserInfo, # noqa: F401
1111
+ attach_huggingface_oauth, # noqa: F401
1112
+ parse_huggingface_oauth, # noqa: F401
1113
+ )
1114
+ from ._snapshot_download import snapshot_download # noqa: F401
1115
+ from ._space_api import (
1116
+ SpaceHardware, # noqa: F401
1117
+ SpaceRuntime, # noqa: F401
1118
+ SpaceStage, # noqa: F401
1119
+ SpaceStorage, # noqa: F401
1120
+ SpaceVariable, # noqa: F401
1121
+ )
1122
+ from ._tensorboard_logger import HFSummaryWriter # noqa: F401
1123
+ from ._webhooks_payload import (
1124
+ WebhookPayload, # noqa: F401
1125
+ WebhookPayloadComment, # noqa: F401
1126
+ WebhookPayloadDiscussion, # noqa: F401
1127
+ WebhookPayloadDiscussionChanges, # noqa: F401
1128
+ WebhookPayloadEvent, # noqa: F401
1129
+ WebhookPayloadMovedTo, # noqa: F401
1130
+ WebhookPayloadRepo, # noqa: F401
1131
+ WebhookPayloadUrl, # noqa: F401
1132
+ WebhookPayloadWebhook, # noqa: F401
1133
+ )
1134
+ from ._webhooks_server import (
1135
+ WebhooksServer, # noqa: F401
1136
+ webhook_endpoint, # noqa: F401
1137
+ )
1138
+ from .community import (
1139
+ Discussion, # noqa: F401
1140
+ DiscussionComment, # noqa: F401
1141
+ DiscussionCommit, # noqa: F401
1142
+ DiscussionEvent, # noqa: F401
1143
+ DiscussionStatusChange, # noqa: F401
1144
+ DiscussionTitleChange, # noqa: F401
1145
+ DiscussionWithDetails, # noqa: F401
1146
+ )
1147
+ from .constants import (
1148
+ CONFIG_NAME, # noqa: F401
1149
+ FLAX_WEIGHTS_NAME, # noqa: F401
1150
+ HUGGINGFACE_CO_URL_HOME, # noqa: F401
1151
+ HUGGINGFACE_CO_URL_TEMPLATE, # noqa: F401
1152
+ PYTORCH_WEIGHTS_NAME, # noqa: F401
1153
+ REPO_TYPE_DATASET, # noqa: F401
1154
+ REPO_TYPE_MODEL, # noqa: F401
1155
+ REPO_TYPE_SPACE, # noqa: F401
1156
+ TF2_WEIGHTS_NAME, # noqa: F401
1157
+ TF_WEIGHTS_NAME, # noqa: F401
1158
+ )
1159
+ from .fastai_utils import (
1160
+ _save_pretrained_fastai, # noqa: F401
1161
+ from_pretrained_fastai, # noqa: F401
1162
+ push_to_hub_fastai, # noqa: F401
1163
+ )
1164
+ from .file_download import (
1165
+ _CACHED_NO_EXIST, # noqa: F401
1166
+ HfFileMetadata, # noqa: F401
1167
+ get_hf_file_metadata, # noqa: F401
1168
+ hf_hub_download, # noqa: F401
1169
+ hf_hub_url, # noqa: F401
1170
+ try_to_load_from_cache, # noqa: F401
1171
+ )
1172
+ from .hf_api import (
1173
+ Collection, # noqa: F401
1174
+ CollectionItem, # noqa: F401
1175
+ CommitInfo, # noqa: F401
1176
+ CommitOperation, # noqa: F401
1177
+ CommitOperationAdd, # noqa: F401
1178
+ CommitOperationCopy, # noqa: F401
1179
+ CommitOperationDelete, # noqa: F401
1180
+ DatasetInfo, # noqa: F401
1181
+ GitCommitInfo, # noqa: F401
1182
+ GitRefInfo, # noqa: F401
1183
+ GitRefs, # noqa: F401
1184
+ HfApi, # noqa: F401
1185
+ ModelInfo, # noqa: F401
1186
+ Organization, # noqa: F401
1187
+ RepoUrl, # noqa: F401
1188
+ SpaceInfo, # noqa: F401
1189
+ User, # noqa: F401
1190
+ UserLikes, # noqa: F401
1191
+ WebhookInfo, # noqa: F401
1192
+ WebhookWatchedItem, # noqa: F401
1193
+ accept_access_request, # noqa: F401
1194
+ add_collection_item, # noqa: F401
1195
+ add_space_secret, # noqa: F401
1196
+ add_space_variable, # noqa: F401
1197
+ auth_check, # noqa: F401
1198
+ cancel_access_request, # noqa: F401
1199
+ cancel_job, # noqa: F401
1200
+ change_discussion_status, # noqa: F401
1201
+ comment_discussion, # noqa: F401
1202
+ create_branch, # noqa: F401
1203
+ create_collection, # noqa: F401
1204
+ create_commit, # noqa: F401
1205
+ create_discussion, # noqa: F401
1206
+ create_inference_endpoint, # noqa: F401
1207
+ create_inference_endpoint_from_catalog, # noqa: F401
1208
+ create_pull_request, # noqa: F401
1209
+ create_repo, # noqa: F401
1210
+ create_scheduled_job, # noqa: F401
1211
+ create_scheduled_uv_job, # noqa: F401
1212
+ create_tag, # noqa: F401
1213
+ create_webhook, # noqa: F401
1214
+ dataset_info, # noqa: F401
1215
+ delete_branch, # noqa: F401
1216
+ delete_collection, # noqa: F401
1217
+ delete_collection_item, # noqa: F401
1218
+ delete_file, # noqa: F401
1219
+ delete_folder, # noqa: F401
1220
+ delete_inference_endpoint, # noqa: F401
1221
+ delete_repo, # noqa: F401
1222
+ delete_scheduled_job, # noqa: F401
1223
+ delete_space_secret, # noqa: F401
1224
+ delete_space_storage, # noqa: F401
1225
+ delete_space_variable, # noqa: F401
1226
+ delete_tag, # noqa: F401
1227
+ delete_webhook, # noqa: F401
1228
+ disable_webhook, # noqa: F401
1229
+ duplicate_space, # noqa: F401
1230
+ edit_discussion_comment, # noqa: F401
1231
+ enable_webhook, # noqa: F401
1232
+ fetch_job_logs, # noqa: F401
1233
+ file_exists, # noqa: F401
1234
+ get_collection, # noqa: F401
1235
+ get_dataset_tags, # noqa: F401
1236
+ get_discussion_details, # noqa: F401
1237
+ get_full_repo_name, # noqa: F401
1238
+ get_inference_endpoint, # noqa: F401
1239
+ get_model_tags, # noqa: F401
1240
+ get_organization_overview, # noqa: F401
1241
+ get_paths_info, # noqa: F401
1242
+ get_repo_discussions, # noqa: F401
1243
+ get_safetensors_metadata, # noqa: F401
1244
+ get_space_runtime, # noqa: F401
1245
+ get_space_variables, # noqa: F401
1246
+ get_token_permission, # noqa: F401
1247
+ get_user_overview, # noqa: F401
1248
+ get_webhook, # noqa: F401
1249
+ grant_access, # noqa: F401
1250
+ inspect_job, # noqa: F401
1251
+ inspect_scheduled_job, # noqa: F401
1252
+ list_accepted_access_requests, # noqa: F401
1253
+ list_collections, # noqa: F401
1254
+ list_datasets, # noqa: F401
1255
+ list_inference_catalog, # noqa: F401
1256
+ list_inference_endpoints, # noqa: F401
1257
+ list_jobs, # noqa: F401
1258
+ list_lfs_files, # noqa: F401
1259
+ list_liked_repos, # noqa: F401
1260
+ list_models, # noqa: F401
1261
+ list_organization_members, # noqa: F401
1262
+ list_papers, # noqa: F401
1263
+ list_pending_access_requests, # noqa: F401
1264
+ list_rejected_access_requests, # noqa: F401
1265
+ list_repo_commits, # noqa: F401
1266
+ list_repo_files, # noqa: F401
1267
+ list_repo_likers, # noqa: F401
1268
+ list_repo_refs, # noqa: F401
1269
+ list_repo_tree, # noqa: F401
1270
+ list_spaces, # noqa: F401
1271
+ list_user_followers, # noqa: F401
1272
+ list_user_following, # noqa: F401
1273
+ list_webhooks, # noqa: F401
1274
+ merge_pull_request, # noqa: F401
1275
+ model_info, # noqa: F401
1276
+ move_repo, # noqa: F401
1277
+ paper_info, # noqa: F401
1278
+ parse_safetensors_file_metadata, # noqa: F401
1279
+ pause_inference_endpoint, # noqa: F401
1280
+ pause_space, # noqa: F401
1281
+ permanently_delete_lfs_files, # noqa: F401
1282
+ preupload_lfs_files, # noqa: F401
1283
+ reject_access_request, # noqa: F401
1284
+ rename_discussion, # noqa: F401
1285
+ repo_exists, # noqa: F401
1286
+ repo_info, # noqa: F401
1287
+ repo_type_and_id_from_hf_id, # noqa: F401
1288
+ request_space_hardware, # noqa: F401
1289
+ request_space_storage, # noqa: F401
1290
+ restart_space, # noqa: F401
1291
+ resume_inference_endpoint, # noqa: F401
1292
+ resume_scheduled_job, # noqa: F401
1293
+ revision_exists, # noqa: F401
1294
+ run_as_future, # noqa: F401
1295
+ run_job, # noqa: F401
1296
+ run_uv_job, # noqa: F401
1297
+ scale_to_zero_inference_endpoint, # noqa: F401
1298
+ set_space_sleep_time, # noqa: F401
1299
+ space_info, # noqa: F401
1300
+ super_squash_history, # noqa: F401
1301
+ suspend_scheduled_job, # noqa: F401
1302
+ unlike, # noqa: F401
1303
+ update_collection_item, # noqa: F401
1304
+ update_collection_metadata, # noqa: F401
1305
+ update_inference_endpoint, # noqa: F401
1306
+ update_repo_settings, # noqa: F401
1307
+ update_repo_visibility, # noqa: F401
1308
+ update_webhook, # noqa: F401
1309
+ upload_file, # noqa: F401
1310
+ upload_folder, # noqa: F401
1311
+ upload_large_folder, # noqa: F401
1312
+ whoami, # noqa: F401
1313
+ )
1314
+ from .hf_file_system import (
1315
+ HfFileSystem, # noqa: F401
1316
+ HfFileSystemFile, # noqa: F401
1317
+ HfFileSystemResolvedPath, # noqa: F401
1318
+ HfFileSystemStreamFile, # noqa: F401
1319
+ )
1320
+ from .hub_mixin import (
1321
+ ModelHubMixin, # noqa: F401
1322
+ PyTorchModelHubMixin, # noqa: F401
1323
+ )
1324
+ from .inference._client import (
1325
+ InferenceClient, # noqa: F401
1326
+ InferenceTimeoutError, # noqa: F401
1327
+ )
1328
+ from .inference._generated._async_client import AsyncInferenceClient # noqa: F401
1329
+ from .inference._generated.types import (
1330
+ AudioClassificationInput, # noqa: F401
1331
+ AudioClassificationOutputElement, # noqa: F401
1332
+ AudioClassificationOutputTransform, # noqa: F401
1333
+ AudioClassificationParameters, # noqa: F401
1334
+ AudioToAudioInput, # noqa: F401
1335
+ AudioToAudioOutputElement, # noqa: F401
1336
+ AutomaticSpeechRecognitionEarlyStoppingEnum, # noqa: F401
1337
+ AutomaticSpeechRecognitionGenerationParameters, # noqa: F401
1338
+ AutomaticSpeechRecognitionInput, # noqa: F401
1339
+ AutomaticSpeechRecognitionOutput, # noqa: F401
1340
+ AutomaticSpeechRecognitionOutputChunk, # noqa: F401
1341
+ AutomaticSpeechRecognitionParameters, # noqa: F401
1342
+ ChatCompletionInput, # noqa: F401
1343
+ ChatCompletionInputFunctionDefinition, # noqa: F401
1344
+ ChatCompletionInputFunctionName, # noqa: F401
1345
+ ChatCompletionInputGrammarType, # noqa: F401
1346
+ ChatCompletionInputJSONSchema, # noqa: F401
1347
+ ChatCompletionInputMessage, # noqa: F401
1348
+ ChatCompletionInputMessageChunk, # noqa: F401
1349
+ ChatCompletionInputMessageChunkType, # noqa: F401
1350
+ ChatCompletionInputResponseFormatJSONObject, # noqa: F401
1351
+ ChatCompletionInputResponseFormatJSONSchema, # noqa: F401
1352
+ ChatCompletionInputResponseFormatText, # noqa: F401
1353
+ ChatCompletionInputStreamOptions, # noqa: F401
1354
+ ChatCompletionInputTool, # noqa: F401
1355
+ ChatCompletionInputToolCall, # noqa: F401
1356
+ ChatCompletionInputToolChoiceClass, # noqa: F401
1357
+ ChatCompletionInputToolChoiceEnum, # noqa: F401
1358
+ ChatCompletionInputURL, # noqa: F401
1359
+ ChatCompletionOutput, # noqa: F401
1360
+ ChatCompletionOutputComplete, # noqa: F401
1361
+ ChatCompletionOutputFunctionDefinition, # noqa: F401
1362
+ ChatCompletionOutputLogprob, # noqa: F401
1363
+ ChatCompletionOutputLogprobs, # noqa: F401
1364
+ ChatCompletionOutputMessage, # noqa: F401
1365
+ ChatCompletionOutputToolCall, # noqa: F401
1366
+ ChatCompletionOutputTopLogprob, # noqa: F401
1367
+ ChatCompletionOutputUsage, # noqa: F401
1368
+ ChatCompletionStreamOutput, # noqa: F401
1369
+ ChatCompletionStreamOutputChoice, # noqa: F401
1370
+ ChatCompletionStreamOutputDelta, # noqa: F401
1371
+ ChatCompletionStreamOutputDeltaToolCall, # noqa: F401
1372
+ ChatCompletionStreamOutputFunction, # noqa: F401
1373
+ ChatCompletionStreamOutputLogprob, # noqa: F401
1374
+ ChatCompletionStreamOutputLogprobs, # noqa: F401
1375
+ ChatCompletionStreamOutputTopLogprob, # noqa: F401
1376
+ ChatCompletionStreamOutputUsage, # noqa: F401
1377
+ DepthEstimationInput, # noqa: F401
1378
+ DepthEstimationOutput, # noqa: F401
1379
+ DocumentQuestionAnsweringInput, # noqa: F401
1380
+ DocumentQuestionAnsweringInputData, # noqa: F401
1381
+ DocumentQuestionAnsweringOutputElement, # noqa: F401
1382
+ DocumentQuestionAnsweringParameters, # noqa: F401
1383
+ FeatureExtractionInput, # noqa: F401
1384
+ FeatureExtractionInputTruncationDirection, # noqa: F401
1385
+ FillMaskInput, # noqa: F401
1386
+ FillMaskOutputElement, # noqa: F401
1387
+ FillMaskParameters, # noqa: F401
1388
+ ImageClassificationInput, # noqa: F401
1389
+ ImageClassificationOutputElement, # noqa: F401
1390
+ ImageClassificationOutputTransform, # noqa: F401
1391
+ ImageClassificationParameters, # noqa: F401
1392
+ ImageSegmentationInput, # noqa: F401
1393
+ ImageSegmentationOutputElement, # noqa: F401
1394
+ ImageSegmentationParameters, # noqa: F401
1395
+ ImageSegmentationSubtask, # noqa: F401
1396
+ ImageToImageInput, # noqa: F401
1397
+ ImageToImageOutput, # noqa: F401
1398
+ ImageToImageParameters, # noqa: F401
1399
+ ImageToImageTargetSize, # noqa: F401
1400
+ ImageToTextEarlyStoppingEnum, # noqa: F401
1401
+ ImageToTextGenerationParameters, # noqa: F401
1402
+ ImageToTextInput, # noqa: F401
1403
+ ImageToTextOutput, # noqa: F401
1404
+ ImageToTextParameters, # noqa: F401
1405
+ ImageToVideoInput, # noqa: F401
1406
+ ImageToVideoOutput, # noqa: F401
1407
+ ImageToVideoParameters, # noqa: F401
1408
+ ImageToVideoTargetSize, # noqa: F401
1409
+ ObjectDetectionBoundingBox, # noqa: F401
1410
+ ObjectDetectionInput, # noqa: F401
1411
+ ObjectDetectionOutputElement, # noqa: F401
1412
+ ObjectDetectionParameters, # noqa: F401
1413
+ Padding, # noqa: F401
1414
+ QuestionAnsweringInput, # noqa: F401
1415
+ QuestionAnsweringInputData, # noqa: F401
1416
+ QuestionAnsweringOutputElement, # noqa: F401
1417
+ QuestionAnsweringParameters, # noqa: F401
1418
+ SentenceSimilarityInput, # noqa: F401
1419
+ SentenceSimilarityInputData, # noqa: F401
1420
+ SummarizationInput, # noqa: F401
1421
+ SummarizationOutput, # noqa: F401
1422
+ SummarizationParameters, # noqa: F401
1423
+ SummarizationTruncationStrategy, # noqa: F401
1424
+ TableQuestionAnsweringInput, # noqa: F401
1425
+ TableQuestionAnsweringInputData, # noqa: F401
1426
+ TableQuestionAnsweringOutputElement, # noqa: F401
1427
+ TableQuestionAnsweringParameters, # noqa: F401
1428
+ Text2TextGenerationInput, # noqa: F401
1429
+ Text2TextGenerationOutput, # noqa: F401
1430
+ Text2TextGenerationParameters, # noqa: F401
1431
+ Text2TextGenerationTruncationStrategy, # noqa: F401
1432
+ TextClassificationInput, # noqa: F401
1433
+ TextClassificationOutputElement, # noqa: F401
1434
+ TextClassificationOutputTransform, # noqa: F401
1435
+ TextClassificationParameters, # noqa: F401
1436
+ TextGenerationInput, # noqa: F401
1437
+ TextGenerationInputGenerateParameters, # noqa: F401
1438
+ TextGenerationInputGrammarType, # noqa: F401
1439
+ TextGenerationOutput, # noqa: F401
1440
+ TextGenerationOutputBestOfSequence, # noqa: F401
1441
+ TextGenerationOutputDetails, # noqa: F401
1442
+ TextGenerationOutputFinishReason, # noqa: F401
1443
+ TextGenerationOutputPrefillToken, # noqa: F401
1444
+ TextGenerationOutputToken, # noqa: F401
1445
+ TextGenerationStreamOutput, # noqa: F401
1446
+ TextGenerationStreamOutputStreamDetails, # noqa: F401
1447
+ TextGenerationStreamOutputToken, # noqa: F401
1448
+ TextToAudioEarlyStoppingEnum, # noqa: F401
1449
+ TextToAudioGenerationParameters, # noqa: F401
1450
+ TextToAudioInput, # noqa: F401
1451
+ TextToAudioOutput, # noqa: F401
1452
+ TextToAudioParameters, # noqa: F401
1453
+ TextToImageInput, # noqa: F401
1454
+ TextToImageOutput, # noqa: F401
1455
+ TextToImageParameters, # noqa: F401
1456
+ TextToSpeechEarlyStoppingEnum, # noqa: F401
1457
+ TextToSpeechGenerationParameters, # noqa: F401
1458
+ TextToSpeechInput, # noqa: F401
1459
+ TextToSpeechOutput, # noqa: F401
1460
+ TextToSpeechParameters, # noqa: F401
1461
+ TextToVideoInput, # noqa: F401
1462
+ TextToVideoOutput, # noqa: F401
1463
+ TextToVideoParameters, # noqa: F401
1464
+ TokenClassificationAggregationStrategy, # noqa: F401
1465
+ TokenClassificationInput, # noqa: F401
1466
+ TokenClassificationOutputElement, # noqa: F401
1467
+ TokenClassificationParameters, # noqa: F401
1468
+ TranslationInput, # noqa: F401
1469
+ TranslationOutput, # noqa: F401
1470
+ TranslationParameters, # noqa: F401
1471
+ TranslationTruncationStrategy, # noqa: F401
1472
+ TypeEnum, # noqa: F401
1473
+ VideoClassificationInput, # noqa: F401
1474
+ VideoClassificationOutputElement, # noqa: F401
1475
+ VideoClassificationOutputTransform, # noqa: F401
1476
+ VideoClassificationParameters, # noqa: F401
1477
+ VisualQuestionAnsweringInput, # noqa: F401
1478
+ VisualQuestionAnsweringInputData, # noqa: F401
1479
+ VisualQuestionAnsweringOutputElement, # noqa: F401
1480
+ VisualQuestionAnsweringParameters, # noqa: F401
1481
+ ZeroShotClassificationInput, # noqa: F401
1482
+ ZeroShotClassificationOutputElement, # noqa: F401
1483
+ ZeroShotClassificationParameters, # noqa: F401
1484
+ ZeroShotImageClassificationInput, # noqa: F401
1485
+ ZeroShotImageClassificationOutputElement, # noqa: F401
1486
+ ZeroShotImageClassificationParameters, # noqa: F401
1487
+ ZeroShotObjectDetectionBoundingBox, # noqa: F401
1488
+ ZeroShotObjectDetectionInput, # noqa: F401
1489
+ ZeroShotObjectDetectionOutputElement, # noqa: F401
1490
+ ZeroShotObjectDetectionParameters, # noqa: F401
1491
+ )
1492
+ from .inference._mcp.agent import Agent # noqa: F401
1493
+ from .inference._mcp.mcp_client import MCPClient # noqa: F401
1494
+ from .inference_api import InferenceApi # noqa: F401
1495
+ from .keras_mixin import (
1496
+ KerasModelHubMixin, # noqa: F401
1497
+ from_pretrained_keras, # noqa: F401
1498
+ push_to_hub_keras, # noqa: F401
1499
+ save_pretrained_keras, # noqa: F401
1500
+ )
1501
+ from .repocard import (
1502
+ DatasetCard, # noqa: F401
1503
+ ModelCard, # noqa: F401
1504
+ RepoCard, # noqa: F401
1505
+ SpaceCard, # noqa: F401
1506
+ metadata_eval_result, # noqa: F401
1507
+ metadata_load, # noqa: F401
1508
+ metadata_save, # noqa: F401
1509
+ metadata_update, # noqa: F401
1510
+ )
1511
+ from .repocard_data import (
1512
+ CardData, # noqa: F401
1513
+ DatasetCardData, # noqa: F401
1514
+ EvalResult, # noqa: F401
1515
+ ModelCardData, # noqa: F401
1516
+ SpaceCardData, # noqa: F401
1517
+ )
1518
+ from .repository import Repository # noqa: F401
1519
+ from .serialization import (
1520
+ StateDictSplit, # noqa: F401
1521
+ get_tf_storage_size, # noqa: F401
1522
+ get_torch_storage_id, # noqa: F401
1523
+ get_torch_storage_size, # noqa: F401
1524
+ load_state_dict_from_file, # noqa: F401
1525
+ load_torch_model, # noqa: F401
1526
+ save_torch_model, # noqa: F401
1527
+ save_torch_state_dict, # noqa: F401
1528
+ split_state_dict_into_shards_factory, # noqa: F401
1529
+ split_tf_state_dict_into_shards, # noqa: F401
1530
+ split_torch_state_dict_into_shards, # noqa: F401
1531
+ )
1532
+ from .serialization._dduf import (
1533
+ DDUFEntry, # noqa: F401
1534
+ export_entries_as_dduf, # noqa: F401
1535
+ export_folder_as_dduf, # noqa: F401
1536
+ read_dduf_file, # noqa: F401
1537
+ )
1538
+ from .utils import (
1539
+ CachedFileInfo, # noqa: F401
1540
+ CachedRepoInfo, # noqa: F401
1541
+ CachedRevisionInfo, # noqa: F401
1542
+ CacheNotFound, # noqa: F401
1543
+ CorruptedCacheException, # noqa: F401
1544
+ DeleteCacheStrategy, # noqa: F401
1545
+ HFCacheInfo, # noqa: F401
1546
+ HfFolder, # noqa: F401
1547
+ cached_assets_path, # noqa: F401
1548
+ configure_http_backend, # noqa: F401
1549
+ dump_environment_info, # noqa: F401
1550
+ get_session, # noqa: F401
1551
+ get_token, # noqa: F401
1552
+ logging, # noqa: F401
1553
+ scan_cache_dir, # noqa: F401
1554
+ )
venv/lib/python3.13/site-packages/huggingface_hub/_commit_api.py ADDED
@@ -0,0 +1,968 @@
1
+ """
2
+ Type definitions and utilities for the `create_commit` API
3
+ """
4
+
5
+ import base64
6
+ import io
7
+ import os
8
+ import warnings
9
+ from collections import defaultdict
10
+ from contextlib import contextmanager
11
+ from dataclasses import dataclass, field
12
+ from itertools import groupby
13
+ from pathlib import Path, PurePosixPath
14
+ from typing import TYPE_CHECKING, Any, BinaryIO, Dict, Iterable, Iterator, List, Literal, Optional, Tuple, Union
15
+
16
+ from tqdm.contrib.concurrent import thread_map
17
+
18
+ from . import constants
19
+ from .errors import EntryNotFoundError, HfHubHTTPError, XetAuthorizationError, XetRefreshTokenError
20
+ from .file_download import hf_hub_url
21
+ from .lfs import UploadInfo, lfs_upload, post_lfs_batch_info
22
+ from .utils import (
23
+ FORBIDDEN_FOLDERS,
24
+ XetTokenType,
25
+ are_progress_bars_disabled,
26
+ chunk_iterable,
27
+ fetch_xet_connection_info_from_repo_info,
28
+ get_session,
29
+ hf_raise_for_status,
30
+ logging,
31
+ sha,
32
+ tqdm_stream_file,
33
+ validate_hf_hub_args,
34
+ )
35
+ from .utils import tqdm as hf_tqdm
36
+ from .utils._runtime import is_xet_available
37
+
38
+
39
+ if TYPE_CHECKING:
40
+ from .hf_api import RepoFile
41
+
42
+
43
+ logger = logging.get_logger(__name__)
44
+
45
+
46
+ UploadMode = Literal["lfs", "regular"]
47
+
48
+ # Max is 1,000 per request on the Hub for HfApi.get_paths_info
49
+ # Otherwise we get:
50
+ # HfHubHTTPError: 413 Client Error: Payload Too Large for url: https://huggingface.co/api/datasets/xxx (Request ID: xxx)\n\ntoo many parameters
51
+ # See https://github.com/huggingface/huggingface_hub/issues/1503
52
+ FETCH_LFS_BATCH_SIZE = 500
53
+
54
+ UPLOAD_BATCH_MAX_NUM_FILES = 256
55
+
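+ # A short sketch (hypothetical, not part of this module) of how such a limit is
+ # enforced: work is split into fixed-size chunks so no single Hub call exceeds the
+ # server-side payload limit. `chunk_iterable` is the helper imported above.
+ #
+ #   paths = [f"shard-{i:05d}.bin" for i in range(1200)]  # placeholder paths
+ #   for chunk in chunk_iterable(paths, chunk_size=FETCH_LFS_BATCH_SIZE):
+ #       batch = list(chunk)  # at most 500 paths per `HfApi.get_paths_info` request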
56
+
57
+ @dataclass
58
+ class CommitOperationDelete:
59
+ """
60
+ Data structure holding necessary info to delete a file or a folder from a repository
61
+ on the Hub.
62
+
63
+ Args:
64
+ path_in_repo (`str`):
65
+ Relative filepath in the repo, for example: `"checkpoints/1fec34a/weights.bin"`
66
+ for a file or `"checkpoints/1fec34a/"` for a folder.
67
+ is_folder (`bool` or `Literal["auto"]`, *optional*):
68
+ Whether the delete operation applies to a folder or not. If "auto", the path
69
+ type (file or folder) is guessed automatically by checking whether the path ends with
70
+ a "/" (folder) or not (file). To explicitly set the path type, you can set
71
+ `is_folder=True` or `is_folder=False`.
72
+ """
73
+
74
+ path_in_repo: str
75
+ is_folder: Union[bool, Literal["auto"]] = "auto"
76
+
77
+ def __post_init__(self):
78
+ self.path_in_repo = _validate_path_in_repo(self.path_in_repo)
79
+
80
+ if self.is_folder == "auto":
81
+ self.is_folder = self.path_in_repo.endswith("/")
82
+ if not isinstance(self.is_folder, bool):
83
+ raise ValueError(
84
+ f"Wrong value for `is_folder`. Must be one of [`True`, `False`, `'auto'`]. Got '{self.is_folder}'."
85
+ )
86
+
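+ # Illustration (a sketch, not part of this module) of the "auto" detection described
+ # above: a trailing "/" marks the path as a folder.
+ #
+ #   assert CommitOperationDelete("checkpoints/1fec34a/").is_folder is True
+ #   assert CommitOperationDelete("checkpoints/1fec34a/weights.bin").is_folder is False
+ #   CommitOperationDelete("logs", is_folder=True)  # explicit override of the guess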
87
+
88
+ @dataclass
89
+ class CommitOperationCopy:
90
+ """
91
+ Data structure holding necessary info to copy a file in a repository on the Hub.
92
+
93
+ Limitations:
94
+ - Only LFS files can be copied. To copy a regular file, you need to download it locally and re-upload it
95
+ - Cross-repository copies are not supported.
96
+
97
+ Note: you can combine a [`CommitOperationCopy`] and a [`CommitOperationDelete`] to rename an LFS file on the Hub.
98
+
99
+ Args:
100
+ src_path_in_repo (`str`):
101
+ Relative filepath in the repo of the file to be copied, e.g. `"checkpoints/1fec34a/weights.bin"`.
102
+ path_in_repo (`str`):
103
+ Relative filepath in the repo where to copy the file, e.g. `"checkpoints/1fec34a/weights_copy.bin"`.
104
+ src_revision (`str`, *optional*):
105
+ The git revision of the file to be copied. Can be any valid git revision.
106
+ Default to the target commit revision.
107
+ """
108
+
109
+ src_path_in_repo: str
110
+ path_in_repo: str
111
+ src_revision: Optional[str] = None
112
+ # set to the OID of the file to be copied if it has already been uploaded
113
+ # useful to determine if a commit will be empty or not.
114
+ _src_oid: Optional[str] = None
115
+ # set to the OID of the file to copy to if it has already been uploaded
116
+ # useful to determine if a commit will be empty or not.
117
+ _dest_oid: Optional[str] = None
118
+
119
+ def __post_init__(self):
120
+ self.src_path_in_repo = _validate_path_in_repo(self.src_path_in_repo)
121
+ self.path_in_repo = _validate_path_in_repo(self.path_in_repo)
122
+
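+ # A sketch (not part of this module) of the rename pattern mentioned in the docstring
+ # above: pairing a copy with a delete in one commit renames an LFS file without
+ # re-uploading its content. The repo id is a placeholder.
+ #
+ #   rename_ops = [
+ #       CommitOperationCopy(src_path_in_repo="weights.bin", path_in_repo="model.bin"),
+ #       CommitOperationDelete(path_in_repo="weights.bin"),
+ #   ]
+ #   HfApi().create_commit(repo_id="user/repo", operations=rename_ops, commit_message="Rename weights")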
123
+
124
+ @dataclass
125
+ class CommitOperationAdd:
126
+ """
127
+ Data structure holding necessary info to upload a file to a repository on the Hub.
128
+
129
+ Args:
130
+ path_in_repo (`str`):
131
+ Relative filepath in the repo, for example: `"checkpoints/1fec34a/weights.bin"`
132
+ path_or_fileobj (`str`, `Path`, `bytes`, or `BinaryIO`):
133
+ Either:
134
+ - a path to a local file (as `str` or `pathlib.Path`) to upload
135
+ - a buffer of bytes (`bytes`) holding the content of the file to upload
136
+ - a "file object" (subclass of `io.BufferedIOBase`), typically obtained
137
+ with `open(path, "rb")`. It must support `seek()` and `tell()` methods.
138
+
139
+ Raises:
140
+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
141
+ If `path_or_fileobj` is not one of `str`, `Path`, `bytes` or `io.BufferedIOBase`.
142
+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
143
+ If `path_or_fileobj` is a `str` or `Path` but not a path to an existing file.
144
+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
145
+ If `path_or_fileobj` is a `io.BufferedIOBase` but it doesn't support both
146
+ `seek()` and `tell()`.
147
+ """
148
+
149
+ path_in_repo: str
150
+ path_or_fileobj: Union[str, Path, bytes, BinaryIO]
151
+ upload_info: UploadInfo = field(init=False, repr=False)
152
+
153
+ # Internal attributes
154
+
155
+ # set to "lfs" or "regular" once known
156
+ _upload_mode: Optional[UploadMode] = field(init=False, repr=False, default=None)
157
+
158
+ # set to True if .gitignore rules prevent the file from being uploaded as LFS
159
+ # (server-side check)
160
+ _should_ignore: Optional[bool] = field(init=False, repr=False, default=None)
161
+
162
+ # set to the remote OID of the file if it has already been uploaded
163
+ # useful to determine if a commit will be empty or not
164
+ _remote_oid: Optional[str] = field(init=False, repr=False, default=None)
165
+
166
+ # set to True once the file has been uploaded as LFS
167
+ _is_uploaded: bool = field(init=False, repr=False, default=False)
168
+
169
+ # set to True once the file has been committed
170
+ _is_committed: bool = field(init=False, repr=False, default=False)
171
+
172
+ def __post_init__(self) -> None:
173
+ """Validates `path_or_fileobj` and compute `upload_info`."""
174
+ self.path_in_repo = _validate_path_in_repo(self.path_in_repo)
175
+
176
+ # Validate `path_or_fileobj` value
177
+ if isinstance(self.path_or_fileobj, Path):
178
+ self.path_or_fileobj = str(self.path_or_fileobj)
179
+ if isinstance(self.path_or_fileobj, str):
180
+ path_or_fileobj = os.path.normpath(os.path.expanduser(self.path_or_fileobj))
181
+ if not os.path.isfile(path_or_fileobj):
182
+ raise ValueError(f"Provided path: '{path_or_fileobj}' is not a file on the local file system")
183
+ elif not isinstance(self.path_or_fileobj, (io.BufferedIOBase, bytes)):
184
+ # ^^ Inspired from: https://stackoverflow.com/questions/44584829/how-to-determine-if-file-is-opened-in-binary-or-text-mode
185
+ raise ValueError(
186
+ "path_or_fileobj must be either an instance of str, bytes or"
187
+ " io.BufferedIOBase. If you passed a file-like object, make sure it is"
188
+ " in binary mode."
189
+ )
190
+ if isinstance(self.path_or_fileobj, io.BufferedIOBase):
191
+ try:
192
+ self.path_or_fileobj.tell()
193
+ self.path_or_fileobj.seek(0, os.SEEK_CUR)
194
+ except (OSError, AttributeError) as exc:
195
+ raise ValueError(
196
+ "path_or_fileobj is a file-like object but does not implement seek() and tell()"
197
+ ) from exc
198
+
199
+ # Compute "upload_info" attribute
200
+ if isinstance(self.path_or_fileobj, str):
201
+ self.upload_info = UploadInfo.from_path(self.path_or_fileobj)
202
+ elif isinstance(self.path_or_fileobj, bytes):
203
+ self.upload_info = UploadInfo.from_bytes(self.path_or_fileobj)
204
+ else:
205
+ self.upload_info = UploadInfo.from_fileobj(self.path_or_fileobj)
206
+
207
+ @contextmanager
208
+ def as_file(self, with_tqdm: bool = False) -> Iterator[BinaryIO]:
209
+ """
210
+ A context manager that yields a file-like object for reading the underlying
211
+ data behind `path_or_fileobj`.
212
+
213
+ Args:
214
+ with_tqdm (`bool`, *optional*, defaults to `False`):
215
+ If True, iterating over the file object will display a progress bar. Only
216
+ works when `path_or_fileobj` is a path to a local file; raw bytes and buffers
217
+ are not supported.
218
+
219
+ Example:
220
+
221
+ ```python
222
+ >>> operation = CommitOperationAdd(
223
+ ... path_in_repo="remote/dir/weights.h5",
224
+ ... path_or_fileobj="./local/weights.h5",
225
+ ... )
227
+
228
+ >>> with operation.as_file() as file:
229
+ ... content = file.read()
230
+
231
+ >>> with operation.as_file(with_tqdm=True) as file:
232
+ ... while True:
233
+ ... data = file.read(1024)
234
+ ... if not data:
235
+ ... break
236
+ config.json: 100%|█████████████████████████| 8.19k/8.19k [00:02<00:00, 3.72kB/s]
237
+
238
+ >>> with operation.as_file(with_tqdm=True) as file:
239
+ ... requests.put(..., data=file)
240
+ config.json: 100%|█████████████████████████| 8.19k/8.19k [00:02<00:00, 3.72kB/s]
241
+ ```
242
+ """
243
+ if isinstance(self.path_or_fileobj, (str, Path)):
244
+ if with_tqdm:
245
+ with tqdm_stream_file(self.path_or_fileobj) as file:
246
+ yield file
247
+ else:
248
+ with open(self.path_or_fileobj, "rb") as file:
249
+ yield file
250
+ elif isinstance(self.path_or_fileobj, bytes):
251
+ yield io.BytesIO(self.path_or_fileobj)
252
+ elif isinstance(self.path_or_fileobj, io.BufferedIOBase):
253
+ prev_pos = self.path_or_fileobj.tell()
254
+ yield self.path_or_fileobj
255
+ self.path_or_fileobj.seek(prev_pos, io.SEEK_SET)
256
+
257
+ def b64content(self) -> bytes:
258
+ """
259
+ The base64-encoded content of `path_or_fileobj`
260
+
261
+ Returns: `bytes`
262
+ """
263
+ with self.as_file() as file:
264
+ return base64.b64encode(file.read())
265
+
266
+ @property
267
+ def _local_oid(self) -> Optional[str]:
268
+ """Return the OID of the local file.
269
+
270
+ This OID is then compared to `self._remote_oid` to check if the file has changed compared to the remote one.
271
+ If the file did not change, we won't upload it again to prevent empty commits.
272
+
273
+ For LFS files, the OID corresponds to the SHA256 of the file content (used as the LFS ref).
274
+ For regular files, the OID corresponds to the SHA1 of the file content.
275
+ Note: this is slightly different from git OID computation, since the OID of an LFS file is usually the git-SHA1 of the
276
+ pointer file content (not the actual file content). However, using the SHA256 is enough to detect changes
277
+ and is more convenient client-side.
278
+ """
279
+ if self._upload_mode is None:
280
+ return None
281
+ elif self._upload_mode == "lfs":
282
+ return self.upload_info.sha256.hex()
283
+ else:
284
+ # Regular file => compute sha1
285
+ # => no need to read by chunk since the file is guaranteed to be <=5MB.
286
+ with self.as_file() as file:
287
+ return sha.git_hash(file.read())
288
+
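+ # A sketch (not part of this module) of the two OID flavors described above, assuming
+ # `data` holds the full file content as bytes:
+ #
+ #   import hashlib
+ #   lfs_oid = hashlib.sha256(data).hexdigest()  # LFS files: SHA256 of the raw content
+ #   git_oid = hashlib.sha1(b"blob %d\x00" % len(data) + data).hexdigest()  # regular files: git blob SHA1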
289
+
290
+ def _validate_path_in_repo(path_in_repo: str) -> str:
291
+ # Validate `path_in_repo` value to prevent a server-side issue
292
+ if path_in_repo.startswith("/"):
293
+ path_in_repo = path_in_repo[1:]
294
+ if path_in_repo == "." or path_in_repo == ".." or path_in_repo.startswith("../"):
295
+ raise ValueError(f"Invalid `path_in_repo` in CommitOperation: '{path_in_repo}'")
296
+ if path_in_repo.startswith("./"):
297
+ path_in_repo = path_in_repo[2:]
298
+ for forbidden in FORBIDDEN_FOLDERS:
299
+ if any(part == forbidden for part in path_in_repo.split("/")):
300
+ raise ValueError(
301
+ f"Invalid `path_in_repo` in CommitOperation: cannot update files under a '{forbidden}/' folder (path:"
302
+ f" '{path_in_repo}')."
303
+ )
304
+ return path_in_repo
305
+
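+ # Examples (a sketch, not part of this module) of the normalization and checks above,
+ # assuming ".git" is among FORBIDDEN_FOLDERS:
+ #
+ #   assert _validate_path_in_repo("/weights.bin") == "weights.bin"            # leading "/" stripped
+ #   assert _validate_path_in_repo("./sub/weights.bin") == "sub/weights.bin"   # leading "./" stripped
+ #   _validate_path_in_repo("../weights.bin")  # raises ValueError (path traversal)
+ #   _validate_path_in_repo(".git/config")     # raises ValueError (forbidden folder)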
306
+
307
+ CommitOperation = Union[CommitOperationAdd, CommitOperationCopy, CommitOperationDelete]
308
+
309
+
310
+ def _warn_on_overwriting_operations(operations: List[CommitOperation]) -> None:
311
+ """
312
+ Warn user when a list of operations is expected to overwrite itself in a single
313
+ commit.
314
+
315
+ Rules:
316
+ - If a filepath is updated by multiple `CommitOperationAdd` operations, a warning
317
+ message is triggered.
318
+ - If a filepath is updated at least once by a `CommitOperationAdd` and then deleted
319
+ by a `CommitOperationDelete`, a warning is triggered.
320
+ - If a `CommitOperationDelete` deletes a filepath that is then updated by a
321
+ `CommitOperationAdd`, no warning is triggered. This is usually useless (no need to
322
+ delete before upload) but can happen if a user deletes an entire folder and then
323
+ adds new files to it.
324
+ """
325
+ nb_additions_per_path: Dict[str, int] = defaultdict(int)
326
+ for operation in operations:
327
+ path_in_repo = operation.path_in_repo
328
+ if isinstance(operation, CommitOperationAdd):
329
+ if nb_additions_per_path[path_in_repo] > 0:
330
+ warnings.warn(
331
+ "About to update multiple times the same file in the same commit:"
332
+ f" '{path_in_repo}'. This can cause undesired inconsistencies in"
333
+ " your repo."
334
+ )
335
+ nb_additions_per_path[path_in_repo] += 1
336
+ for parent in PurePosixPath(path_in_repo).parents:
337
+ # Also keep track of number of updated files per folder
338
+ # => warns if deleting a folder overwrites some contained files
339
+ nb_additions_per_path[str(parent)] += 1
340
+ if isinstance(operation, CommitOperationDelete):
341
+ if nb_additions_per_path[str(PurePosixPath(path_in_repo))] > 0:
342
+ if operation.is_folder:
343
+ warnings.warn(
344
+ "About to delete a folder containing files that have just been"
345
+ f" updated within the same commit: '{path_in_repo}'. This can"
346
+ " cause undesired inconsistencies in your repo."
347
+ )
348
+ else:
349
+ warnings.warn(
350
+ "About to delete a file that have just been updated within the"
351
+ f" same commit: '{path_in_repo}'. This can cause undesired"
352
+ " inconsistencies in your repo."
353
+ )
354
+
355
+
356
+ @validate_hf_hub_args
357
+ def _upload_files(
358
+ *,
359
+ additions: List[CommitOperationAdd],
360
+ repo_type: str,
361
+ repo_id: str,
362
+ headers: Dict[str, str],
363
+ endpoint: Optional[str] = None,
364
+ num_threads: int = 5,
365
+ revision: Optional[str] = None,
366
+ create_pr: Optional[bool] = None,
367
+ ):
368
+ """
369
+ Negotiates per-file transfer (LFS vs Xet) and uploads in batches.
370
+ """
371
+ xet_additions: List[CommitOperationAdd] = []
372
+ lfs_actions: List[Dict] = []
373
+ lfs_oid2addop: Dict[str, CommitOperationAdd] = {}
374
+
375
+ for chunk in chunk_iterable(additions, chunk_size=UPLOAD_BATCH_MAX_NUM_FILES):
376
+ chunk_list = list(chunk)
377
+
378
+ transfers: List[str] = ["basic", "multipart"]
379
+ has_buffered_io_data = any(isinstance(op.path_or_fileobj, io.BufferedIOBase) for op in chunk_list)
380
+ if is_xet_available():
381
+ if not has_buffered_io_data:
382
+ transfers.append("xet")
383
+ else:
384
+ logger.warning(
385
+ "Uploading files as a binary IO buffer is not supported by Xet Storage. "
386
+ "Falling back to HTTP upload."
387
+ )
388
+
389
+ actions_chunk, errors_chunk, chosen_transfer = post_lfs_batch_info(
390
+ upload_infos=[op.upload_info for op in chunk_list],
391
+ repo_id=repo_id,
392
+ repo_type=repo_type,
393
+ revision=revision,
394
+ endpoint=endpoint,
395
+ headers=headers,
396
+ token=None, # already passed in 'headers'
397
+ transfers=transfers,
398
+ )
399
+ if errors_chunk:
400
+ message = "\n".join(
401
+ [
402
+ f"Encountered error for file with OID {err.get('oid')}: `{err.get('error', {}).get('message')}"
403
+ for err in errors_chunk
404
+ ]
405
+ )
406
+ raise ValueError(f"LFS batch API returned errors:\n{message}")
407
+
408
+ # Use the Xet transfer only if the server selected it *and* we actually offered it. If the server
409
+ # returns a transfer we didn't offer (e.g. "xet" while uploading from a BytesIO), fall back to LFS for this chunk.
410
+ if chosen_transfer == "xet" and ("xet" in transfers):
411
+ xet_additions.extend(chunk_list)
412
+ else:
413
+ lfs_actions.extend(actions_chunk)
414
+ for op in chunk_list:
415
+ lfs_oid2addop[op.upload_info.sha256.hex()] = op
416
+
417
+ if len(lfs_actions) > 0:
418
+ _upload_lfs_files(
419
+ actions=lfs_actions,
420
+ oid2addop=lfs_oid2addop,
421
+ headers=headers,
422
+ endpoint=endpoint,
423
+ num_threads=num_threads,
424
+ )
425
+
426
+ if len(xet_additions) > 0:
427
+ _upload_xet_files(
428
+ additions=xet_additions,
429
+ repo_type=repo_type,
430
+ repo_id=repo_id,
431
+ headers=headers,
432
+ endpoint=endpoint,
433
+ revision=revision,
434
+ create_pr=create_pr,
435
+ )
436
+
437
+
438
+ @validate_hf_hub_args
439
+ def _upload_lfs_files(
440
+ *,
441
+ actions: List[Dict],
442
+ oid2addop: Dict[str, CommitOperationAdd],
443
+ headers: Dict[str, str],
444
+ endpoint: Optional[str] = None,
445
+ num_threads: int = 5,
446
+ ):
447
+ """
448
+ Uploads the content of `additions` to the Hub using the large file storage protocol.
449
+
450
+ Relevant external documentation:
451
+ - LFS Batch API: https://github.com/git-lfs/git-lfs/blob/main/docs/api/batch.md
452
+
453
+ Args:
454
+ actions (`List[Dict]`):
455
+ LFS batch actions returned by the server.
456
+ oid2addop (`Dict[str, CommitOperationAdd]`):
457
+ A dictionary mapping the OID of the file to the corresponding `CommitOperationAdd` object.
458
+ headers (`Dict[str, str]`):
459
+ Headers to use for the request, including authorization headers and user agent.
460
+ endpoint (`str`, *optional*):
461
+ The endpoint to use for the request. Defaults to `constants.ENDPOINT`.
462
+ num_threads (`int`, *optional*):
463
+ The number of concurrent threads to use when uploading. Defaults to 5.
464
+
465
+ Raises:
481
+ [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
482
+ If an upload failed for any reason
483
+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
484
+ If the server returns malformed responses
485
+ [`HTTPError`](https://requests.readthedocs.io/en/latest/api/#requests.HTTPError)
486
+ If the LFS batch endpoint returned an HTTP error.
487
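+
+     A single batch `action` entry typically looks like this (per the LFS Batch API spec linked above; illustrative):
+
+     ```py
+     {
+         "oid": "<sha256 hex digest>",
+         "size": 123456,
+         "actions": {"upload": {"href": "https://...", "header": {"...": "..."}}},
+     }
+     ```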
+ """
488
+ # Filter out files already present upstream
489
+ filtered_actions = []
490
+ for action in actions:
491
+ if action.get("actions") is None:
492
+ logger.debug(
493
+ f"Content of file {oid2addop[action['oid']].path_in_repo} is already present upstream - skipping upload."
494
+ )
495
+ else:
496
+ filtered_actions.append(action)
497
+
498
+ # Upload according to server-provided actions
499
+ def _wrapped_lfs_upload(batch_action) -> None:
500
+ # Resolve the operation before the `try` block so it is always bound in the `except` clause below.
+ operation = oid2addop[batch_action["oid"]]
501
+ try:
502
+ lfs_upload(operation=operation, lfs_batch_action=batch_action, headers=headers, endpoint=endpoint)
503
+ except Exception as exc:
504
+ raise RuntimeError(f"Error while uploading '{operation.path_in_repo}' to the Hub.") from exc
505
+
506
+ if constants.HF_HUB_ENABLE_HF_TRANSFER:
507
+ logger.debug(f"Uploading {len(filtered_actions)} LFS files to the Hub using `hf_transfer`.")
508
+ for action in hf_tqdm(filtered_actions, name="huggingface_hub.lfs_upload"):
509
+ _wrapped_lfs_upload(action)
510
+ elif len(filtered_actions) == 1:
511
+ logger.debug("Uploading 1 LFS file to the Hub")
512
+ _wrapped_lfs_upload(filtered_actions[0])
513
+ else:
514
+ logger.debug(
515
+ f"Uploading {len(filtered_actions)} LFS files to the Hub using up to {num_threads} threads concurrently"
516
+ )
517
+ thread_map(
518
+ _wrapped_lfs_upload,
519
+ filtered_actions,
520
+ desc=f"Upload {len(filtered_actions)} LFS files",
521
+ max_workers=num_threads,
522
+ tqdm_class=hf_tqdm,
523
+ )
524
+
525
+
526
+ @validate_hf_hub_args
527
+ def _upload_xet_files(
528
+ *,
529
+ additions: List[CommitOperationAdd],
530
+ repo_type: str,
531
+ repo_id: str,
532
+ headers: Dict[str, str],
533
+ endpoint: Optional[str] = None,
534
+ revision: Optional[str] = None,
535
+ create_pr: Optional[bool] = None,
536
+ ):
537
+ """
538
+ Uploads the content of `additions` to the Hub using the xet storage protocol.
539
+ This chunks the files and deduplicates the chunks before uploading them to xetcas storage.
540
+
541
+ Args:
542
+ additions (`List` of `CommitOperationAdd`):
543
+ The files to be uploaded.
544
+ repo_type (`str`):
545
+ Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
546
+ repo_id (`str`):
547
+ A namespace (user or an organization) and a repo name separated
548
+ by a `/`.
549
+ headers (`Dict[str, str]`):
550
+ Headers to use for the request, including authorization headers and user agent.
551
+ endpoint (`str`, *optional*):
552
+ The endpoint to use for the xetcas service. Defaults to `constants.ENDPOINT`.
553
+ revision (`str`, *optional*):
554
+ The git revision to upload to.
555
+ create_pr (`bool`, *optional*):
556
+ Whether or not to create a Pull Request with that commit.
557
+
558
+ Raises:
559
+ [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
560
+ If an upload failed for any reason.
561
+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
562
+ If the server returns malformed responses or if the user is unauthorized to upload to xet storage.
563
+ [`HTTPError`](https://requests.readthedocs.io/en/latest/api/#requests.HTTPError)
564
+ If the LFS batch endpoint returned an HTTP error.
565
+
566
+ **How it works:**
567
+ The file upload system uses Xet storage, which is a content-addressable storage system that breaks files into chunks
568
+ for efficient storage and transfer.
569
+
570
+ `hf_xet.upload_files` manages uploading files by:
571
+ - Taking a list of file paths to upload
572
+ - Breaking files into smaller chunks for efficient storage
573
+ - Avoiding duplicate storage by recognizing identical chunks across files
574
+ - Connecting to a storage server (CAS server) that manages these chunks
575
+
576
+ The upload process works like this:
577
+ 1. Create a local folder at ~/.cache/huggingface/xet/chunk-cache to store file chunks for reuse.
578
+ 2. Process files in parallel (up to 8 files at once):
579
+ 2.1. Read the file content.
580
+ 2.2. Split the file content into smaller chunks based on content patterns: each chunk gets a unique ID based on what's in it.
581
+ 2.3. For each chunk:
582
+ - Check if it already exists in storage.
583
+ - Skip uploading chunks that already exist.
584
+ 2.4. Group chunks into larger blocks for efficient transfer.
585
+ 2.5. Upload these blocks to the storage server.
586
+ 2.6. Create and upload information about how the file is structured.
587
+ 3. Return reference files that contain information about the uploaded files, which can be used later to download them.
588
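+
+     A minimal call sketch mirroring the function body below (names are illustrative):
+
+     ```py
+     from hf_xet import upload_files
+
+     upload_files(
+         ["model.safetensors"],            # local file paths
+         xet_endpoint,                     # CAS server endpoint
+         (access_token, expiration_unix),  # token info tuple
+         token_refresher,                  # callback returning (token, expiration)
+         progress_callback,                # or None to disable progress reporting
+         "model",                          # repo_type
+     )
+     ```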
+ """
589
+ if len(additions) == 0:
590
+ return
591
+
592
+ # at this point, we know that hf_xet is installed
593
+ from hf_xet import upload_bytes, upload_files
594
+
595
+ from .utils._xet_progress_reporting import XetProgressReporter
596
+
597
+ try:
598
+ xet_connection_info = fetch_xet_connection_info_from_repo_info(
599
+ token_type=XetTokenType.WRITE,
600
+ repo_id=repo_id,
601
+ repo_type=repo_type,
602
+ revision=revision,
603
+ headers=headers,
604
+ endpoint=endpoint,
605
+ params={"create_pr": "1"} if create_pr else None,
606
+ )
607
+ except HfHubHTTPError as e:
608
+ if e.response.status_code == 401:
609
+ raise XetAuthorizationError(
610
+ f"You are unauthorized to upload to xet storage for {repo_type}/{repo_id}. "
611
+ f"Please check that you have configured your access token with write access to the repo."
612
+ ) from e
613
+ raise
614
+
615
+ xet_endpoint = xet_connection_info.endpoint
616
+ access_token_info = (xet_connection_info.access_token, xet_connection_info.expiration_unix_epoch)
617
+
618
+ def token_refresher() -> Tuple[str, int]:
619
+ new_xet_connection = fetch_xet_connection_info_from_repo_info(
620
+ token_type=XetTokenType.WRITE,
621
+ repo_id=repo_id,
622
+ repo_type=repo_type,
623
+ revision=revision,
624
+ headers=headers,
625
+ endpoint=endpoint,
626
+ params={"create_pr": "1"} if create_pr else None,
627
+ )
628
+ if new_xet_connection is None:
629
+ raise XetRefreshTokenError("Failed to refresh xet token")
630
+ return new_xet_connection.access_token, new_xet_connection.expiration_unix_epoch
631
+
632
+ if not are_progress_bars_disabled():
633
+ progress = XetProgressReporter()
634
+ progress_callback = progress.update_progress
635
+ else:
636
+ progress, progress_callback = None, None
637
+
638
+ try:
639
+ all_bytes_ops = [op for op in additions if isinstance(op.path_or_fileobj, bytes)]
640
+ all_paths_ops = [op for op in additions if isinstance(op.path_or_fileobj, (str, Path))]
641
+
642
+ if len(all_paths_ops) > 0:
643
+ all_paths = [str(op.path_or_fileobj) for op in all_paths_ops]
644
+ upload_files(
645
+ all_paths,
646
+ xet_endpoint,
647
+ access_token_info,
648
+ token_refresher,
649
+ progress_callback,
650
+ repo_type,
651
+ )
652
+
653
+ if len(all_bytes_ops) > 0:
654
+ all_bytes = [op.path_or_fileobj for op in all_bytes_ops]
655
+ upload_bytes(
656
+ all_bytes,
657
+ xet_endpoint,
658
+ access_token_info,
659
+ token_refresher,
660
+ progress_callback,
661
+ repo_type,
662
+ )
663
+
664
+ finally:
665
+ if progress is not None:
666
+ progress.close(False)
667
+
668
+ return
669
+
670
+
671
+ def _validate_preupload_info(preupload_info: dict):
672
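+ """Basic sanity check of the `preupload` endpoint response.
+
+ Expected shape (illustrative; extra keys such as `shouldIgnore` and `oid` may be present and are consumed by
+ `_fetch_upload_modes`):
+ `{"files": [{"path": "config.json", "uploadMode": "regular", ...}]}`
+ """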
+ files = preupload_info.get("files")
673
+ if not isinstance(files, list):
674
+ raise ValueError("preupload_info is improperly formatted")
675
+ for file_info in files:
676
+ if not (
677
+ isinstance(file_info, dict)
678
+ and isinstance(file_info.get("path"), str)
679
+ and isinstance(file_info.get("uploadMode"), str)
680
+ and (file_info["uploadMode"] in ("lfs", "regular"))
681
+ ):
682
+ raise ValueError("preupload_info is improperly formatted:")
683
+ return preupload_info
684
+
685
+
686
+ @validate_hf_hub_args
687
+ def _fetch_upload_modes(
688
+ additions: Iterable[CommitOperationAdd],
689
+ repo_type: str,
690
+ repo_id: str,
691
+ headers: Dict[str, str],
692
+ revision: str,
693
+ endpoint: Optional[str] = None,
694
+ create_pr: bool = False,
695
+ gitignore_content: Optional[str] = None,
696
+ ) -> None:
697
+ """
698
+ Requests the Hub "preupload" endpoint to determine whether each input file should be uploaded as a regular git blob,
699
+ as a git LFS blob, or as a XET file. Input `additions` are mutated in-place with the upload mode.
700
+
701
+ Args:
702
+ additions (`Iterable` of :class:`CommitOperationAdd`):
703
+ Iterable of :class:`CommitOperationAdd` describing the files to
704
+ upload to the Hub.
705
+ repo_type (`str`):
706
+ Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
707
+ repo_id (`str`):
708
+ A namespace (user or an organization) and a repo name separated
709
+ by a `/`.
710
+ headers (`Dict[str, str]`):
711
+ Headers to use for the request, including authorization headers and user agent.
712
+ revision (`str`):
713
+ The git revision to upload the files to. Can be any valid git revision.
714
+ gitignore_content (`str`, *optional*):
715
+ The content of the `.gitignore` file to know which files should be ignored. The order of priority
716
+ is to first check if `gitignore_content` is passed, then check if the `.gitignore` file is present
717
+ in the list of files to commit and finally default to the `.gitignore` file already hosted on the Hub
718
+ (if any).
719
+ Raises:
720
+ [`~utils.HfHubHTTPError`]
721
+ If the Hub API returned an error.
722
+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
723
+ If the Hub API response is improperly formatted.
724
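+
+     Example (illustrative, `headers` being pre-built request headers):
+
+     ```py
+     >>> additions = [CommitOperationAdd(path_in_repo="config.json", path_or_fileobj=b"{}")]
+     >>> _fetch_upload_modes(additions, repo_type="model", repo_id="user/repo", headers=headers, revision="main")
+     >>> additions[0]._upload_mode  # mutated in place
+     'regular'
+     ```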
+ """
725
+ endpoint = endpoint if endpoint is not None else constants.ENDPOINT
726
+
727
+ # Fetch upload mode (LFS or regular) chunk by chunk.
728
+ upload_modes: Dict[str, UploadMode] = {}
729
+ should_ignore_info: Dict[str, bool] = {}
730
+ oid_info: Dict[str, Optional[str]] = {}
731
+
732
+ for chunk in chunk_iterable(additions, 256):
733
+ payload: Dict = {
734
+ "files": [
735
+ {
736
+ "path": op.path_in_repo,
737
+ "sample": base64.b64encode(op.upload_info.sample).decode("ascii"),
738
+ "size": op.upload_info.size,
739
+ }
740
+ for op in chunk
741
+ ]
742
+ }
743
+ if gitignore_content is not None:
744
+ payload["gitIgnore"] = gitignore_content
745
+
746
+ resp = get_session().post(
747
+ f"{endpoint}/api/{repo_type}s/{repo_id}/preupload/{revision}",
748
+ json=payload,
749
+ headers=headers,
750
+ params={"create_pr": "1"} if create_pr else None,
751
+ )
752
+ hf_raise_for_status(resp)
753
+ preupload_info = _validate_preupload_info(resp.json())
754
+ upload_modes.update(**{file["path"]: file["uploadMode"] for file in preupload_info["files"]})
755
+ should_ignore_info.update(**{file["path"]: file["shouldIgnore"] for file in preupload_info["files"]})
756
+ oid_info.update(**{file["path"]: file.get("oid") for file in preupload_info["files"]})
757
+
758
+ # Set upload mode for each addition operation
759
+ for addition in additions:
760
+ addition._upload_mode = upload_modes[addition.path_in_repo]
761
+ addition._should_ignore = should_ignore_info[addition.path_in_repo]
762
+ addition._remote_oid = oid_info[addition.path_in_repo]
763
+
764
+ # Empty files cannot be uploaded as LFS (S3 would fail with a 501 Not Implemented)
765
+ # => empty files are uploaded as "regular" to still allow users to commit them.
766
+ for addition in additions:
767
+ if addition.upload_info.size == 0:
768
+ addition._upload_mode = "regular"
769
+
770
+
771
+ @validate_hf_hub_args
772
+ def _fetch_files_to_copy(
773
+ copies: Iterable[CommitOperationCopy],
774
+ repo_type: str,
775
+ repo_id: str,
776
+ headers: Dict[str, str],
777
+ revision: str,
778
+ endpoint: Optional[str] = None,
779
+ ) -> Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]]:
780
+ """
781
+ Fetch information about the files to copy.
782
+
783
+ For LFS files, we only need their metadata (file size and sha256) while for regular files
784
+ we need to download the raw content from the Hub.
785
+
786
+ Args:
787
+ copies (`Iterable` of :class:`CommitOperationCopy`):
788
+ Iterable of :class:`CommitOperationCopy` describing the files to
789
+ copy on the Hub.
790
+ repo_type (`str`):
791
+ Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
792
+ repo_id (`str`):
793
+ A namespace (user or an organization) and a repo name separated
794
+ by a `/`.
795
+ headers (`Dict[str, str]`):
796
+ Headers to use for the request, including authorization headers and user agent.
797
+ revision (`str`):
798
+ The git revision to upload the files to. Can be any valid git revision.
799
+
800
+ Returns: `Dict[Tuple[str, Optional[str]], Union[RepoFile, bytes]]`
801
+ Key is the file path and revision of the file to copy.
802
+ Value is the raw content as bytes (for regular files) or the file information as a RepoFile (for LFS files).
803
+
804
+ Raises:
805
+ [`~utils.HfHubHTTPError`]
806
+ If the Hub API returned an error.
807
+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
808
+ If the Hub API response is improperly formatted.
809
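+
+     The returned mapping looks like this (illustrative):
+
+     ```py
+     {
+         ("weights.bin", None): RepoFile(...),      # LFS file -> metadata only
+         ("README.md", "refs/pr/1"): b"# Title",    # regular file -> raw content
+     }
+     ```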
+ """
810
+ from .hf_api import HfApi, RepoFolder
811
+
812
+ hf_api = HfApi(endpoint=endpoint, headers=headers)
813
+ files_to_copy: Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]] = {}
814
+ # Store (path, revision) -> oid mapping
815
+ oid_info: Dict[Tuple[str, Optional[str]], Optional[str]] = {}
816
+ # 1. Fetch OIDs for destination paths in batches.
817
+ dest_paths = [op.path_in_repo for op in copies]
818
+ for offset in range(0, len(dest_paths), FETCH_LFS_BATCH_SIZE):
819
+ dest_repo_files = hf_api.get_paths_info(
820
+ repo_id=repo_id,
821
+ paths=dest_paths[offset : offset + FETCH_LFS_BATCH_SIZE],
822
+ revision=revision,
823
+ repo_type=repo_type,
824
+ )
825
+ for file in dest_repo_files:
826
+ if not isinstance(file, RepoFolder):
827
+ oid_info[(file.path, revision)] = file.blob_id
828
+
829
+ # 2. Group by source revision and fetch source file info in batches.
830
+ for src_revision, operations in groupby(copies, key=lambda op: op.src_revision):
831
+ operations = list(operations) # type: ignore
832
+ src_paths = [op.src_path_in_repo for op in operations]
833
+ for offset in range(0, len(src_paths), FETCH_LFS_BATCH_SIZE):
834
+ src_repo_files = hf_api.get_paths_info(
835
+ repo_id=repo_id,
836
+ paths=src_paths[offset : offset + FETCH_LFS_BATCH_SIZE],
837
+ revision=src_revision or revision,
838
+ repo_type=repo_type,
839
+ )
840
+
841
+ for src_repo_file in src_repo_files:
842
+ if isinstance(src_repo_file, RepoFolder):
843
+ raise NotImplementedError("Copying a folder is not implemented.")
844
+ oid_info[(src_repo_file.path, src_revision)] = src_repo_file.blob_id
845
+ # If it's an LFS file, store the RepoFile object. Otherwise, download raw bytes.
846
+ if src_repo_file.lfs:
847
+ files_to_copy[(src_repo_file.path, src_revision)] = src_repo_file
848
+ else:
849
+ # TODO: (optimization) download regular files to copy concurrently
850
+ url = hf_hub_url(
851
+ endpoint=endpoint,
852
+ repo_type=repo_type,
853
+ repo_id=repo_id,
854
+ revision=src_revision or revision,
855
+ filename=src_repo_file.path,
856
+ )
857
+ response = get_session().get(url, headers=headers)
858
+ hf_raise_for_status(response)
859
+ files_to_copy[(src_repo_file.path, src_revision)] = response.content
860
+ # 3. Ensure all operations found a corresponding file in the Hub
861
+ # and track src/dest OIDs for each operation.
862
+ for operation in operations:
863
+ if (operation.src_path_in_repo, src_revision) not in files_to_copy:
864
+ raise EntryNotFoundError(
865
+ f"Cannot copy {operation.src_path_in_repo} at revision "
866
+ f"{src_revision or revision}: file is missing on repo."
867
+ )
868
+ operation._src_oid = oid_info.get((operation.src_path_in_repo, operation.src_revision))
869
+ operation._dest_oid = oid_info.get((operation.path_in_repo, revision))
870
+ return files_to_copy
871
+
872
+
873
+ def _prepare_commit_payload(
874
+ operations: Iterable[CommitOperation],
875
+ files_to_copy: Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]],
876
+ commit_message: str,
877
+ commit_description: Optional[str] = None,
878
+ parent_commit: Optional[str] = None,
879
+ ) -> Iterable[Dict[str, Any]]:
880
+ """
881
+ Builds the payload to POST to the `/commit` API of the Hub.
882
+
883
+ Payload is returned as an iterator so that it can be streamed as ndjson in the
884
+ POST request.
885
+
886
+ For more information, see:
887
+ - https://github.com/huggingface/huggingface_hub/issues/1085#issuecomment-1265208073
888
+ - http://ndjson.org/
889
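+
+     Yielded items look like this (illustrative):
+
+     ```py
+     {"key": "header", "value": {"summary": "Upload weights", "description": ""}}
+     {"key": "lfsFile", "value": {"path": "model.bin", "algo": "sha256", "oid": "<sha256 hex>", "size": 123}}
+     {"key": "deletedFile", "value": {"path": "old.bin"}}
+     ```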
+ """
890
+ commit_description = commit_description if commit_description is not None else ""
891
+
892
+ # 1. Send a header item with the commit metadata
893
+ header_value = {"summary": commit_message, "description": commit_description}
894
+ if parent_commit is not None:
895
+ header_value["parentCommit"] = parent_commit
896
+ yield {"key": "header", "value": header_value}
897
+
898
+ nb_ignored_files = 0
899
+
900
+ # 2. Send operations, one per line
901
+ for operation in operations:
902
+ # Skip ignored files
903
+ if isinstance(operation, CommitOperationAdd) and operation._should_ignore:
904
+ logger.debug(f"Skipping file '{operation.path_in_repo}' in commit (ignored by gitignore file).")
905
+ nb_ignored_files += 1
906
+ continue
907
+
908
+ # 2.a. Case adding a regular file
909
+ if isinstance(operation, CommitOperationAdd) and operation._upload_mode == "regular":
910
+ yield {
911
+ "key": "file",
912
+ "value": {
913
+ "content": operation.b64content().decode(),
914
+ "path": operation.path_in_repo,
915
+ "encoding": "base64",
916
+ },
917
+ }
918
+ # 2.b. Case adding an LFS file
919
+ elif isinstance(operation, CommitOperationAdd) and operation._upload_mode == "lfs":
920
+ yield {
921
+ "key": "lfsFile",
922
+ "value": {
923
+ "path": operation.path_in_repo,
924
+ "algo": "sha256",
925
+ "oid": operation.upload_info.sha256.hex(),
926
+ "size": operation.upload_info.size,
927
+ },
928
+ }
929
+ # 2.c. Case deleting a file or folder
930
+ elif isinstance(operation, CommitOperationDelete):
931
+ yield {
932
+ "key": "deletedFolder" if operation.is_folder else "deletedFile",
933
+ "value": {"path": operation.path_in_repo},
934
+ }
935
+ # 2.d. Case copying a file or folder
936
+ elif isinstance(operation, CommitOperationCopy):
937
+ file_to_copy = files_to_copy[(operation.src_path_in_repo, operation.src_revision)]
938
+ if isinstance(file_to_copy, bytes):
939
+ yield {
940
+ "key": "file",
941
+ "value": {
942
+ "content": base64.b64encode(file_to_copy).decode(),
943
+ "path": operation.path_in_repo,
944
+ "encoding": "base64",
945
+ },
946
+ }
947
+ elif file_to_copy.lfs:
948
+ yield {
949
+ "key": "lfsFile",
950
+ "value": {
951
+ "path": operation.path_in_repo,
952
+ "algo": "sha256",
953
+ "oid": file_to_copy.lfs.sha256,
954
+ },
955
+ }
956
+ else:
957
+ raise ValueError(
958
+ "Malformed files_to_copy (should be raw file content as bytes or RepoFile objects with LFS info."
959
+ )
960
+ # 2.e. Never expected to happen
961
+ else:
962
+ raise ValueError(
963
+ f"Unknown operation to commit. Operation: {operation}. Upload mode:"
964
+ f" {getattr(operation, '_upload_mode', None)}"
965
+ )
966
+
967
+ if nb_ignored_files > 0:
968
+ logger.info(f"Skipped {nb_ignored_files} file(s) in commit (ignored by gitignore file).")
venv/lib/python3.13/site-packages/huggingface_hub/_commit_scheduler.py ADDED
@@ -0,0 +1,350 @@
1
+ import atexit
2
+ import logging
3
+ import os
4
+ import time
5
+ from concurrent.futures import Future
6
+ from dataclasses import dataclass
7
+ from io import SEEK_END, SEEK_SET, BytesIO
8
+ from pathlib import Path
9
+ from threading import Lock, Thread
10
+ from typing import Dict, List, Optional, Union
11
+
12
+ from .hf_api import DEFAULT_IGNORE_PATTERNS, CommitInfo, CommitOperationAdd, HfApi
13
+ from .utils import filter_repo_objects
14
+
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ @dataclass(frozen=True)
20
+ class _FileToUpload:
21
+ """Temporary dataclass to store info about files to upload. Not meant to be used directly."""
22
+
23
+ local_path: Path
24
+ path_in_repo: str
25
+ size_limit: int
26
+ last_modified: float
27
+
28
+
29
+ class CommitScheduler:
30
+ """
31
+ Scheduler to upload a local folder to the Hub at regular intervals (e.g. push to hub every 5 minutes).
32
+
33
+ The recommended way to use the scheduler is to use it as a context manager. This ensures that the scheduler is
34
+ properly stopped and the last commit is triggered when the script ends. The scheduler can also be stopped manually
35
+ with the `stop` method. Check out the [upload guide](https://huggingface.co/docs/huggingface_hub/guides/upload#scheduled-uploads)
36
+ to learn more about how to use it.
37
+
38
+ Args:
39
+ repo_id (`str`):
40
+ The id of the repo to commit to.
41
+ folder_path (`str` or `Path`):
42
+ Path to the local folder to upload regularly.
43
+ every (`int` or `float`, *optional*):
44
+ The number of minutes between each commit. Defaults to 5 minutes.
45
+ path_in_repo (`str`, *optional*):
46
+ Relative path of the directory in the repo, for example: `"checkpoints/"`. Defaults to the root folder
47
+ of the repository.
48
+ repo_type (`str`, *optional*):
49
+ The type of the repo to commit to. Defaults to `model`.
50
+ revision (`str`, *optional*):
51
+ The revision of the repo to commit to. Defaults to `main`.
52
+ private (`bool`, *optional*):
53
+ Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists.
54
+ token (`str`, *optional*):
55
+ The token to use to commit to the repo. Defaults to the token saved on the machine.
56
+ allow_patterns (`List[str]` or `str`, *optional*):
57
+ If provided, only files matching at least one pattern are uploaded.
58
+ ignore_patterns (`List[str]` or `str`, *optional*):
59
+ If provided, files matching any of the patterns are not uploaded.
60
+ squash_history (`bool`, *optional*):
61
+ Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
62
+ useful to avoid degraded performance on the repo when it grows too large.
63
+ hf_api (`HfApi`, *optional*):
64
+ The [`HfApi`] client to use to commit to the Hub. Can be set with custom settings (user agent, token,...).
65
+
66
+ Example:
67
+ ```py
68
+ >>> from pathlib import Path
69
+ >>> from huggingface_hub import CommitScheduler
70
+
71
+ # Scheduler uploads every 10 minutes
72
+ >>> csv_path = Path("watched_folder/data.csv")
73
+ >>> CommitScheduler(repo_id="test_scheduler", repo_type="dataset", folder_path=csv_path.parent, every=10)
74
+
75
+ >>> with csv_path.open("a") as f:
76
+ ... f.write("first line")
77
+
78
+ # Some time later (...)
79
+ >>> with csv_path.open("a") as f:
80
+ ... f.write("second line")
81
+ ```
82
+
83
+ Example using a context manager:
84
+ ```py
85
+ >>> from pathlib import Path
86
+ >>> from huggingface_hub import CommitScheduler
87
+
88
+ >>> with CommitScheduler(repo_id="test_scheduler", repo_type="dataset", folder_path="watched_folder", every=10) as scheduler:
89
+ ... csv_path = Path("watched_folder/data.csv")
90
+ ... with csv_path.open("a") as f:
91
+ ... f.write("first line")
92
+ ... (...)
93
+ ... with csv_path.open("a") as f:
94
+ ... f.write("second line")
95
+
96
+ # Scheduler is now stopped and the last commit has been triggered
97
+ ```
98
+ """
99
+
100
+ def __init__(
101
+ self,
102
+ *,
103
+ repo_id: str,
104
+ folder_path: Union[str, Path],
105
+ every: Union[int, float] = 5,
106
+ path_in_repo: Optional[str] = None,
107
+ repo_type: Optional[str] = None,
108
+ revision: Optional[str] = None,
109
+ private: Optional[bool] = None,
110
+ token: Optional[str] = None,
111
+ allow_patterns: Optional[Union[List[str], str]] = None,
112
+ ignore_patterns: Optional[Union[List[str], str]] = None,
113
+ squash_history: bool = False,
114
+ hf_api: Optional["HfApi"] = None,
115
+ ) -> None:
116
+ self.api = hf_api or HfApi(token=token)
117
+
118
+ # Folder
119
+ self.folder_path = Path(folder_path).expanduser().resolve()
120
+ self.path_in_repo = path_in_repo or ""
121
+ self.allow_patterns = allow_patterns
122
+
123
+ if ignore_patterns is None:
124
+ ignore_patterns = []
125
+ elif isinstance(ignore_patterns, str):
126
+ ignore_patterns = [ignore_patterns]
127
+ self.ignore_patterns = ignore_patterns + DEFAULT_IGNORE_PATTERNS
128
+
129
+ if self.folder_path.is_file():
130
+ raise ValueError(f"'folder_path' must be a directory, not a file: '{self.folder_path}'.")
131
+ self.folder_path.mkdir(parents=True, exist_ok=True)
132
+
133
+ # Repository
134
+ repo_url = self.api.create_repo(repo_id=repo_id, private=private, repo_type=repo_type, exist_ok=True)
135
+ self.repo_id = repo_url.repo_id
136
+ self.repo_type = repo_type
137
+ self.revision = revision
138
+ self.token = token
139
+
140
+ # Keep track of already uploaded files
141
+ self.last_uploaded: Dict[Path, float] = {} # key is local path, value is timestamp
142
+
143
+ # Scheduler
144
+ if not every > 0:
145
+ raise ValueError(f"'every' must be a positive integer, not '{every}'.")
146
+ self.lock = Lock()
147
+ self.every = every
148
+ self.squash_history = squash_history
149
+
150
+ logger.info(f"Scheduled job to push '{self.folder_path}' to '{self.repo_id}' every {self.every} minutes.")
151
+ self._scheduler_thread = Thread(target=self._run_scheduler, daemon=True)
152
+ self._scheduler_thread.start()
153
+ atexit.register(self._push_to_hub)
154
+
155
+ self.__stopped = False
156
+
157
+ def stop(self) -> None:
158
+ """Stop the scheduler.
159
+
160
+ A stopped scheduler cannot be restarted. Mostly for testing purposes.
161
+ """
162
+ self.__stopped = True
163
+
164
+ def __enter__(self) -> "CommitScheduler":
165
+ return self
166
+
167
+ def __exit__(self, exc_type, exc_value, traceback) -> None:
168
+ # Upload last changes before exiting
169
+ self.trigger().result()
170
+ self.stop()
171
+ return
172
+
173
+ def _run_scheduler(self) -> None:
174
+ """Dumb thread waiting between each scheduled push to Hub."""
175
+ while True:
176
+ self.last_future = self.trigger()
177
+ time.sleep(self.every * 60)
178
+ if self.__stopped:
179
+ break
180
+
181
+ def trigger(self) -> Future:
182
+ """Trigger a `push_to_hub` and return a future.
183
+
184
+ This method is automatically called every `every` minutes. You can also call it manually to trigger a commit
185
+ immediately, without waiting for the next scheduled commit.
186
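+
+         Example (illustrative): `scheduler.trigger().result()  # block until this commit is done`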
+ """
187
+ return self.api.run_as_future(self._push_to_hub)
188
+
189
+ def _push_to_hub(self) -> Optional[CommitInfo]:
190
+ if self.__stopped: # If stopped, already scheduled commits are ignored
191
+ return None
192
+
193
+ logger.info("(Background) scheduled commit triggered.")
194
+ try:
195
+ value = self.push_to_hub()
196
+ if self.squash_history:
197
+ logger.info("(Background) squashing repo history.")
198
+ self.api.super_squash_history(repo_id=self.repo_id, repo_type=self.repo_type, branch=self.revision)
199
+ return value
200
+ except Exception as e:
201
+ logger.error(f"Error while pushing to Hub: {e}") # Depending on the setup, error might be silenced
202
+ raise
203
+
204
+ def push_to_hub(self) -> Optional[CommitInfo]:
205
+ """
206
+ Push folder to the Hub and return the commit info.
207
+
208
+ > [!WARNING]
209
+ > This method is not meant to be called directly. It is run in the background by the scheduler, respecting a
210
+ > queue mechanism to avoid concurrent commits. Making a direct call to the method might lead to concurrency
211
+ > issues.
212
+
213
+ The default behavior of `push_to_hub` is to assume an append-only folder. It lists all files in the folder and
214
+ uploads only changed files. If no changes are found, the method returns without committing anything. If you want
215
+ to change this behavior, you can inherit from [`CommitScheduler`] and override this method. This can be useful
216
+ for example to compress data together in a single file before committing. For more details and examples, check
217
+ out our [integration guide](https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#scheduled-uploads).
218
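+
+         Example of a custom override (illustrative sketch):
+
+         ```py
+         class ZipScheduler(CommitScheduler):
+             def push_to_hub(self):
+                 # e.g. compress the watched folder into a single archive first
+                 ...
+                 return super().push_to_hub()
+         ```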
+ """
219
+ # Check files to upload (with lock)
220
+ with self.lock:
221
+ logger.debug("Listing files to upload for scheduled commit.")
222
+
223
+ # List files from folder (taken from `_prepare_upload_folder_additions`)
224
+ relpath_to_abspath = {
225
+ path.relative_to(self.folder_path).as_posix(): path
226
+ for path in sorted(self.folder_path.glob("**/*")) # sorted to be deterministic
227
+ if path.is_file()
228
+ }
229
+ prefix = f"{self.path_in_repo.strip('/')}/" if self.path_in_repo else ""
230
+
231
+ # Filter with pattern + filter out unchanged files + retrieve current file size
232
+ files_to_upload: List[_FileToUpload] = []
233
+ for relpath in filter_repo_objects(
234
+ relpath_to_abspath.keys(), allow_patterns=self.allow_patterns, ignore_patterns=self.ignore_patterns
235
+ ):
236
+ local_path = relpath_to_abspath[relpath]
237
+ stat = local_path.stat()
238
+ if self.last_uploaded.get(local_path) is None or self.last_uploaded[local_path] != stat.st_mtime:
239
+ files_to_upload.append(
240
+ _FileToUpload(
241
+ local_path=local_path,
242
+ path_in_repo=prefix + relpath,
243
+ size_limit=stat.st_size,
244
+ last_modified=stat.st_mtime,
245
+ )
246
+ )
247
+
248
+ # Return if nothing to upload
249
+ if len(files_to_upload) == 0:
250
+ logger.debug("Dropping schedule commit: no changed file to upload.")
251
+ return None
252
+
253
+ # Convert `_FileToUpload` to `CommitOperationAdd` (=> compute file shas + cap to current file size)
254
+ logger.debug("Removing unchanged files since previous scheduled commit.")
255
+ add_operations = [
256
+ CommitOperationAdd(
257
+ # Cap the file to its current size, even if the user appends data to it while a scheduled commit is happening
258
+ path_or_fileobj=PartialFileIO(file_to_upload.local_path, size_limit=file_to_upload.size_limit),
259
+ path_in_repo=file_to_upload.path_in_repo,
260
+ )
261
+ for file_to_upload in files_to_upload
262
+ ]
263
+
264
+ # Upload files (append mode expected - no need for lock)
265
+ logger.debug("Uploading files for scheduled commit.")
266
+ commit_info = self.api.create_commit(
267
+ repo_id=self.repo_id,
268
+ repo_type=self.repo_type,
269
+ operations=add_operations,
270
+ commit_message="Scheduled Commit",
271
+ revision=self.revision,
272
+ )
273
+
274
+ # Successful commit: keep track of the latest "last_modified" for each file
275
+ for file in files_to_upload:
276
+ self.last_uploaded[file.local_path] = file.last_modified
277
+ return commit_info
278
+
279
+
280
+ class PartialFileIO(BytesIO):
281
+ """A file-like object that reads only the first part of a file.
282
+
283
+ Useful to upload a file to the Hub when the user might still be appending data to it. Only the first part of the
284
+ file is uploaded (i.e. the part that was available when the filesystem was first scanned).
285
+
286
+ In practice, only used internally by the CommitScheduler to regularly push a folder to the Hub with minimal
287
+ disturbance for the user. The object is passed to `CommitOperationAdd`.
288
+
289
+ Only supports `read`, `tell` and `seek` methods.
290
+
291
+ Args:
292
+ file_path (`str` or `Path`):
293
+ Path to the file to read.
294
+ size_limit (`int`):
295
+ The maximum number of bytes to read from the file. If the file is larger than this, only the first part
296
+ will be read (and uploaded).
297
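+
+     Example (illustrative, assuming `data.csv` holds at least 10 bytes):
+
+     ```py
+     >>> f = PartialFileIO("data.csv", size_limit=10)
+     >>> len(f.read())  # capped at 10 bytes, even if the file grew since
+     10
+     ```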
+ """
298
+
299
+ def __init__(self, file_path: Union[str, Path], size_limit: int) -> None:
300
+ self._file_path = Path(file_path)
301
+ self._file = self._file_path.open("rb")
302
+ self._size_limit = min(size_limit, os.fstat(self._file.fileno()).st_size)
303
+
304
+ def __del__(self) -> None:
305
+ self._file.close()
306
+ return super().__del__()
307
+
308
+ def __repr__(self) -> str:
309
+ return f"<PartialFileIO file_path={self._file_path} size_limit={self._size_limit}>"
310
+
311
+ def __len__(self) -> int:
312
+ return self._size_limit
313
+
314
+ def __getattribute__(self, name: str):
315
+ if name.startswith("_") or name in ("read", "tell", "seek"): # only 3 public methods supported
316
+ return super().__getattribute__(name)
317
+ raise NotImplementedError(f"PartialFileIO does not support '{name}'.")
318
+
319
+ def tell(self) -> int:
320
+ """Return the current file position."""
321
+ return self._file.tell()
322
+
323
+ def seek(self, __offset: int, __whence: int = SEEK_SET) -> int:
324
+ """Change the stream position to the given offset.
325
+
326
+ Behavior is the same as a regular file, except that the position is capped to the size limit.
327
+ """
328
+ if __whence == SEEK_END:
329
+ # SEEK_END => set from the truncated end
330
+ __offset = len(self) + __offset
331
+ __whence = SEEK_SET
332
+
333
+ pos = self._file.seek(__offset, __whence)
334
+ if pos > self._size_limit:
335
+ return self._file.seek(self._size_limit)
336
+ return pos
337
+
338
+ def read(self, __size: Optional[int] = -1) -> bytes:
339
+ """Read at most `__size` bytes from the file.
340
+
341
+ Behavior is the same as a regular file, except that it is capped to the size limit.
342
+ """
343
+ current = self._file.tell()
344
+ if __size is None or __size < 0:
345
+ # Read until file limit
346
+ truncated_size = self._size_limit - current
347
+ else:
348
+ # Read until file limit or __size
349
+ truncated_size = min(__size, self._size_limit - current)
350
+ return self._file.read(truncated_size)
venv/lib/python3.13/site-packages/huggingface_hub/_inference_endpoints.py ADDED
@@ -0,0 +1,413 @@
1
+ import time
2
+ from dataclasses import dataclass, field
3
+ from datetime import datetime
4
+ from enum import Enum
5
+ from typing import TYPE_CHECKING, Dict, Optional, Union
6
+
7
+ from huggingface_hub.errors import InferenceEndpointError, InferenceEndpointTimeoutError
8
+
9
+ from .utils import get_session, logging, parse_datetime
10
+
11
+
12
+ if TYPE_CHECKING:
13
+ from .hf_api import HfApi
14
+ from .inference._client import InferenceClient
15
+ from .inference._generated._async_client import AsyncInferenceClient
16
+
17
+ logger = logging.get_logger(__name__)
18
+
19
+
20
+ class InferenceEndpointStatus(str, Enum):
21
+ PENDING = "pending"
22
+ INITIALIZING = "initializing"
23
+ UPDATING = "updating"
24
+ UPDATE_FAILED = "updateFailed"
25
+ RUNNING = "running"
26
+ PAUSED = "paused"
27
+ FAILED = "failed"
28
+ SCALED_TO_ZERO = "scaledToZero"
29
+
30
+
31
+ class InferenceEndpointType(str, Enum):
32
+ PUBlIC = "public"
33
+ PROTECTED = "protected"
34
+ PRIVATE = "private"
35
+
36
+
37
+ @dataclass
38
+ class InferenceEndpoint:
39
+ """
40
+ Contains information about a deployed Inference Endpoint.
41
+
42
+ Args:
43
+ name (`str`):
44
+ The unique name of the Inference Endpoint.
45
+ namespace (`str`):
46
+ The namespace where the Inference Endpoint is located.
47
+ repository (`str`):
48
+ The name of the model repository deployed on this Inference Endpoint.
49
+ status ([`InferenceEndpointStatus`]):
50
+ The current status of the Inference Endpoint.
51
+ url (`str`, *optional*):
52
+ The URL of the Inference Endpoint, if available. Only a deployed Inference Endpoint will have a URL.
53
+ framework (`str`):
54
+ The machine learning framework used for the model.
55
+ revision (`str`):
56
+ The specific model revision deployed on the Inference Endpoint.
57
+ task (`str`):
58
+ The task associated with the deployed model.
59
+ created_at (`datetime.datetime`):
60
+ The timestamp when the Inference Endpoint was created.
61
+ updated_at (`datetime.datetime`):
62
+ The timestamp of the last update of the Inference Endpoint.
63
+ type ([`InferenceEndpointType`]):
64
+ The type of the Inference Endpoint (public, protected, private).
65
+ raw (`Dict`):
66
+ The raw dictionary data returned from the API.
67
+ token (`str` or `bool`, *optional*):
68
+ Authentication token for the Inference Endpoint, if set when requesting the API. Will default to the
69
+ locally saved token if not provided. Pass `token=False` if you don't want to send your token to the server.
70
+
71
+ Example:
72
+ ```python
73
+ >>> from huggingface_hub import get_inference_endpoint
74
+ >>> endpoint = get_inference_endpoint("my-text-to-image")
75
+ >>> endpoint
76
+ InferenceEndpoint(name='my-text-to-image', ...)
77
+
78
+ # Get status
79
+ >>> endpoint.status
80
+ 'running'
81
+ >>> endpoint.url
82
+ 'https://my-text-to-image.region.vendor.endpoints.huggingface.cloud'
83
+
84
+ # Run inference
85
+ >>> endpoint.client.text_to_image(...)
86
+
87
+ # Pause endpoint to save $$$
88
+ >>> endpoint.pause()
89
+
90
+ # ...
91
+ # Resume and wait for deployment
92
+ >>> endpoint.resume()
93
+ >>> endpoint.wait()
94
+ >>> endpoint.client.text_to_image(...)
95
+ ```
96
+ """
97
+
98
+ # Field in __repr__
99
+ name: str = field(init=False)
100
+ namespace: str
101
+ repository: str = field(init=False)
102
+ status: InferenceEndpointStatus = field(init=False)
103
+ health_route: str = field(init=False)
104
+ url: Optional[str] = field(init=False)
105
+
106
+ # Other fields
107
+ framework: str = field(repr=False, init=False)
108
+ revision: str = field(repr=False, init=False)
109
+ task: str = field(repr=False, init=False)
110
+ created_at: datetime = field(repr=False, init=False)
111
+ updated_at: datetime = field(repr=False, init=False)
112
+ type: InferenceEndpointType = field(repr=False, init=False)
113
+
114
+ # Raw dict from the API
115
+ raw: Dict = field(repr=False)
116
+
117
+ # Internal fields
118
+ _token: Union[str, bool, None] = field(repr=False, compare=False)
119
+ _api: "HfApi" = field(repr=False, compare=False)
120
+
121
+ @classmethod
122
+ def from_raw(
123
+ cls, raw: Dict, namespace: str, token: Union[str, bool, None] = None, api: Optional["HfApi"] = None
124
+ ) -> "InferenceEndpoint":
125
+ """Initialize object from raw dictionary."""
126
+ if api is None:
127
+ from .hf_api import HfApi
128
+
129
+ api = HfApi()
130
+ if token is None:
131
+ token = api.token
132
+
133
+ # All other fields are populated in __post_init__
134
+ return cls(raw=raw, namespace=namespace, _token=token, _api=api)
135
+
136
+ def __post_init__(self) -> None:
137
+ """Populate fields from raw dictionary."""
138
+ self._populate_from_raw()
139
+
140
+ @property
141
+ def client(self) -> "InferenceClient":
142
+ """Returns a client to make predictions on this Inference Endpoint.
143
+
144
+ Returns:
145
+ [`InferenceClient`]: an inference client pointing to the deployed endpoint.
146
+
147
+ Raises:
148
+ [`InferenceEndpointError`]: If the Inference Endpoint is not yet deployed.
149
+ """
150
+ if self.url is None:
151
+ raise InferenceEndpointError(
152
+ "Cannot create a client for this Inference Endpoint as it is not yet deployed. "
153
+ "Please wait for the Inference Endpoint to be deployed using `endpoint.wait()` and try again."
154
+ )
155
+ from .inference._client import InferenceClient
156
+
157
+ return InferenceClient(
158
+ model=self.url,
159
+ token=self._token, # type: ignore[arg-type] # boolean token shouldn't be possible. In practice it's ok.
160
+ )
161
+
162
+ @property
163
+ def async_client(self) -> "AsyncInferenceClient":
164
+ """Returns a client to make predictions on this Inference Endpoint.
165
+
166
+ Returns:
167
+ [`AsyncInferenceClient`]: an asyncio-compatible inference client pointing to the deployed endpoint.
168
+
169
+ Raises:
170
+ [`InferenceEndpointError`]: If the Inference Endpoint is not yet deployed.
171
+ """
172
+ if self.url is None:
173
+ raise InferenceEndpointError(
174
+ "Cannot create a client for this Inference Endpoint as it is not yet deployed. "
175
+ "Please wait for the Inference Endpoint to be deployed using `endpoint.wait()` and try again."
176
+ )
177
+ from .inference._generated._async_client import AsyncInferenceClient
178
+
179
+ return AsyncInferenceClient(
180
+ model=self.url,
181
+ token=self._token, # type: ignore[arg-type] # boolean token shouldn't be possible. In practice it's ok.
182
+ )
183
+
184
+ def wait(self, timeout: Optional[int] = None, refresh_every: int = 5) -> "InferenceEndpoint":
185
+ """Wait for the Inference Endpoint to be deployed.
186
+
187
+ Information from the server is fetched every `refresh_every` seconds (5s by default). If the Inference Endpoint is not deployed after `timeout`
188
+ seconds, an [`InferenceEndpointTimeoutError`] is raised. The [`InferenceEndpoint`] is mutated in place with the latest
189
+ data.
190
+
191
+ Args:
192
+ timeout (`int`, *optional*):
193
+ The maximum time to wait for the Inference Endpoint to be deployed, in seconds. If `None`, will wait
194
+ indefinitely.
195
+ refresh_every (`int`, *optional*):
196
+ The time to wait between each fetch of the Inference Endpoint status, in seconds. Defaults to 5s.
197
+
198
+ Returns:
199
+ [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
200
+
201
+ Raises:
202
+ [`InferenceEndpointError`]
203
+ If the Inference Endpoint ended up in a failed state.
204
+ [`InferenceEndpointTimeoutError`]
205
+ If the Inference Endpoint is not deployed after `timeout` seconds.
206
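+
+         Example (illustrative):
+
+         ```py
+         >>> endpoint = get_inference_endpoint("my-endpoint").resume()
+         >>> endpoint.wait(timeout=600, refresh_every=10)  # poll every 10s, give up after 10 min
+         >>> endpoint.client.text_to_image("a cat")
+         ```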
+ """
207
+ if timeout is not None and timeout < 0:
208
+ raise ValueError("`timeout` cannot be negative.")
209
+ if refresh_every <= 0:
210
+ raise ValueError("`refresh_every` must be positive.")
211
+
212
+ start = time.time()
213
+ while True:
214
+ if self.status == InferenceEndpointStatus.FAILED:
215
+ raise InferenceEndpointError(
216
+ f"Inference Endpoint {self.name} failed to deploy. Please check the logs for more information."
217
+ )
218
+ if self.status == InferenceEndpointStatus.UPDATE_FAILED:
219
+ raise InferenceEndpointError(
220
+ f"Inference Endpoint {self.name} failed to update. Please check the logs for more information."
221
+ )
222
+ if self.status == InferenceEndpointStatus.RUNNING and self.url is not None:
223
+ # Verify the endpoint is actually reachable
224
+ _health_url = f"{self.url.rstrip('/')}/{self.health_route.lstrip('/')}"
225
+ response = get_session().get(_health_url, headers=self._api._build_hf_headers(token=self._token))
226
+ if response.status_code == 200:
227
+ logger.info("Inference Endpoint is ready to be used.")
228
+ return self
229
+
230
+ if timeout is not None:
231
+ if time.time() - start > timeout:
232
+ raise InferenceEndpointTimeoutError("Timeout while waiting for Inference Endpoint to be deployed.")
233
+ logger.info(f"Inference Endpoint is not deployed yet ({self.status}). Waiting {refresh_every}s...")
234
+ time.sleep(refresh_every)
235
+ self.fetch()
236
+
237
+ def fetch(self) -> "InferenceEndpoint":
238
+ """Fetch latest information about the Inference Endpoint.
239
+
240
+ Returns:
241
+ [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
242
+ """
243
+ obj = self._api.get_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token) # type: ignore [arg-type]
244
+ self.raw = obj.raw
245
+ self._populate_from_raw()
246
+ return self
247
+
248
+ def update(
249
+ self,
250
+ *,
251
+ # Compute update
252
+ accelerator: Optional[str] = None,
253
+ instance_size: Optional[str] = None,
254
+ instance_type: Optional[str] = None,
255
+ min_replica: Optional[int] = None,
256
+ max_replica: Optional[int] = None,
257
+ scale_to_zero_timeout: Optional[int] = None,
258
+ # Model update
259
+ repository: Optional[str] = None,
260
+ framework: Optional[str] = None,
261
+ revision: Optional[str] = None,
262
+ task: Optional[str] = None,
263
+ custom_image: Optional[Dict] = None,
264
+ secrets: Optional[Dict[str, str]] = None,
265
+ ) -> "InferenceEndpoint":
266
+ """Update the Inference Endpoint.
267
+
268
+ This method allows the update of either the compute configuration, the deployed model, or both. All arguments are
269
+ optional but at least one must be provided.
270
+
271
+ This is an alias for [`HfApi.update_inference_endpoint`]. The current object is mutated in place with the
272
+ latest data from the server.
273
+
274
+ Args:
275
+ accelerator (`str`, *optional*):
276
+ The hardware accelerator to be used for inference (e.g. `"cpu"`).
277
+ instance_size (`str`, *optional*):
278
+ The size or type of the instance to be used for hosting the model (e.g. `"x4"`).
279
+ instance_type (`str`, *optional*):
280
+ The cloud instance type where the Inference Endpoint will be deployed (e.g. `"intel-icl"`).
281
+ min_replica (`int`, *optional*):
282
+ The minimum number of replicas (instances) to keep running for the Inference Endpoint.
283
+ max_replica (`int`, *optional*):
284
+ The maximum number of replicas (instances) to scale to for the Inference Endpoint.
285
+ scale_to_zero_timeout (`int`, *optional*):
286
+ The duration in minutes before an inactive endpoint is scaled to zero.
287
+
288
+ repository (`str`, *optional*):
289
+ The name of the model repository associated with the Inference Endpoint (e.g. `"gpt2"`).
290
+ framework (`str`, *optional*):
291
+ The machine learning framework used for the model (e.g. `"custom"`).
292
+ revision (`str`, *optional*):
293
+ The specific model revision to deploy on the Inference Endpoint (e.g. `"6c0e6080953db56375760c0471a8c5f2929baf11"`).
294
+ task (`str`, *optional*):
295
+ The task on which to deploy the model (e.g. `"text-classification"`).
296
+ custom_image (`Dict`, *optional*):
297
+ A custom Docker image to use for the Inference Endpoint. This is useful if you want to deploy an
298
+ Inference Endpoint running on the `text-generation-inference` (TGI) framework (see examples).
299
+ secrets (`Dict[str, str]`, *optional*):
300
+ Secret values to inject in the container environment.
301
+ Returns:
302
+ [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
303
+ """
304
+ # Make API call
305
+ obj = self._api.update_inference_endpoint(
306
+ name=self.name,
307
+ namespace=self.namespace,
308
+ accelerator=accelerator,
309
+ instance_size=instance_size,
310
+ instance_type=instance_type,
311
+ min_replica=min_replica,
312
+ max_replica=max_replica,
313
+ scale_to_zero_timeout=scale_to_zero_timeout,
314
+ repository=repository,
315
+ framework=framework,
316
+ revision=revision,
317
+ task=task,
318
+ custom_image=custom_image,
319
+ secrets=secrets,
320
+ token=self._token, # type: ignore [arg-type]
321
+ )
322
+
323
+ # Mutate current object
324
+ self.raw = obj.raw
325
+ self._populate_from_raw()
326
+ return self
327
+
328
+ def pause(self) -> "InferenceEndpoint":
329
+ """Pause the Inference Endpoint.
330
+
331
+ A paused Inference Endpoint will not be charged. It can be resumed at any time using [`InferenceEndpoint.resume`].
332
+ This is different from scaling the Inference Endpoint to zero with [`InferenceEndpoint.scale_to_zero`], after which
333
+ the Endpoint is automatically restarted when a request is made to it.
334
+
335
+ This is an alias for [`HfApi.pause_inference_endpoint`]. The current object is mutated in place with the
336
+ latest data from the server.
337
+
338
+ Returns:
339
+ [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
340
+ """
341
+ obj = self._api.pause_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token) # type: ignore [arg-type]
342
+ self.raw = obj.raw
343
+ self._populate_from_raw()
344
+ return self
345
+
346
+ def resume(self, running_ok: bool = True) -> "InferenceEndpoint":
347
+ """Resume the Inference Endpoint.
348
+
349
+ This is an alias for [`HfApi.resume_inference_endpoint`]. The current object is mutated in place with the
350
+ latest data from the server.
351
+
352
+ Args:
353
+ running_ok (`bool`, *optional*):
354
+ If `True`, the method will not raise an error if the Inference Endpoint is already running. Defaults to
355
+ `True`.
356
+
357
+ Returns:
358
+ [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
359
+ """
360
+ obj = self._api.resume_inference_endpoint(
361
+ name=self.name, namespace=self.namespace, running_ok=running_ok, token=self._token
362
+ ) # type: ignore [arg-type]
363
+ self.raw = obj.raw
364
+ self._populate_from_raw()
365
+ return self
366
+
367
+ def scale_to_zero(self) -> "InferenceEndpoint":
368
+ """Scale Inference Endpoint to zero.
369
+
370
+ An Inference Endpoint scaled to zero will not be charged. It will be resumed on the next request to it, with a
371
+ cold-start delay. This is different from pausing the Inference Endpoint with [`InferenceEndpoint.pause`], which
372
+ would require a manual resume with [`InferenceEndpoint.resume`].
373
+
374
+ This is an alias for [`HfApi.scale_to_zero_inference_endpoint`]. The current object is mutated in place with the
375
+ latest data from the server.
376
+
377
+ Returns:
378
+ [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
379
+ """
380
+ obj = self._api.scale_to_zero_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token) # type: ignore [arg-type]
381
+ self.raw = obj.raw
382
+ self._populate_from_raw()
383
+ return self
384
+
385
+ def delete(self) -> None:
386
+ """Delete the Inference Endpoint.
387
+
388
+ This operation is not reversible. If you don't want to be charged for an Inference Endpoint, it is preferable
389
+ to pause it with [`InferenceEndpoint.pause`] or scale it to zero with [`InferenceEndpoint.scale_to_zero`].
390
+
391
+ This is an alias for [`HfApi.delete_inference_endpoint`].
392
+ """
393
+ self._api.delete_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token) # type: ignore [arg-type]
394
+
395
+ def _populate_from_raw(self) -> None:
396
+ """Populate fields from raw dictionary.
397
+
398
+ Called in __post_init__ + each time the Inference Endpoint is updated.
399
+ """
400
+ # Repr fields
401
+ self.name = self.raw["name"]
402
+ self.repository = self.raw["model"]["repository"]
403
+ self.status = self.raw["status"]["state"]
404
+ self.url = self.raw["status"].get("url")
405
+ self.health_route = self.raw["healthRoute"]
406
+
407
+ # Other fields
408
+ self.framework = self.raw["model"]["framework"]
409
+ self.revision = self.raw["model"]["revision"]
410
+ self.task = self.raw["model"]["task"]
411
+ self.created_at = parse_datetime(self.raw["status"]["createdAt"])
412
+ self.updated_at = parse_datetime(self.raw["status"]["updatedAt"])
413
+ self.type = self.raw["type"]
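Taken together, `pause`/`resume` and `scale_to_zero` differ mainly in how the endpoint comes back up: a paused endpoint stays down until explicitly resumed, while a scaled-to-zero endpoint restarts on the next request. A minimal usage sketch (the endpoint name is hypothetical):

```python
# Minimal sketch: pausing vs. scaling to zero (endpoint name is made up).
from huggingface_hub import get_inference_endpoint

endpoint = get_inference_endpoint("my-endpoint")  # hypothetical name

# Paused endpoints are not billed and stay down until explicitly resumed.
endpoint.pause()
endpoint.resume(running_ok=True)

# Scaled-to-zero endpoints are not billed either, but restart automatically
# (with a cold-start delay) on the next request they receive.
endpoint.scale_to_zero()
```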
venv/lib/python3.13/site-packages/huggingface_hub/_jobs_api.py ADDED
@@ -0,0 +1,301 @@
1
+ # coding=utf-8
2
+ # Copyright 2025-present, the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ from dataclasses import dataclass
16
+ from datetime import datetime
17
+ from enum import Enum
18
+ from typing import Any, Dict, List, Optional, Union
19
+
20
+ from huggingface_hub import constants
21
+ from huggingface_hub._space_api import SpaceHardware
22
+ from huggingface_hub.utils._datetime import parse_datetime
23
+
24
+
25
+ class JobStage(str, Enum):
26
+ """
27
+ Enumeration of the possible stages of a Job on the Hub.
28
+
29
+ Value can be compared to a string:
30
+ ```py
31
+ assert JobStage.COMPLETED == "COMPLETED"
32
+ ```
33
+ Possible values are: `COMPLETED`, `CANCELED`, `ERROR`, `DELETED`, `RUNNING`.
34
+ Taken from https://github.com/huggingface/moon-landing/blob/main/server/job_types/JobInfo.ts#L61 (private url).
35
+ """
36
+
37
+ # Copied from moon-landing > server > lib > Job.ts
38
+ COMPLETED = "COMPLETED"
39
+ CANCELED = "CANCELED"
40
+ ERROR = "ERROR"
41
+ DELETED = "DELETED"
42
+ RUNNING = "RUNNING"
43
+
44
+
45
+ @dataclass
46
+ class JobStatus:
47
+ stage: JobStage
48
+ message: Optional[str]
49
+
50
+
51
+ @dataclass
52
+ class JobOwner:
53
+ id: str
54
+ name: str
55
+ type: str
56
+
57
+
58
+ @dataclass
59
+ class JobInfo:
60
+ """
61
+ Contains information about a Job.
62
+
63
+ Args:
64
+ id (`str`):
65
+ Job ID.
66
+ created_at (`datetime` or `None`):
67
+ When the Job was created.
68
+ docker_image (`str` or `None`):
69
+ The Docker image from Docker Hub used for the Job.
70
+ Can be `None` if `space_id` is present instead.
71
+ space_id (`str` or `None`):
72
+ The Hugging Face Space used as the Docker image for the Job.
73
+ Can be `None` if `docker_image` is present instead.
74
+ command (`List[str]` or `None`):
75
+ Command of the Job, e.g. `["python", "-c", "print('hello world')"]`
76
+ arguments (`List[str]` or `None`):
77
+ Arguments passed to the command
78
+ environment (`Dict[str, Any]` or `None`):
79
+ Environment variables of the Job as a dictionary.
80
+ secrets (`Dict[str, Any]` or `None`):
81
+ Secret environment variables of the Job (encrypted).
82
+ flavor (`str` or `None`):
83
+ Flavor for the hardware, as in Hugging Face Spaces. See [`SpaceHardware`] for possible values.
84
+ E.g. `"cpu-basic"`.
85
+ status (`JobStatus` or `None`):
86
+ Status of the Job, e.g. `JobStatus(stage="RUNNING", message=None)`
87
+ See [`JobStage`] for possible stage values.
88
+ owner (`JobOwner` or `None`):
89
+ Owner of the Job, e.g. `JobOwner(id="5e9ecfc04957053f60648a3e", name="lhoestq", type="user")`
90
+
91
+ Example:
92
+
93
+ ```python
94
+ >>> from huggingface_hub import run_job
95
+ >>> job = run_job(
96
+ ... image="python:3.12",
97
+ ... command=["python", "-c", "print('Hello from the cloud!')"]
98
+ ... )
99
+ >>> job
100
+ JobInfo(id='687fb701029421ae5549d998', created_at=datetime.datetime(2025, 7, 22, 16, 6, 25, 79000, tzinfo=datetime.timezone.utc), docker_image='python:3.12', space_id=None, command=['python', '-c', "print('Hello from the cloud!')"], arguments=[], environment={}, secrets={}, flavor='cpu-basic', status=JobStatus(stage='RUNNING', message=None), owner=JobOwner(id='5e9ecfc04957053f60648a3e', name='lhoestq', type='user'), endpoint='https://huggingface.co', url='https://huggingface.co/jobs/lhoestq/687fb701029421ae5549d998')
101
+ >>> job.id
102
+ '687fb701029421ae5549d998'
103
+ >>> job.url
104
+ 'https://huggingface.co/jobs/lhoestq/687fb701029421ae5549d998'
105
+ >>> job.status.stage
106
+ 'RUNNING'
107
+ ```
108
+ """
109
+
110
+ id: str
111
+ created_at: Optional[datetime]
112
+ docker_image: Optional[str]
113
+ space_id: Optional[str]
114
+ command: Optional[List[str]]
115
+ arguments: Optional[List[str]]
116
+ environment: Optional[Dict[str, Any]]
117
+ secrets: Optional[Dict[str, Any]]
118
+ flavor: Optional[SpaceHardware]
119
+ status: JobStatus
120
+ owner: JobOwner
121
+
122
+ # Inferred fields
123
+ endpoint: str
124
+ url: str
125
+
126
+ def __init__(self, **kwargs) -> None:
127
+ self.id = kwargs["id"]
128
+ created_at = kwargs.get("createdAt") or kwargs.get("created_at")
129
+ self.created_at = parse_datetime(created_at) if created_at else None
130
+ self.docker_image = kwargs.get("dockerImage") or kwargs.get("docker_image")
131
+ self.space_id = kwargs.get("spaceId") or kwargs.get("space_id")
132
+ owner = kwargs.get("owner", {})
133
+ self.owner = JobOwner(id=owner["id"], name=owner["name"], type=owner["type"])
134
+ self.command = kwargs.get("command")
135
+ self.arguments = kwargs.get("arguments")
136
+ self.environment = kwargs.get("environment")
137
+ self.secrets = kwargs.get("secrets")
138
+ self.flavor = kwargs.get("flavor")
139
+ status = kwargs.get("status", {})
140
+ self.status = JobStatus(stage=status["stage"], message=status.get("message"))
141
+
142
+ # Inferred fields
143
+ self.endpoint = kwargs.get("endpoint", constants.ENDPOINT)
144
+ self.url = f"{self.endpoint}/jobs/{self.owner.name}/{self.id}"
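Note that `JobInfo.__init__` accepts both the camelCase keys returned by the server and snake_case keys, and derives `url` from the owner name and job id. A minimal sketch with a made-up payload (importing from the private module purely for illustration):

```python
# Minimal sketch: JobInfo normalizes a camelCase server payload (values made up).
from huggingface_hub._jobs_api import JobInfo

payload = {
    "id": "abc123",
    "createdAt": "2025-07-22T16:06:25.079Z",
    "dockerImage": "python:3.12",
    "owner": {"id": "0" * 24, "name": "someuser", "type": "user"},
    "status": {"stage": "RUNNING"},
}
job = JobInfo(**payload)
assert job.docker_image == "python:3.12"
print(job.url)  # https://huggingface.co/jobs/someuser/abc123
```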
145
+
146
+
147
+ @dataclass
148
+ class JobSpec:
149
+ docker_image: Optional[str]
150
+ space_id: Optional[str]
151
+ command: Optional[List[str]]
152
+ arguments: Optional[List[str]]
153
+ environment: Optional[Dict[str, Any]]
154
+ secrets: Optional[Dict[str, Any]]
155
+ flavor: Optional[SpaceHardware]
156
+ timeout: Optional[int]
157
+ tags: Optional[List[str]]
158
+ arch: Optional[str]
159
+
160
+ def __init__(self, **kwargs) -> None:
161
+ self.docker_image = kwargs.get("dockerImage") or kwargs.get("docker_image")
162
+ self.space_id = kwargs.get("spaceId") or kwargs.get("space_id")
163
+ self.command = kwargs.get("command")
164
+ self.arguments = kwargs.get("arguments")
165
+ self.environment = kwargs.get("environment")
166
+ self.secrets = kwargs.get("secrets")
167
+ self.flavor = kwargs.get("flavor")
168
+ self.timeout = kwargs.get("timeout")
169
+ self.tags = kwargs.get("tags")
170
+ self.arch = kwargs.get("arch")
171
+
172
+
173
+ @dataclass
174
+ class LastJobInfo:
175
+ id: str
176
+ at: datetime
177
+
178
+ def __init__(self, **kwargs) -> None:
179
+ self.id = kwargs["id"]
180
+ self.at = parse_datetime(kwargs["at"])
181
+
182
+
183
+ @dataclass
184
+ class ScheduledJobStatus:
185
+ last_job: Optional[LastJobInfo]
186
+ next_job_run_at: Optional[datetime]
187
+
188
+ def __init__(self, **kwargs) -> None:
189
+ last_job = kwargs.get("lastJob") or kwargs.get("last_job")
190
+ self.last_job = LastJobInfo(**last_job) if last_job else None
191
+ next_job_run_at = kwargs.get("nextJobRunAt") or kwargs.get("next_job_run_at")
192
+ self.next_job_run_at = parse_datetime(str(next_job_run_at)) if next_job_run_at else None
193
+
194
+
195
+ @dataclass
196
+ class ScheduledJobInfo:
197
+ """
198
+ Contains information about a scheduled Job.
199
+
200
+ Args:
201
+ id (`str`):
202
+ Scheduled Job ID.
203
+ created_at (`datetime` or `None`):
204
+ When the scheduled Job was created.
205
+ tags (`List[str]` or `None`):
206
+ The tags of the scheduled Job.
207
+ schedule (`str` or `None`):
208
+ One of "@annually", "@yearly", "@monthly", "@weekly", "@daily", "@hourly", or a
209
+ CRON schedule expression (e.g., '0 9 * * 1' for 9 AM every Monday).
210
+ suspend (`bool` or `None`):
211
+ Whether the scheduled job is suspended (paused).
212
+ concurrency (`bool` or `None`):
213
+ Whether multiple instances of this Job can run concurrently.
214
+ status (`ScheduledJobStatus` or `None`):
215
+ Status of the scheduled Job.
216
+ owner (`JobOwner` or `None`):
217
+ Owner of the scheduled Job, e.g. `JobOwner(id="5e9ecfc04957053f60648a3e", name="lhoestq", type="user")`
218
+ job_spec (`JobSpec` or `None`):
219
+ Specifications of the Job.
220
+
221
+ Example:
222
+
223
+ ```python
224
+ >>> from huggingface_hub import create_scheduled_job
225
+ >>> scheduled_job = create_scheduled_job(
226
+ ... image="python:3.12",
227
+ ... command=["python", "-c", "print('Hello from the cloud!')"],
228
+ ... schedule="@hourly",
229
+ ... )
230
+ >>> scheduled_job.id
231
+ '687fb701029421ae5549d999'
232
+ >>> scheduled_job.status.next_job_run_at
233
+ datetime.datetime(2025, 7, 22, 17, 6, 25, 79000, tzinfo=datetime.timezone.utc)
234
+ ```
235
+ """
236
+
237
+ id: str
238
+ created_at: Optional[datetime]
239
+ job_spec: JobSpec
240
+ schedule: Optional[str]
241
+ suspend: Optional[bool]
242
+ concurrency: Optional[bool]
243
+ status: ScheduledJobStatus
244
+ owner: JobOwner
245
+
246
+ def __init__(self, **kwargs) -> None:
247
+ self.id = kwargs["id"]
248
+ created_at = kwargs.get("createdAt") or kwargs.get("created_at")
249
+ self.created_at = parse_datetime(created_at) if created_at else None
250
+ self.job_spec = JobSpec(**(kwargs.get("job_spec") or kwargs.get("jobSpec", {})))
251
+ self.schedule = kwargs.get("schedule")
252
+ self.suspend = kwargs.get("suspend")
253
+ self.concurrency = kwargs.get("concurrency")
254
+ status = kwargs.get("status", {})
255
+ self.status = ScheduledJobStatus(
256
+ last_job=status.get("last_job") or status.get("lastJob"),
257
+ next_job_run_at=status.get("next_job_run_at") or status.get("nextJobRunAt"),
258
+ )
259
+ owner = kwargs.get("owner", {})
260
+ self.owner = JobOwner(id=owner["id"], name=owner["name"], type=owner["type"])
261
+
262
+
263
+ def _create_job_spec(
264
+ *,
265
+ image: str,
266
+ command: List[str],
267
+ env: Optional[Dict[str, Any]],
268
+ secrets: Optional[Dict[str, Any]],
269
+ flavor: Optional[SpaceHardware],
270
+ timeout: Optional[Union[int, float, str]],
271
+ ) -> Dict[str, Any]:
272
+ # prepare job spec to send to HF Jobs API
273
+ job_spec: Dict[str, Any] = {
274
+ "command": command,
275
+ "arguments": [],
276
+ "environment": env or {},
277
+ "flavor": flavor or SpaceHardware.CPU_BASIC,
278
+ }
279
+ # secrets are optional
280
+ if secrets:
281
+ job_spec["secrets"] = secrets
282
+ # timeout is optional
283
+ if timeout:
284
+ time_units_factors = {"s": 1, "m": 60, "h": 3600, "d": 3600 * 24}
285
+ if isinstance(timeout, str) and timeout[-1] in time_units_factors:
286
+ job_spec["timeoutSeconds"] = int(float(timeout[:-1]) * time_units_factors[timeout[-1]])
287
+ else:
288
+ job_spec["timeoutSeconds"] = int(timeout)
289
+ # input is either from docker hub or from HF spaces
290
+ for prefix in (
291
+ "https://huggingface.co/spaces/",
292
+ "https://hf.co/spaces/",
293
+ "huggingface.co/spaces/",
294
+ "hf.co/spaces/",
295
+ ):
296
+ if image.startswith(prefix):
297
+ job_spec["spaceId"] = image[len(prefix) :]
298
+ break
299
+ else:
300
+ job_spec["dockerImage"] = image
301
+ return job_spec
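`_create_job_spec` normalizes the timeout (bare numbers are taken as seconds; `"s"`, `"m"`, `"h"`, `"d"` suffixes are converted) and routes the image to either `dockerImage` or `spaceId`. A minimal sketch exercising both paths (the Space id is made up):

```python
# Minimal sketch of _create_job_spec's timeout and image handling (values made up).
from huggingface_hub._jobs_api import _create_job_spec

spec = _create_job_spec(
    image="python:3.12",
    command=["python", "-c", "print('hi')"],
    env=None, secrets=None, flavor=None,
    timeout="30m",
)
assert spec["timeoutSeconds"] == 1800        # "30m" -> 30 * 60 seconds
assert spec["dockerImage"] == "python:3.12"  # not a Spaces URL -> Docker Hub

spec = _create_job_spec(
    image="hf.co/spaces/someuser/some-space",  # hypothetical Space
    command=["python", "app.py"],
    env=None, secrets=None, flavor=None, timeout=None,
)
assert spec["spaceId"] == "someuser/some-space"
```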
venv/lib/python3.13/site-packages/huggingface_hub/_local_folder.py ADDED
@@ -0,0 +1,447 @@
1
+ # coding=utf-8
2
+ # Copyright 2024-present, the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Contains utilities to handle the `../.cache/huggingface` folder in local directories.
16
+
17
+ First discussed in https://github.com/huggingface/huggingface_hub/issues/1738 to store
18
+ download metadata when downloading files from the hub to a local directory (without
19
+ using the cache).
20
+
21
+ ./.cache/huggingface folder structure:
22
+ [4.0K] data
23
+ ├── [4.0K] .cache
24
+ │ └── [4.0K] huggingface
25
+ │ └── [4.0K] download
26
+ │ ├── [ 16] file.parquet.metadata
27
+ │ ├── [ 16] file.txt.metadata
28
+ │ └── [4.0K] folder
29
+ │ └── [ 16] file.parquet.metadata
30
+
31
+ ├── [6.5G] file.parquet
32
+ ├── [1.5K] file.txt
33
+ └── [4.0K] folder
34
+ └── [ 16] file.parquet
35
+
36
+
37
+ Download metadata file structure:
38
+ ```
39
+ # file.txt.metadata
40
+ 11c5a3d5811f50298f278a704980280950aedb10
41
+ a16a55fda99d2f2e7b69cce5cf93ff4ad3049930
42
+ 1712656091.123
43
+
44
+ # file.parquet.metadata
45
+ 11c5a3d5811f50298f278a704980280950aedb10
46
+ 7c5d3f4b8b76583b422fcb9189ad6c89d5d97a094541ce8932dce3ecabde1421
47
+ 1712656091.123
48
49
+ ```
50
+ """
51
+
52
+ import base64
53
+ import hashlib
54
+ import logging
55
+ import os
56
+ import time
57
+ from dataclasses import dataclass
58
+ from pathlib import Path
59
+ from typing import Optional
60
+
61
+ from .utils import WeakFileLock
62
+
63
+
64
+ logger = logging.getLogger(__name__)
65
+
66
+
67
+ @dataclass
68
+ class LocalDownloadFilePaths:
69
+ """
70
+ Paths to the files related to a download process in a local dir.
71
+
72
+ Returned by [`get_local_download_paths`].
73
+
74
+ Attributes:
75
+ file_path (`Path`):
76
+ Path where the file will be saved.
77
+ lock_path (`Path`):
78
+ Path to the lock file used to ensure atomicity when reading/writing metadata.
79
+ metadata_path (`Path`):
80
+ Path to the metadata file.
81
+ """
82
+
83
+ file_path: Path
84
+ lock_path: Path
85
+ metadata_path: Path
86
+
87
+ def incomplete_path(self, etag: str) -> Path:
88
+ """Return the path where a file will be temporarily downloaded before being moved to `file_path`."""
89
+ path = self.metadata_path.parent / f"{_short_hash(self.metadata_path.name)}.{etag}.incomplete"
90
+ resolved_path = str(path.resolve())
91
+ # Some Windows versions do not allow for paths longer than 255 characters.
92
+ # In this case, we must specify it as an extended path by using the "\\?\" prefix.
93
+ if os.name == "nt" and len(resolved_path) > 255 and not resolved_path.startswith("\\\\?\\"):
94
+ path = Path("\\\\?\\" + resolved_path)
95
+ return path
96
+
97
+
98
+ @dataclass(frozen=True)
99
+ class LocalUploadFilePaths:
100
+ """
101
+ Paths to the files related to an upload process in a local dir.
102
+
103
+ Returned by [`get_local_upload_paths`].
104
+
105
+ Attributes:
106
+ path_in_repo (`str`):
107
+ Path of the file in the repo.
108
+ file_path (`Path`):
109
+ Path where the file will be saved.
110
+ lock_path (`Path`):
111
+ Path to the lock file used to ensure atomicity when reading/writing metadata.
112
+ metadata_path (`Path`):
113
+ Path to the metadata file.
114
+ """
115
+
116
+ path_in_repo: str
117
+ file_path: Path
118
+ lock_path: Path
119
+ metadata_path: Path
120
+
121
+
122
+ @dataclass
123
+ class LocalDownloadFileMetadata:
124
+ """
125
+ Metadata about a file in the local directory related to a download process.
126
+
127
+ Attributes:
128
+ filename (`str`):
129
+ Path of the file in the repo.
130
+ commit_hash (`str`):
131
+ Commit hash of the file in the repo.
132
+ etag (`str`):
133
+ ETag of the file in the repo. Used to check if the file has changed.
134
+ For LFS files, this is the sha256 of the file. For regular files, it corresponds to the git hash.
135
+ timestamp (`int`):
136
+ Unix timestamp of when the metadata was saved i.e. when the metadata was accurate.
137
+ """
138
+
139
+ filename: str
140
+ commit_hash: str
141
+ etag: str
142
+ timestamp: float
143
+
144
+
145
+ @dataclass
146
+ class LocalUploadFileMetadata:
147
+ """
148
+ Metadata about a file in the local directory related to an upload process.
149
+ """
150
+
151
+ size: int
152
+
153
+ # Default values correspond to "we don't know yet"
154
+ timestamp: Optional[float] = None
155
+ should_ignore: Optional[bool] = None
156
+ sha256: Optional[str] = None
157
+ upload_mode: Optional[str] = None
158
+ remote_oid: Optional[str] = None
159
+ is_uploaded: bool = False
160
+ is_committed: bool = False
161
+
162
+ def save(self, paths: LocalUploadFilePaths) -> None:
163
+ """Save the metadata to disk."""
164
+ with WeakFileLock(paths.lock_path):
165
+ with paths.metadata_path.open("w") as f:
166
+ new_timestamp = time.time()
167
+ f.write(str(new_timestamp) + "\n")
168
+
169
+ f.write(str(self.size)) # never None
170
+ f.write("\n")
171
+
172
+ if self.should_ignore is not None:
173
+ f.write(str(int(self.should_ignore)))
174
+ f.write("\n")
175
+
176
+ if self.sha256 is not None:
177
+ f.write(self.sha256)
178
+ f.write("\n")
179
+
180
+ if self.upload_mode is not None:
181
+ f.write(self.upload_mode)
182
+ f.write("\n")
183
+
184
+ if self.remote_oid is not None:
185
+ f.write(self.remote_oid)
186
+ f.write("\n")
187
+
188
+ f.write(str(int(self.is_uploaded)) + "\n")
189
+ f.write(str(int(self.is_committed)) + "\n")
190
+
191
+ self.timestamp = new_timestamp
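The file written by `save` is positional: one value per line, with optional fields left blank when unknown. A minimal sketch with made-up values:

```python
# Minimal sketch: serializing upload metadata to disk (all values made up).
import tempfile
from pathlib import Path

from huggingface_hub._local_folder import LocalUploadFileMetadata, get_local_upload_paths

with tempfile.TemporaryDirectory() as tmp:
    paths = get_local_upload_paths(Path(tmp), "folder/file.bin")
    meta = LocalUploadFileMetadata(
        size=1024,
        should_ignore=False,
        sha256="e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
        upload_mode="lfs",
        is_uploaded=True,
        is_committed=False,
    )
    meta.save(paths)
    # <timestamp>, "1024", "0", <sha256>, "lfs", "1", "0" -- one per line
    print(paths.metadata_path.read_text())
```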
192
+
193
+
194
+ def get_local_download_paths(local_dir: Path, filename: str) -> LocalDownloadFilePaths:
195
+ """Compute paths to the files related to a download process.
196
+
197
+ Folders containing the paths are all guaranteed to exist.
198
+
199
+ Args:
200
+ local_dir (`Path`):
201
+ Path to the local directory in which files are downloaded.
202
+ filename (`str`):
203
+ Path of the file in the repo.
204
+
205
+ Return:
206
+ [`LocalDownloadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path, incomplete_path).
207
+ """
208
+ # filename is the path in the Hub repository (separated by '/')
209
+ # make sure to have a cross platform transcription
210
+ sanitized_filename = os.path.join(*filename.split("/"))
211
+ if os.name == "nt":
212
+ if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename:
213
+ raise ValueError(
214
+ f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
215
+ " owner to rename this file."
216
+ )
217
+ file_path = local_dir / sanitized_filename
218
+ metadata_path = _huggingface_dir(local_dir) / "download" / f"{sanitized_filename}.metadata"
219
+ lock_path = metadata_path.with_suffix(".lock")
220
+
221
+ # Some Windows versions do not allow for paths longer than 255 characters.
222
+ # In this case, we must specify it as an extended path by using the "\\?\" prefix
223
+ if os.name == "nt":
224
+ if not str(local_dir).startswith("\\\\?\\") and len(os.path.abspath(lock_path)) > 255:
225
+ file_path = Path("\\\\?\\" + os.path.abspath(file_path))
226
+ lock_path = Path("\\\\?\\" + os.path.abspath(lock_path))
227
+ metadata_path = Path("\\\\?\\" + os.path.abspath(metadata_path))
228
+
229
+ file_path.parent.mkdir(parents=True, exist_ok=True)
230
+ metadata_path.parent.mkdir(parents=True, exist_ok=True)
231
+ return LocalDownloadFilePaths(file_path=file_path, lock_path=lock_path, metadata_path=metadata_path)
232
+
233
+
234
+ def get_local_upload_paths(local_dir: Path, filename: str) -> LocalUploadFilePaths:
235
+ """Compute paths to the files related to an upload process.
236
+
237
+ Folders containing the paths are all guaranteed to exist.
238
+
239
+ Args:
240
+ local_dir (`Path`):
241
+ Path to the local directory that is uploaded.
242
+ filename (`str`):
243
+ Path of the file in the repo.
244
+
245
+ Return:
246
+ [`LocalUploadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path).
247
+ """
248
+ # filename is the path in the Hub repository (separated by '/')
249
+ # make sure to have a cross platform transcription
250
+ sanitized_filename = os.path.join(*filename.split("/"))
251
+ if os.name == "nt":
252
+ if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename:
253
+ raise ValueError(
254
+ f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
255
+ " owner to rename this file."
256
+ )
257
+ file_path = local_dir / sanitized_filename
258
+ metadata_path = _huggingface_dir(local_dir) / "upload" / f"{sanitized_filename}.metadata"
259
+ lock_path = metadata_path.with_suffix(".lock")
260
+
261
+ # Some Windows versions do not allow for paths longer than 255 characters.
262
+ # In this case, we must specify it as an extended path by using the "\\?\" prefix
263
+ if os.name == "nt":
264
+ if not str(local_dir).startswith("\\\\?\\") and len(os.path.abspath(lock_path)) > 255:
265
+ file_path = Path("\\\\?\\" + os.path.abspath(file_path))
266
+ lock_path = Path("\\\\?\\" + os.path.abspath(lock_path))
267
+ metadata_path = Path("\\\\?\\" + os.path.abspath(metadata_path))
268
+
269
+ file_path.parent.mkdir(parents=True, exist_ok=True)
270
+ metadata_path.parent.mkdir(parents=True, exist_ok=True)
271
+ return LocalUploadFilePaths(
272
+ path_in_repo=filename, file_path=file_path, lock_path=lock_path, metadata_path=metadata_path
273
+ )
274
+
275
+
276
+ def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDownloadFileMetadata]:
277
+ """Read metadata about a file in the local directory related to a download process.
278
+
279
+ Args:
280
+ local_dir (`Path`):
281
+ Path to the local directory in which files are downloaded.
282
+ filename (`str`):
283
+ Path of the file in the repo.
284
+
285
+ Return:
286
+ [`LocalDownloadFileMetadata`] or `None`: the metadata if it exists, `None` otherwise.
287
+ """
288
+ paths = get_local_download_paths(local_dir, filename)
289
+ with WeakFileLock(paths.lock_path):
290
+ if paths.metadata_path.exists():
291
+ try:
292
+ with paths.metadata_path.open() as f:
293
+ commit_hash = f.readline().strip()
294
+ etag = f.readline().strip()
295
+ timestamp = float(f.readline().strip())
296
+ metadata = LocalDownloadFileMetadata(
297
+ filename=filename,
298
+ commit_hash=commit_hash,
299
+ etag=etag,
300
+ timestamp=timestamp,
301
+ )
302
+ except Exception as e:
303
+ # remove the metadata file if it is corrupted / not the right format
304
+ logger.warning(
305
+ f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continue."
306
+ )
307
+ try:
308
+ paths.metadata_path.unlink()
309
+ except Exception as e:
310
+ logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}")
311
+
312
+ try:
313
+ # check if the file exists and hasn't been modified since the metadata was saved
314
+ stat = paths.file_path.stat()
315
+ if (
316
+ stat.st_mtime - 1 <= metadata.timestamp
317
+ ): # allow 1s difference as stat.st_mtime might not be precise
318
+ return metadata
319
+ logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
320
+ except FileNotFoundError:
321
+ # file does not exist => metadata is outdated
322
+ return None
323
+ return None
324
+
325
+
326
+ def read_upload_metadata(local_dir: Path, filename: str) -> LocalUploadFileMetadata:
327
+ """Read metadata about a file in the local directory related to an upload process.
328
+
329
+ TODO: factorize logic with `read_download_metadata`.
330
+
331
+ Args:
332
+ local_dir (`Path`):
333
+ Path to the local directory from which files are uploaded.
334
+ filename (`str`):
335
+ Path of the file in the repo.
336
+
337
+ Return:
338
+ [`LocalUploadFileMetadata`]: the metadata read from disk, or a fresh metadata object (with only the size set) if no valid metadata exists.
339
+ """
340
+ paths = get_local_upload_paths(local_dir, filename)
341
+ with WeakFileLock(paths.lock_path):
342
+ if paths.metadata_path.exists():
343
+ try:
344
+ with paths.metadata_path.open() as f:
345
+ timestamp = float(f.readline().strip())
346
+
347
+ size = int(f.readline().strip()) # never None
348
+
349
+ _should_ignore = f.readline().strip()
350
+ should_ignore = None if _should_ignore == "" else bool(int(_should_ignore))
351
+
352
+ _sha256 = f.readline().strip()
353
+ sha256 = None if _sha256 == "" else _sha256
354
+
355
+ _upload_mode = f.readline().strip()
356
+ upload_mode = None if _upload_mode == "" else _upload_mode
357
+ if upload_mode not in (None, "regular", "lfs"):
358
+ raise ValueError(f"Invalid upload mode in metadata {paths.path_in_repo}: {upload_mode}")
359
+
360
+ _remote_oid = f.readline().strip()
361
+ remote_oid = None if _remote_oid == "" else _remote_oid
362
+
363
+ is_uploaded = bool(int(f.readline().strip()))
364
+ is_committed = bool(int(f.readline().strip()))
365
+
366
+ metadata = LocalUploadFileMetadata(
367
+ timestamp=timestamp,
368
+ size=size,
369
+ should_ignore=should_ignore,
370
+ sha256=sha256,
371
+ upload_mode=upload_mode,
372
+ remote_oid=remote_oid,
373
+ is_uploaded=is_uploaded,
374
+ is_committed=is_committed,
375
+ )
376
+ except Exception as e:
377
+ # remove the metadata file if it is corrupted / not the right format
378
+ logger.warning(
379
+ f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continue."
380
+ )
381
+ try:
382
+ paths.metadata_path.unlink()
383
+ except Exception as e:
384
+ logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}")
385
+
386
+ # TODO: can we do better?
387
+ if (
388
+ metadata.timestamp is not None
389
+ and metadata.is_uploaded # file was uploaded
390
+ and not metadata.is_committed # but not committed
391
+ and time.time() - metadata.timestamp > 20 * 3600 # and it's been more than 20 hours
392
+ ): # => we consider it as garbage-collected by S3
393
+ metadata.is_uploaded = False
394
+
395
+ # check if the file exists and hasn't been modified since the metadata was saved
396
+ try:
397
+ if metadata.timestamp is not None and paths.file_path.stat().st_mtime <= metadata.timestamp:
398
+ return metadata
399
+ logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
400
+ except FileNotFoundError:
401
+ # file does not exist => metadata is outdated
402
+ pass
403
+
404
+ # empty metadata => we don't know anything except its size
405
+ return LocalUploadFileMetadata(size=paths.file_path.stat().st_size)
406
+
407
+
408
+ def write_download_metadata(local_dir: Path, filename: str, commit_hash: str, etag: str) -> None:
409
+ """Write metadata about a file in the local directory related to a download process.
410
+
411
+ Args:
412
+ local_dir (`Path`):
413
+ Path to the local directory in which files are downloaded.
414
+ """
415
+ paths = get_local_download_paths(local_dir, filename)
416
+ with WeakFileLock(paths.lock_path):
417
+ with paths.metadata_path.open("w") as f:
418
+ f.write(f"{commit_hash}\n{etag}\n{time.time()}\n")
419
+
420
+
421
+ def _huggingface_dir(local_dir: Path) -> Path:
422
+ """Return the path to the `.cache/huggingface` directory in a local directory."""
423
+ # Not cached: the .gitignore write below is guarded by an existence check, so calling this helper multiple times is safe.
424
+ path = local_dir / ".cache" / "huggingface"
425
+ path.mkdir(exist_ok=True, parents=True)
426
+
427
+ # Create a .gitignore file in the .cache/huggingface directory if it doesn't exist
428
+ # Should be thread-safe enough like this.
429
+ gitignore = path / ".gitignore"
430
+ gitignore_lock = path / ".gitignore.lock"
431
+ if not gitignore.exists():
432
+ try:
433
+ with WeakFileLock(gitignore_lock, timeout=0.1):
434
+ gitignore.write_text("*")
435
+ except IndexError:
436
+ pass
437
+ except OSError: # TimeoutError, FileNotFoundError, PermissionError, etc.
438
+ pass
439
+ try:
440
+ gitignore_lock.unlink()
441
+ except OSError:
442
+ pass
443
+ return path
444
+
445
+
446
+ def _short_hash(filename: str) -> str:
447
+ return base64.urlsafe_b64encode(hashlib.sha1(filename.encode()).digest()).decode()
venv/lib/python3.13/site-packages/huggingface_hub/_login.py ADDED
@@ -0,0 +1,514 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains methods to log in to the Hub."""
15
+
16
+ import os
17
+ import subprocess
18
+ from getpass import getpass
19
+ from pathlib import Path
20
+ from typing import Optional
21
+
22
+ from . import constants
23
+ from .commands._cli_utils import ANSI
24
+ from .utils import (
25
+ capture_output,
26
+ get_token,
27
+ is_google_colab,
28
+ is_notebook,
29
+ list_credential_helpers,
30
+ logging,
31
+ run_subprocess,
32
+ set_git_credential,
33
+ unset_git_credential,
34
+ )
35
+ from .utils._auth import (
36
+ _get_token_by_name,
37
+ _get_token_from_environment,
38
+ _get_token_from_file,
39
+ _get_token_from_google_colab,
40
+ _save_stored_tokens,
41
+ _save_token,
42
+ get_stored_tokens,
43
+ )
44
+ from .utils._deprecation import _deprecate_arguments, _deprecate_positional_args
45
+
46
+
47
+ logger = logging.get_logger(__name__)
48
+
49
+ _HF_LOGO_ASCII = """
50
+ _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|
51
+ _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
52
+ _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|
53
+ _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
54
+ _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|
55
+ """
56
+
57
+
58
+ @_deprecate_arguments(
59
+ version="1.0",
60
+ deprecated_args="write_permission",
61
+ custom_message="Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.",
62
+ )
63
+ @_deprecate_positional_args(version="1.0")
64
+ def login(
65
+ token: Optional[str] = None,
66
+ *,
67
+ add_to_git_credential: bool = False,
68
+ new_session: bool = True,
69
+ write_permission: bool = False,
70
+ ) -> None:
71
+ """Login the machine to access the Hub.
72
+
73
+ The `token` is persisted in cache and set as a git credential. Once done, the machine
74
+ is logged in and the access token will be available across all `huggingface_hub`
75
+ components. If `token` is not provided, it will be prompted to the user either with
76
+ a widget (in a notebook) or via the terminal.
77
+
78
+ To log in from outside of a script, one can also use `hf auth login` which is
79
+ a cli command that wraps [`login`].
80
+
81
+ > [!TIP]
82
+ > [`login`] is a drop-in replacement method for [`notebook_login`] as it wraps and
83
+ > extends its capabilities.
84
+
85
+ > [!TIP]
86
+ > When the token is not passed, [`login`] will automatically detect if the script runs
87
+ > in a notebook or not. However, this detection might not be accurate due to the
88
+ > variety of notebooks that exists nowadays. If that is the case, you can always force
89
+ > the UI by using [`notebook_login`] or [`interpreter_login`].
90
+
91
+ Args:
92
+ token (`str`, *optional*):
93
+ User access token to generate from https://huggingface.co/settings/token.
94
+ add_to_git_credential (`bool`, defaults to `False`):
95
+ If `True`, token will be set as git credential. If no git credential helper
96
+ is configured, a warning will be displayed to the user. If `token` is `None`,
97
+ the value of `add_to_git_credential` is ignored and will be prompted again
98
+ to the end user.
99
+ new_session (`bool`, defaults to `True`):
100
+ If `True`, will request a token even if one is already saved on the machine.
101
+ write_permission (`bool`):
102
+ Ignored and deprecated argument.
103
+ Raises:
104
+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
105
+ If an organization token is passed. Only personal account tokens are valid
106
+ to log in.
107
+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
108
+ If token is invalid.
109
+ [`ImportError`](https://docs.python.org/3/library/exceptions.html#ImportError)
110
+ If running in a notebook but `ipywidgets` is not installed.
111
+ """
112
+ if token is not None:
113
+ if not add_to_git_credential:
114
+ logger.info(
115
+ "The token has not been saved to the git credentials helper. Pass "
116
+ "`add_to_git_credential=True` in this function directly or "
117
+ "`--add-to-git-credential` if using via `hf`CLI if "
118
+ "you want to set the git credential as well."
119
+ )
120
+ _login(token, add_to_git_credential=add_to_git_credential)
121
+ elif is_notebook():
122
+ notebook_login(new_session=new_session)
123
+ else:
124
+ interpreter_login(new_session=new_session)
125
+
126
+
127
+ def logout(token_name: Optional[str] = None) -> None:
128
+ """Logout the machine from the Hub.
129
+
130
+ Token is deleted from the machine and removed from git credential.
131
+
132
+ Args:
133
+ token_name (`str`, *optional*):
134
+ Name of the access token to logout from. If `None`, will logout from all saved access tokens.
135
+ Raises:
136
+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError):
137
+ If the access token name is not found.
138
+ """
139
+ if get_token() is None and not get_stored_tokens(): # No active token and no saved access tokens
140
+ logger.warning("Not logged in!")
141
+ return
142
+ if not token_name:
143
+ # Delete all saved access tokens and token
144
+ for file_path in (constants.HF_TOKEN_PATH, constants.HF_STORED_TOKENS_PATH):
145
+ try:
146
+ Path(file_path).unlink()
147
+ except FileNotFoundError:
148
+ pass
149
+ logger.info("Successfully logged out from all access tokens.")
150
+ else:
151
+ _logout_from_token(token_name)
152
+ logger.info(f"Successfully logged out from access token: {token_name}.")
153
+
154
+ unset_git_credential()
155
+
156
+ # Check if still logged in
157
+ if _get_token_from_google_colab() is not None:
158
+ raise EnvironmentError(
159
+ "You are automatically logged in using a Google Colab secret.\n"
160
+ "To log out, you must unset the `HF_TOKEN` secret in your Colab settings."
161
+ )
162
+ if _get_token_from_environment() is not None:
163
+ raise EnvironmentError(
164
+ "Token has been deleted from your machine but you are still logged in.\n"
165
+ "To log out, you must clear out both `HF_TOKEN` and `HUGGING_FACE_HUB_TOKEN` environment variables."
166
+ )
167
+
168
+
169
+ def auth_switch(token_name: str, add_to_git_credential: bool = False) -> None:
170
+ """Switch to a different access token.
171
+
172
+ Args:
173
+ token_name (`str`):
174
+ Name of the access token to switch to.
175
+ add_to_git_credential (`bool`, defaults to `False`):
176
+ If `True`, token will be set as git credential. If no git credential helper
177
+ is configured, a warning will be displayed to the user. If `token` is `None`,
178
+ the value of `add_to_git_credential` is ignored and will be prompted again
179
+ to the end user.
180
+
181
+ Raises:
182
+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError):
183
+ If the access token name is not found.
184
+ """
185
+ token = _get_token_by_name(token_name)
186
+ if not token:
187
+ raise ValueError(f"Access token {token_name} not found in {constants.HF_STORED_TOKENS_PATH}")
188
+ # Write token to HF_TOKEN_PATH
189
+ _set_active_token(token_name, add_to_git_credential)
190
+ logger.info(f"The current active token is: {token_name}")
191
+ token_from_environment = _get_token_from_environment()
192
+ if token_from_environment is not None and token_from_environment != token:
193
+ logger.warning(
194
+ "The environment variable `HF_TOKEN` is set and will override the access token you've just switched to."
195
+ )
196
+
197
+
198
+ def auth_list() -> None:
199
+ """List all stored access tokens."""
200
+ tokens = get_stored_tokens()
201
+
202
+ if not tokens:
203
+ logger.info("No access tokens found.")
204
+ return
205
+ # Find current token
206
+ current_token = get_token()
207
+ current_token_name = None
208
+ for token_name in tokens:
209
+ if tokens.get(token_name) == current_token:
210
+ current_token_name = token_name
211
+ # Print header
212
+ max_offset = max(len("token"), max(len(token) for token in tokens)) + 2
213
+ print(f" {{:<{max_offset}}}| {{:<15}}".format("name", "token"))
214
+ print("-" * (max_offset + 2) + "|" + "-" * 15)
215
+
216
+ # Print saved access tokens
217
+ for token_name in tokens:
218
+ token = tokens.get(token_name, "<not set>")
219
+ masked_token = f"{token[:3]}****{token[-4:]}" if token != "<not set>" else token
220
+ is_current = "*" if token == current_token else " "
221
+
222
+ print(f"{is_current} {{:<{max_offset}}}| {{:<15}}".format(token_name, masked_token))
223
+
224
+ if _get_token_from_environment():
225
+ logger.warning(
226
+ "\nNote: Environment variable `HF_TOKEN` is set and is the current active token independently from the stored tokens listed above."
227
+ )
228
+ elif current_token_name is None:
229
+ logger.warning(
230
+ "\nNote: No active token is set and no environment variable `HF_TOKEN` is found. Use `hf auth login` to log in."
231
+ )
232
+
233
+
234
+ ###
235
+ # Interpreter-based login (text)
236
+ ###
237
+
238
+
239
+ @_deprecate_arguments(
240
+ version="1.0",
241
+ deprecated_args="write_permission",
242
+ custom_message="Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.",
243
+ )
244
+ @_deprecate_positional_args(version="1.0")
245
+ def interpreter_login(*, new_session: bool = True, write_permission: bool = False) -> None:
246
+ """
247
+ Displays a prompt to log in to the HF website and store the token.
248
+
249
+ This is equivalent to [`login`] without passing a token when not run in a notebook.
250
+ [`interpreter_login`] is useful if you want to force the use of the terminal prompt
251
+ instead of a notebook widget.
252
+
253
+ For more details, see [`login`].
254
+
255
+ Args:
256
+ new_session (`bool`, defaults to `True`):
257
+ If `True`, will request a token even if one is already saved on the machine.
258
+ write_permission (`bool`):
259
+ Ignored and deprecated argument.
260
+ """
261
+ if not new_session and get_token() is not None:
262
+ logger.info("User is already logged in.")
263
+ return
264
+
265
+ from .commands.delete_cache import _ask_for_confirmation_no_tui
266
+
267
+ print(_HF_LOGO_ASCII)
268
+ if get_token() is not None:
269
+ logger.info(
270
+ " A token is already saved on your machine. Run `hf auth whoami`"
271
+ " to get more information or `hf auth logout` if you want"
272
+ " to log out."
273
+ )
274
+ logger.info(" Setting a new token will erase the existing one.")
275
+
276
+ logger.info(
277
+ " To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens ."
278
+ )
279
+ if os.name == "nt":
280
+ logger.info("Token can be pasted using 'Right-Click'.")
281
+ token = getpass("Enter your token (input will not be visible): ")
282
+ add_to_git_credential = _ask_for_confirmation_no_tui("Add token as git credential?")
283
+
284
+ _login(token=token, add_to_git_credential=add_to_git_credential)
285
+
286
+
287
+ ###
288
+ # Notebook-based login (widget)
289
+ ###
290
+
291
+ NOTEBOOK_LOGIN_PASSWORD_HTML = """<center> <img
292
+ src=https://huggingface.co/front/assets/huggingface_logo-noborder.svg
293
+ alt='Hugging Face'> <br> Immediately click login after typing your password or
294
+ it might be stored in plain text in this notebook file. </center>"""
295
+
296
+
297
+ NOTEBOOK_LOGIN_TOKEN_HTML_START = """<center> <img
298
+ src=https://huggingface.co/front/assets/huggingface_logo-noborder.svg
299
+ alt='Hugging Face'> <br> Copy a token from <a
300
+ href="https://huggingface.co/settings/tokens" target="_blank">your Hugging Face
301
+ tokens page</a> and paste it below. <br> Immediately click login after copying
302
+ your token or it might be stored in plain text in this notebook file. </center>"""
303
+
304
+
305
+ NOTEBOOK_LOGIN_TOKEN_HTML_END = """
306
+ <b>Pro Tip:</b> If you don't already have one, you can create a dedicated
307
+ 'notebooks' token with 'write' access, that you can then easily reuse for all
308
+ notebooks. </center>"""
309
+
310
+
311
+ @_deprecate_arguments(
312
+ version="1.0",
313
+ deprecated_args="write_permission",
314
+ custom_message="Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.",
315
+ )
316
+ @_deprecate_positional_args(version="1.0")
317
+ def notebook_login(*, new_session: bool = True, write_permission: bool = False) -> None:
318
+ """
319
+ Displays a widget to log in to the HF website and store the token.
320
+
321
+ This is equivalent to [`login`] without passing a token when run in a notebook.
322
+ [`notebook_login`] is useful if you want to force the use of the notebook widget
323
+ instead of a prompt in the terminal.
324
+
325
+ For more details, see [`login`].
326
+
327
+ Args:
328
+ new_session (`bool`, defaults to `True`):
329
+ If `True`, will request a token even if one is already saved on the machine.
330
+ write_permission (`bool`):
331
+ Ignored and deprecated argument.
332
+ """
333
+ try:
334
+ import ipywidgets.widgets as widgets # type: ignore
335
+ from IPython.display import display # type: ignore
336
+ except ImportError:
337
+ raise ImportError(
338
+ "The `notebook_login` function can only be used in a notebook (Jupyter or"
339
+ " Colab) and you need the `ipywidgets` module: `pip install ipywidgets`."
340
+ )
341
+ if not new_session and get_token() is not None:
342
+ logger.info("User is already logged in.")
343
+ return
344
+
345
+ box_layout = widgets.Layout(display="flex", flex_flow="column", align_items="center", width="50%")
346
+
347
+ token_widget = widgets.Password(description="Token:")
348
+ git_checkbox_widget = widgets.Checkbox(value=True, description="Add token as git credential?")
349
+ token_finish_button = widgets.Button(description="Login")
350
+
351
+ login_token_widget = widgets.VBox(
352
+ [
353
+ widgets.HTML(NOTEBOOK_LOGIN_TOKEN_HTML_START),
354
+ token_widget,
355
+ git_checkbox_widget,
356
+ token_finish_button,
357
+ widgets.HTML(NOTEBOOK_LOGIN_TOKEN_HTML_END),
358
+ ],
359
+ layout=box_layout,
360
+ )
361
+ display(login_token_widget)
362
+
363
+ # On click events
364
+ def login_token_event(t):
365
+ """Event handler for the login button."""
366
+ token = token_widget.value
367
+ add_to_git_credential = git_checkbox_widget.value
368
+ # Erase token and clear value to make sure it's not saved in the notebook.
369
+ token_widget.value = ""
370
+ # Hide inputs
371
+ login_token_widget.children = [widgets.Label("Connecting...")]
372
+ try:
373
+ with capture_output() as captured:
374
+ _login(token, add_to_git_credential=add_to_git_credential)
375
+ message = captured.getvalue()
376
+ except Exception as error:
377
+ message = str(error)
378
+ # Print result (success message or error)
379
+ login_token_widget.children = [widgets.Label(line) for line in message.split("\n") if line.strip()]
380
+
381
+ token_finish_button.on_click(login_token_event)
382
+
383
+
384
+ ###
385
+ # Login private helpers
386
+ ###
387
+
388
+
389
+ def _login(
390
+ token: str,
391
+ add_to_git_credential: bool,
392
+ ) -> None:
393
+ from .hf_api import whoami # avoid circular import
394
+
395
+ if token.startswith("api_org"):
396
+ raise ValueError("You must use your personal account token, not an organization token.")
397
+
398
+ token_info = whoami(token)
399
+ permission = token_info["auth"]["accessToken"]["role"]
400
+ logger.info(f"Token is valid (permission: {permission}).")
401
+
402
+ token_name = token_info["auth"]["accessToken"]["displayName"]
403
+ # Store token locally
404
+ _save_token(token=token, token_name=token_name)
405
+ # Set active token
406
+ _set_active_token(token_name=token_name, add_to_git_credential=add_to_git_credential)
407
+ logger.info("Login successful.")
408
+ if _get_token_from_environment():
409
+ logger.warning(
410
+ "Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured."
411
+ )
412
+ else:
413
+ logger.info(f"The current active token is: `{token_name}`")
414
+
415
+
416
+ def _logout_from_token(token_name: str) -> None:
417
+ """Logout from a specific access token.
418
+
419
+ Args:
420
+ token_name (`str`):
421
+ The name of the access token to logout from.
422
+ Raises:
423
+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError):
424
+ If the access token name is not found.
425
+ """
426
+ stored_tokens = get_stored_tokens()
427
+ # If there is no access tokens saved or the access token name is not found, do nothing
428
+ if not stored_tokens or token_name not in stored_tokens:
429
+ return
430
+
431
+ token = stored_tokens.pop(token_name)
432
+ _save_stored_tokens(stored_tokens)
433
+
434
+ if token == _get_token_from_file():
435
+ logger.warning(f"Active token '{token_name}' has been deleted.")
436
+ Path(constants.HF_TOKEN_PATH).unlink(missing_ok=True)
437
+
438
+
439
+ def _set_active_token(
440
+ token_name: str,
441
+ add_to_git_credential: bool,
442
+ ) -> None:
443
+ """Set the active access token.
444
+
445
+ Args:
446
+ token_name (`str`):
447
+ The name of the token to set as active.
448
+ """
449
+ token = _get_token_by_name(token_name)
450
+ if not token:
451
+ raise ValueError(f"Token {token_name} not found in {constants.HF_STORED_TOKENS_PATH}")
452
+ if add_to_git_credential:
453
+ if _is_git_credential_helper_configured():
454
+ set_git_credential(token)
455
+ logger.info(
456
+ "Your token has been saved in your configured git credential helpers"
457
+ + f" ({','.join(list_credential_helpers())})."
458
+ )
459
+ else:
460
+ logger.warning("Token has not been saved to git credential helper.")
461
+ # Write token to HF_TOKEN_PATH
462
+ path = Path(constants.HF_TOKEN_PATH)
463
+ path.parent.mkdir(parents=True, exist_ok=True)
464
+ path.write_text(token)
465
+ logger.info(f"Your token has been saved to {constants.HF_TOKEN_PATH}")
466
+
467
+
468
+ def _is_git_credential_helper_configured() -> bool:
469
+ """Check if a git credential helper is configured.
470
+
471
+ Warns user if not the case (except for Google Colab where "store" is set by default
472
+ by `huggingface_hub`).
473
+ """
474
+ helpers = list_credential_helpers()
475
+ if len(helpers) > 0:
476
+ return True # Do not warn: at least 1 helper is set
477
+
478
+ # Only in Google Colab to avoid the warning message
479
+ # See https://github.com/huggingface/huggingface_hub/issues/1043#issuecomment-1247010710
480
+ if is_google_colab():
481
+ _set_store_as_git_credential_helper_globally()
482
+ return True # Do not warn: "store" is used by default in Google Colab
483
+
484
+ # Otherwise, warn user
485
+ print(
486
+ ANSI.red(
487
+ "Cannot authenticate through git-credential as no helper is defined on your"
488
+ " machine.\nYou might have to re-authenticate when pushing to the Hugging"
489
+ " Face Hub.\nRun the following command in your terminal in case you want to"
490
+ " set the 'store' credential helper as default.\n\ngit config --global"
491
+ " credential.helper store\n\nRead"
492
+ " https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more"
493
+ " details."
494
+ )
495
+ )
496
+ return False
497
+
498
+
499
+ def _set_store_as_git_credential_helper_globally() -> None:
500
+ """Set globally the credential.helper to `store`.
501
+
502
+ To be used only in Google Colab as we assume the user doesn't care about the git
503
+ credential config. It is the only particular case where we don't want to display the
504
+ warning message in [`notebook_login()`].
505
+
506
+ Related:
507
+ - https://github.com/huggingface/huggingface_hub/issues/1043
508
+ - https://github.com/huggingface/huggingface_hub/issues/1051
509
+ - https://git-scm.com/docs/git-credential-store
510
+ """
511
+ try:
512
+ run_subprocess("git config --global credential.helper store")
513
+ except subprocess.CalledProcessError as exc:
514
+ raise EnvironmentError(exc.stderr)
venv/lib/python3.13/site-packages/huggingface_hub/_oauth.py ADDED
@@ -0,0 +1,460 @@
1
+ import datetime
2
+ import hashlib
3
+ import logging
4
+ import os
5
+ import time
6
+ import urllib.parse
7
+ import warnings
8
+ from dataclasses import dataclass
9
+ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
10
+
11
+ from . import constants
12
+ from .hf_api import whoami
13
+ from .utils import experimental, get_token
14
+
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ if TYPE_CHECKING:
19
+ import fastapi
20
+
21
+
22
+ @dataclass
23
+ class OAuthOrgInfo:
24
+ """
25
+ Information about an organization linked to a user logged in with OAuth.
26
+
27
+ Attributes:
28
+ sub (`str`):
29
+ Unique identifier for the org. OpenID Connect field.
30
+ name (`str`):
31
+ The org's full name. OpenID Connect field.
32
+ preferred_username (`str`):
33
+ The org's username. OpenID Connect field.
34
+ picture (`str`):
35
+ The org's profile picture URL. OpenID Connect field.
36
+ is_enterprise (`bool`):
37
+ Whether the org is an enterprise org. Hugging Face field.
38
+ can_pay (`Optional[bool]`, *optional*):
39
+ Whether the org has a payment method set up. Hugging Face field.
40
+ role_in_org (`Optional[str]`, *optional*):
41
+ The user's role in the org. Hugging Face field.
42
+ security_restrictions (`Optional[List[Literal["ip", "token-policy", "mfa", "sso"]]]`, *optional*):
43
+ Array of security restrictions that the user hasn't completed for this org. Possible values: "ip", "token-policy", "mfa", "sso". Hugging Face field.
44
+ """
45
+
46
+ sub: str
47
+ name: str
48
+ preferred_username: str
49
+ picture: str
50
+ is_enterprise: bool
51
+ can_pay: Optional[bool] = None
52
+ role_in_org: Optional[str] = None
53
+ security_restrictions: Optional[List[Literal["ip", "token-policy", "mfa", "sso"]]] = None
54
+
55
+
56
+ @dataclass
57
+ class OAuthUserInfo:
58
+ """
59
+ Information about a user logged in with OAuth.
60
+
61
+ Attributes:
62
+ sub (`str`):
63
+ Unique identifier for the user, even in case of rename. OpenID Connect field.
64
+ name (`str`):
65
+ The user's full name. OpenID Connect field.
66
+ preferred_username (`str`):
67
+ The user's username. OpenID Connect field.
68
+ email_verified (`Optional[bool]`, *optional*):
69
+ Indicates if the user's email is verified. OpenID Connect field.
70
+ email (`Optional[str]`, *optional*):
71
+ The user's email address. OpenID Connect field.
72
+ picture (`str`):
73
+ The user's profile picture URL. OpenID Connect field.
74
+ profile (`str`):
75
+ The user's profile URL. OpenID Connect field.
76
+ website (`Optional[str]`, *optional*):
77
+ The user's website URL. OpenID Connect field.
78
+ is_pro (`bool`):
79
+ Whether the user is a pro user. Hugging Face field.
80
+ can_pay (`Optional[bool]`, *optional*):
81
+ Whether the user has a payment method set up. Hugging Face field.
82
+ orgs (`Optional[List[OAuthOrgInfo]]`, *optional*):
83
+ List of organizations the user is part of. Hugging Face field.
84
+ """
85
+
86
+ sub: str
87
+ name: str
88
+ preferred_username: str
89
+ email_verified: Optional[bool]
90
+ email: Optional[str]
91
+ picture: str
92
+ profile: str
93
+ website: Optional[str]
94
+ is_pro: bool
95
+ can_pay: Optional[bool]
96
+ orgs: Optional[List[OAuthOrgInfo]]
97
+
98
+
99
+ @dataclass
100
+ class OAuthInfo:
101
+ """
102
+ Information about the OAuth login.
103
+
104
+ Attributes:
105
+ access_token (`str`):
106
+ The access token.
107
+ access_token_expires_at (`datetime.datetime`):
108
+ The expiration date of the access token.
109
+ user_info ([`OAuthUserInfo`]):
110
+ The user information.
111
+ state (`str`, *optional*):
112
+ State passed to the OAuth provider in the original request to the OAuth provider.
113
+ scope (`str`):
114
+ Granted scope.
115
+ """
116
+
117
+ access_token: str
118
+ access_token_expires_at: datetime.datetime
119
+ user_info: OAuthUserInfo
120
+ state: Optional[str]
121
+ scope: str
122
+
123
+
124
+ @experimental
125
+ def attach_huggingface_oauth(app: "fastapi.FastAPI", route_prefix: str = "/"):
126
+ """
127
+ Add OAuth endpoints to a FastAPI app to enable OAuth login with Hugging Face.
128
+
129
+ How to use:
130
+ - Call this method on your FastAPI app to add the OAuth endpoints.
131
+ - Inside your route handlers, call `parse_huggingface_oauth(request)` to retrieve the OAuth info.
132
+ - If user is logged in, an [`OAuthInfo`] object is returned with the user's info. If not, `None` is returned.
133
+ - In your app, make sure to add links to `/oauth/huggingface/login` and `/oauth/huggingface/logout` for the user to log in and out.
134
+
135
+ Example:
136
+ ```py
137
+ from huggingface_hub import attach_huggingface_oauth, parse_huggingface_oauth
138
+
139
+ # Create a FastAPI app
140
+ app = FastAPI()
141
+
142
+ # Add OAuth endpoints to the FastAPI app
143
+ attach_huggingface_oauth(app)
144
+
145
+ # Add a route that greets the user if they are logged in
146
+ @app.get("/")
147
+ def greet_json(request: Request):
148
+ # Retrieve the OAuth info from the request
149
+ oauth_info = parse_huggingface_oauth(request) # e.g. OAuthInfo dataclass
150
+ if oauth_info is None:
151
+ return {"msg": "Not logged in!"}
152
+ return {"msg": f"Hello, {oauth_info.user_info.preferred_username}!"}
153
+ ```
154
+ """
155
+ # TODO: handle generic case (handling OAuth in a non-Space environment with custom dev values) (low priority)
156
+
157
+ # Add SessionMiddleware to the FastAPI app to store the OAuth info in the session.
158
+ # Session Middleware requires a secret key to sign the cookies. Let's use a hash
159
+ # of the OAuth secret key to make it unique to the Space + updated in case OAuth
160
+ # config gets updated. When ran locally, we use an empty string as a secret key.
161
+ try:
162
+ from starlette.middleware.sessions import SessionMiddleware
163
+ except ImportError as e:
164
+ raise ImportError(
165
+ "Cannot initialize OAuth to due a missing library. Please run `pip install huggingface_hub[oauth]` or add "
166
+ "`huggingface_hub[oauth]` to your requirements.txt file in order to install the required dependencies."
167
+ ) from e
168
+ session_secret = (constants.OAUTH_CLIENT_SECRET or "") + "-v1"
169
+ app.add_middleware(
170
+ SessionMiddleware, # type: ignore[arg-type]
171
+ secret_key=hashlib.sha256(session_secret.encode()).hexdigest(),
172
+ same_site="none",
173
+ https_only=True,
174
+ ) # type: ignore
175
+
176
+ # Add OAuth endpoints to the FastAPI app:
177
+ # - {route_prefix}/oauth/huggingface/login
178
+ # - {route_prefix}/oauth/huggingface/callback
179
+ # - {route_prefix}/oauth/huggingface/logout
180
+ # If the app is running in a Space, OAuth is enabled normally.
181
+ # Otherwise, we mock the endpoints to make the user log in with a fake user profile - without any calls to hf.co.
182
+ route_prefix = route_prefix.strip("/")
183
+ if os.getenv("SPACE_ID") is not None:
184
+ logger.info("OAuth is enabled in the Space. Adding OAuth routes.")
185
+ _add_oauth_routes(app, route_prefix=route_prefix)
186
+ else:
187
+ logger.info("App is not running in a Space. Adding mocked OAuth routes.")
188
+ _add_mocked_oauth_routes(app, route_prefix=route_prefix)
189
+
190
+
191
+ def parse_huggingface_oauth(request: "fastapi.Request") -> Optional[OAuthInfo]:
192
+ """
193
+ Returns the information from a logged in user as a [`OAuthInfo`] object.
194
+
195
+ For flexibility and future-proofing, this method is very lax in its parsing and does not raise errors.
196
+ Missing fields are set to `None` without a warning.
197
+
198
+ Return `None`, if the user is not logged in (no info in session cookie).
199
+
200
+ See [`attach_huggingface_oauth`] for an example on how to use this method.
201
+ """
202
+ if "oauth_info" not in request.session:
203
+ logger.debug("No OAuth info in session.")
204
+ return None
205
+
206
+ logger.debug("Parsing OAuth info from session.")
207
+ oauth_data = request.session["oauth_info"]
208
+ user_data = oauth_data.get("userinfo", {})
209
+ orgs_data = user_data.get("orgs", [])
210
+
211
+ orgs = (
212
+ [
213
+ OAuthOrgInfo(
214
+ sub=org.get("sub"),
215
+ name=org.get("name"),
216
+ preferred_username=org.get("preferred_username"),
217
+ picture=org.get("picture"),
218
+ is_enterprise=org.get("isEnterprise"),
219
+ can_pay=org.get("canPay"),
220
+ role_in_org=org.get("roleInOrg"),
221
+ security_restrictions=org.get("securityRestrictions"),
222
+ )
223
+ for org in orgs_data
224
+ ]
225
+ if orgs_data
226
+ else None
227
+ )
228
+
229
+ user_info = OAuthUserInfo(
230
+ sub=user_data.get("sub"),
231
+ name=user_data.get("name"),
232
+ preferred_username=user_data.get("preferred_username"),
233
+ email_verified=user_data.get("email_verified"),
234
+ email=user_data.get("email"),
235
+ picture=user_data.get("picture"),
236
+ profile=user_data.get("profile"),
237
+ website=user_data.get("website"),
238
+ is_pro=user_data.get("isPro"),
239
+ can_pay=user_data.get("canPay"),
240
+ orgs=orgs,
241
+ )
242
+
243
+ return OAuthInfo(
244
+ access_token=oauth_data.get("access_token"),
245
+ access_token_expires_at=datetime.datetime.fromtimestamp(oauth_data.get("expires_at")),
246
+ user_info=user_info,
247
+ state=oauth_data.get("state"),
248
+ scope=oauth_data.get("scope"),
249
+ )
250
+
251
+
252
+ def _add_oauth_routes(app: "fastapi.FastAPI", route_prefix: str) -> None:
253
+ """Add OAuth routes to the FastAPI app (login, callback handler and logout)."""
254
+ try:
255
+ import fastapi
256
+ from authlib.integrations.base_client.errors import MismatchingStateError
257
+ from authlib.integrations.starlette_client import OAuth
258
+ from fastapi.responses import RedirectResponse
259
+ except ImportError as e:
260
+ raise ImportError(
261
+ "Cannot initialize OAuth to due a missing library. Please run `pip install huggingface_hub[oauth]` or add "
262
+ "`huggingface_hub[oauth]` to your requirements.txt file."
263
+ ) from e
264
+
265
+ # Check environment variables
266
+ msg = (
267
+ "OAuth is required but '{}' environment variable is not set. Make sure you've enabled OAuth in your Space by"
268
+ " setting `hf_oauth: true` in the Space metadata."
269
+ )
270
+ if constants.OAUTH_CLIENT_ID is None:
271
+ raise ValueError(msg.format("OAUTH_CLIENT_ID"))
272
+ if constants.OAUTH_CLIENT_SECRET is None:
273
+ raise ValueError(msg.format("OAUTH_CLIENT_SECRET"))
274
+ if constants.OAUTH_SCOPES is None:
275
+ raise ValueError(msg.format("OAUTH_SCOPES"))
276
+ if constants.OPENID_PROVIDER_URL is None:
277
+ raise ValueError(msg.format("OPENID_PROVIDER_URL"))
278
+
279
+ # Register OAuth server
280
+ oauth = OAuth()
281
+ oauth.register(
282
+ name="huggingface",
283
+ client_id=constants.OAUTH_CLIENT_ID,
284
+ client_secret=constants.OAUTH_CLIENT_SECRET,
285
+ client_kwargs={"scope": constants.OAUTH_SCOPES},
286
+ server_metadata_url=constants.OPENID_PROVIDER_URL + "/.well-known/openid-configuration",
287
+ )
288
+
289
+ login_uri, callback_uri, logout_uri = _get_oauth_uris(route_prefix)
290
+
291
+ # Register OAuth endpoints
292
+ @app.get(login_uri)
293
+ async def oauth_login(request: fastapi.Request) -> RedirectResponse:
294
+ """Endpoint that redirects to HF OAuth page."""
295
+ redirect_uri = _generate_redirect_uri(request)
296
+ return await oauth.huggingface.authorize_redirect(request, redirect_uri) # type: ignore
297
+
298
+ @app.get(callback_uri)
299
+ async def oauth_redirect_callback(request: fastapi.Request) -> RedirectResponse:
300
+ """Endpoint that handles the OAuth callback."""
301
+ try:
302
+ oauth_info = await oauth.huggingface.authorize_access_token(request) # type: ignore
303
+ except MismatchingStateError:
304
+ # Parse query params
305
+ nb_redirects = int(request.query_params.get("_nb_redirects", 0))
306
+ target_url = request.query_params.get("_target_url")
307
+
308
+ # Build redirect URI with the same query params as before and bump nb_redirects count
309
+ query_params: Dict[str, Union[int, str]] = {"_nb_redirects": nb_redirects + 1}
310
+ if target_url:
311
+ query_params["_target_url"] = target_url
312
+
313
+ redirect_uri = f"{login_uri}?{urllib.parse.urlencode(query_params)}"
314
+
315
+ # If the user is redirected more than 3 times, it is very likely that the cookie is not working properly.
316
+ # (e.g. browser is blocking third-party cookies in iframe). In this case, redirect the user in the
317
+ # non-iframe view.
318
+ if nb_redirects > constants.OAUTH_MAX_REDIRECTS:
319
+ host = os.environ.get("SPACE_HOST")
320
+ if host is None: # cannot happen in a Space
321
+ raise RuntimeError(
322
+ "App is not running in a Space (SPACE_HOST environment variable is not set). Cannot redirect to non-iframe view."
323
+ ) from None
324
+ host_url = "https://" + host.rstrip("/")
325
+ return RedirectResponse(host_url + redirect_uri)
326
+
327
+ # Redirect the user to the login page again
328
+ return RedirectResponse(redirect_uri)
329
+
330
+ # OAuth login worked => store the user info in the session and redirect
331
+ logger.debug("Successfully logged in with OAuth. Storing user info in session.")
332
+ request.session["oauth_info"] = oauth_info
333
+ return RedirectResponse(_get_redirect_target(request))
334
+
335
+ @app.get(logout_uri)
336
+ async def oauth_logout(request: fastapi.Request) -> RedirectResponse:
337
+ """Endpoint that logs out the user (e.g. delete info from cookie session)."""
338
+ logger.debug("Logged out with OAuth. Removing user info from session.")
339
+ request.session.pop("oauth_info", None)
340
+ return RedirectResponse(_get_redirect_target(request))
341
+
342
+
343
+ def _add_mocked_oauth_routes(app: "fastapi.FastAPI", route_prefix: str = "/") -> None:
344
+ """Add fake oauth routes if app is run locally and OAuth is enabled.
345
+
346
+ Using OAuth will have the same behavior as in a Space but instead of authenticating with HF, a mocked user profile
347
+ is added to the session.
348
+ """
349
+ try:
350
+ import fastapi
351
+ from fastapi.responses import RedirectResponse
352
+ from starlette.datastructures import URL
353
+ except ImportError as e:
354
+ raise ImportError(
355
+ "Cannot initialize OAuth to due a missing library. Please run `pip install huggingface_hub[oauth]` or add "
356
+ "`huggingface_hub[oauth]` to your requirements.txt file."
357
+ ) from e
358
+
359
+ warnings.warn(
360
+ "OAuth is not supported outside of a Space environment. To help you debug your app locally, the oauth endpoints"
361
+ " are mocked to return your profile and token. To make it work, your machine must be logged in to Huggingface."
362
+ )
363
+ mocked_oauth_info = _get_mocked_oauth_info()
364
+
365
+ login_uri, callback_uri, logout_uri = _get_oauth_uris(route_prefix)
366
+
367
+ # Define OAuth routes
368
+ @app.get(login_uri)
369
+ async def oauth_login(request: fastapi.Request) -> RedirectResponse:
370
+ """Fake endpoint that redirects to HF OAuth page."""
371
+ # Define target (where to redirect after login)
372
+ redirect_uri = _generate_redirect_uri(request)
373
+ return RedirectResponse(callback_uri + "?" + urllib.parse.urlencode({"_target_url": redirect_uri}))
374
+
375
+ @app.get(callback_uri)
376
+ async def oauth_redirect_callback(request: fastapi.Request) -> RedirectResponse:
377
+ """Endpoint that handles the OAuth callback."""
378
+ request.session["oauth_info"] = mocked_oauth_info
379
+ return RedirectResponse(_get_redirect_target(request))
380
+
381
+ @app.get(logout_uri)
382
+ async def oauth_logout(request: fastapi.Request) -> RedirectResponse:
383
+ """Endpoint that logs out the user (e.g. delete cookie session)."""
384
+ request.session.pop("oauth_info", None)
385
+ logout_url = URL("/").include_query_params(**request.query_params)
386
+ return RedirectResponse(url=logout_url, status_code=302) # see https://github.com/gradio-app/gradio/pull/9659
387
+
388
+
389
+ def _generate_redirect_uri(request: "fastapi.Request") -> str:
390
+ if "_target_url" in request.query_params:
391
+ # if `_target_url` already in query params => respect it
392
+ target = request.query_params["_target_url"]
393
+ else:
394
+ # otherwise => keep query params
395
+ target = "/?" + urllib.parse.urlencode(request.query_params)
396
+
397
+ redirect_uri = request.url_for("oauth_redirect_callback").include_query_params(_target_url=target)
398
+ redirect_uri_as_str = str(redirect_uri)
399
+ if redirect_uri.netloc.endswith(".hf.space"):
400
+ # In Space, FastAPI redirect as http but we want https
401
+ redirect_uri_as_str = redirect_uri_as_str.replace("http://", "https://")
402
+ return redirect_uri_as_str
403
+
404
+
405
+ def _get_redirect_target(request: "fastapi.Request", default_target: str = "/") -> str:
406
+ return request.query_params.get("_target_url", default_target)
407
+
408
+
409
+ def _get_mocked_oauth_info() -> Dict:
410
+ token = get_token()
411
+ if token is None:
412
+ raise ValueError(
413
+ "Your machine must be logged in to HF to debug an OAuth app locally. Please"
414
+ " run `hf auth login` or set `HF_TOKEN` as environment variable "
415
+ "with one of your access token. You can generate a new token in your "
416
+ "settings page (https://huggingface.co/settings/tokens)."
417
+ )
418
+
419
+ user = whoami()
420
+ if user["type"] != "user":
421
+ raise ValueError(
422
+ "Your machine is not logged in with a personal account. Please use a "
423
+ "personal access token. You can generate a new token in your settings page"
424
+ " (https://huggingface.co/settings/tokens)."
425
+ )
426
+
427
+ return {
428
+ "access_token": token,
429
+ "token_type": "bearer",
430
+ "expires_in": 8 * 60 * 60, # 8 hours
431
+ "id_token": "FOOBAR",
432
+ "scope": "openid profile",
433
+ "refresh_token": "hf_oauth__refresh_token",
434
+ "expires_at": int(time.time()) + 8 * 60 * 60, # 8 hours
435
+ "userinfo": {
436
+ "sub": "0123456789",
437
+ "name": user["fullname"],
438
+ "preferred_username": user["name"],
439
+ "profile": f"https://huggingface.co/{user['name']}",
440
+ "picture": user["avatarUrl"],
441
+ "website": "",
442
+ "aud": "00000000-0000-0000-0000-000000000000",
443
+ "auth_time": 1691672844,
444
+ "nonce": "aaaaaaaaaaaaaaaaaaa",
445
+ "iat": 1691672844,
446
+ "exp": 1691676444,
447
+ "iss": "https://huggingface.co",
448
+ },
449
+ }
450
+
451
+
452
+ def _get_oauth_uris(route_prefix: str = "/") -> Tuple[str, str, str]:
453
+ route_prefix = route_prefix.strip("/")
454
+ if route_prefix:
455
+ route_prefix = f"/{route_prefix}"
456
+ return (
457
+ f"{route_prefix}/oauth/huggingface/login",
458
+ f"{route_prefix}/oauth/huggingface/callback",
459
+ f"{route_prefix}/oauth/huggingface/logout",
460
+ )
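
A quick illustration of the URI helper above; the expected tuples follow directly from the `strip("/")` handling in `_get_oauth_uris` (importing from the private `_oauth` module is for demonstration only):

```py
from huggingface_hub._oauth import _get_oauth_uris  # private helper, shown above

assert _get_oauth_uris("/") == (
    "/oauth/huggingface/login",
    "/oauth/huggingface/callback",
    "/oauth/huggingface/logout",
)
assert _get_oauth_uris("/admin/") == (
    "/admin/oauth/huggingface/login",
    "/admin/oauth/huggingface/callback",
    "/admin/oauth/huggingface/logout",
)
```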
venv/lib/python3.13/site-packages/huggingface_hub/_snapshot_download.py ADDED
@@ -0,0 +1,343 @@
+ import os
+ from pathlib import Path
+ from typing import Dict, Iterable, List, Literal, Optional, Type, Union
+ 
+ import requests
+ from tqdm.auto import tqdm as base_tqdm
+ from tqdm.contrib.concurrent import thread_map
+ 
+ from . import constants
+ from .errors import (
+     GatedRepoError,
+     HfHubHTTPError,
+     LocalEntryNotFoundError,
+     RepositoryNotFoundError,
+     RevisionNotFoundError,
+ )
+ from .file_download import REGEX_COMMIT_HASH, hf_hub_download, repo_folder_name
+ from .hf_api import DatasetInfo, HfApi, ModelInfo, RepoFile, SpaceInfo
+ from .utils import OfflineModeIsEnabled, filter_repo_objects, logging, validate_hf_hub_args
+ from .utils import tqdm as hf_tqdm
+ 
+ 
+ logger = logging.get_logger(__name__)
+ 
+ VERY_LARGE_REPO_THRESHOLD = 50000  # After this limit, we don't consider `repo_info.siblings` to be reliable enough
+ 
+ 
+ @validate_hf_hub_args
+ def snapshot_download(
+     repo_id: str,
+     *,
+     repo_type: Optional[str] = None,
+     revision: Optional[str] = None,
+     cache_dir: Union[str, Path, None] = None,
+     local_dir: Union[str, Path, None] = None,
+     library_name: Optional[str] = None,
+     library_version: Optional[str] = None,
+     user_agent: Optional[Union[Dict, str]] = None,
+     proxies: Optional[Dict] = None,
+     etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
+     force_download: bool = False,
+     token: Optional[Union[bool, str]] = None,
+     local_files_only: bool = False,
+     allow_patterns: Optional[Union[List[str], str]] = None,
+     ignore_patterns: Optional[Union[List[str], str]] = None,
+     max_workers: int = 8,
+     tqdm_class: Optional[Type[base_tqdm]] = None,
+     headers: Optional[Dict[str, str]] = None,
+     endpoint: Optional[str] = None,
+     # Deprecated args
+     local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
+     resume_download: Optional[bool] = None,
+ ) -> str:
+     """Download repo files.
+ 
+     Download a whole snapshot of a repo's files at the specified revision. This is useful when you want all files from
+     a repo, because you don't know which ones you will need a priori. All files are nested inside a folder in order
+     to keep their actual filename relative to that folder. You can also filter which files to download using
+     `allow_patterns` and `ignore_patterns`.
+ 
+     If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this
+     option, the `cache_dir` will not be used and a `.cache/huggingface/` folder will be created at the root of `local_dir`
+     to store some metadata related to the downloaded files. While this mechanism is not as robust as the main
+     cache-system, it's optimized for regularly pulling the latest version of a repository.
+ 
+     An alternative would be to clone the repo, but this requires git and git-lfs to be installed and properly
+     configured. It is also not possible to filter which files to download when cloning a repository using git.
+ 
+     Args:
+         repo_id (`str`):
+             A user or an organization name and a repo name separated by a `/`.
+         repo_type (`str`, *optional*):
+             Set to `"dataset"` or `"space"` if downloading from a dataset or space,
+             `None` or `"model"` if downloading from a model. Default is `None`.
+         revision (`str`, *optional*):
+             An optional Git revision id which can be a branch name, a tag, or a
+             commit hash.
+         cache_dir (`str`, `Path`, *optional*):
+             Path to the folder where cached files are stored.
+         local_dir (`str` or `Path`, *optional*):
+             If provided, the downloaded files will be placed under this directory.
+         library_name (`str`, *optional*):
+             The name of the library to which the object corresponds.
+         library_version (`str`, *optional*):
+             The version of the library.
+         user_agent (`str`, `dict`, *optional*):
+             The user-agent info in the form of a dictionary or a string.
+         proxies (`dict`, *optional*):
+             Dictionary mapping protocol to the URL of the proxy passed to
+             `requests.request`.
+         etag_timeout (`float`, *optional*, defaults to `10`):
+             When fetching the ETag, how many seconds to wait for the server to send
+             data before giving up, which is passed to `requests.request`.
+         force_download (`bool`, *optional*, defaults to `False`):
+             Whether the file should be downloaded even if it already exists in the local cache.
+         token (`str`, `bool`, *optional*):
+             A token to be used for the download.
+                 - If `True`, the token is read from the HuggingFace config
+                   folder.
+                 - If a string, it's used as the authentication token.
+         headers (`dict`, *optional*):
+             Additional headers to include in the request. Those headers take precedence over the others.
+         local_files_only (`bool`, *optional*, defaults to `False`):
+             If `True`, avoid downloading the file and return the path to the
+             local cached file if it exists.
+         allow_patterns (`List[str]` or `str`, *optional*):
+             If provided, only files matching at least one pattern are downloaded.
+         ignore_patterns (`List[str]` or `str`, *optional*):
+             If provided, files matching any of the patterns are not downloaded.
+         max_workers (`int`, *optional*):
+             Number of concurrent threads to download files (1 thread = 1 file download).
+             Defaults to 8.
+         tqdm_class (`tqdm`, *optional*):
+             If provided, overwrites the default behavior for the progress bar. Passed
+             argument must inherit from `tqdm.auto.tqdm` or at least mimic its behavior.
+             Note that the `tqdm_class` is not passed to each individual download.
+             Defaults to the custom HF progress bar that can be disabled by setting
+             the `HF_HUB_DISABLE_PROGRESS_BARS` environment variable.
+ 
+     Returns:
+         `str`: folder path of the repo snapshot.
+ 
+     Raises:
+         [`~utils.RepositoryNotFoundError`]
+             If the repository to download from cannot be found. This may be because it doesn't exist,
+             or because it is set to `private` and you do not have access.
+         [`~utils.RevisionNotFoundError`]
+             If the revision to download from cannot be found.
+         [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
+             If `token=True` and the token cannot be found.
+         [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError)
+             If the ETag cannot be determined.
+         [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
+             If some parameter value is invalid.
+     """
+     if cache_dir is None:
+         cache_dir = constants.HF_HUB_CACHE
+     if revision is None:
+         revision = constants.DEFAULT_REVISION
+     if isinstance(cache_dir, Path):
+         cache_dir = str(cache_dir)
+ 
+     if repo_type is None:
+         repo_type = "model"
+     if repo_type not in constants.REPO_TYPES:
+         raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(constants.REPO_TYPES)}")
+ 
+     storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
+ 
+     api = HfApi(
+         library_name=library_name,
+         library_version=library_version,
+         user_agent=user_agent,
+         endpoint=endpoint,
+         headers=headers,
+         token=token,
+     )
+ 
+     repo_info: Union[ModelInfo, DatasetInfo, SpaceInfo, None] = None
+     api_call_error: Optional[Exception] = None
+     if not local_files_only:
+         # try/except logic to handle different errors => taken from `hf_hub_download`
+         try:
+             # if we have an internet connection, we want to list the files to download
+             repo_info = api.repo_info(repo_id=repo_id, repo_type=repo_type, revision=revision)
+         except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
+             # Actually raise for those subclasses of ConnectionError
+             raise
+         except (
+             requests.exceptions.ConnectionError,
+             requests.exceptions.Timeout,
+             OfflineModeIsEnabled,
+         ) as error:
+             # Internet connection is down
+             # => will try to use local files only
+             api_call_error = error
+         except RevisionNotFoundError:
+             # The repo was found but the revision doesn't exist on the Hub (never existed or got deleted)
+             raise
+         except requests.HTTPError as error:
+             # Multiple reasons for an http error:
+             # - Repository is private and invalid/missing token sent
+             # - Repository is gated and invalid/missing token sent
+             # - Hub is down (error 500 or 504)
+             # => let's switch to 'local_files_only=True' to check if the files are already cached.
+             #    (if it's not the case, the error will be re-raised)
+             api_call_error = error
+ 
+     # At this stage, if `repo_info` is None it means either:
+     # - internet connection is down
+     # - internet connection is deactivated (local_files_only=True or HF_HUB_OFFLINE=True)
+     # - repo is private/gated and invalid/missing token sent
+     # - Hub is down
+     # => let's look if we can find the appropriate folder in the cache:
+     #    - if the specified revision is a commit hash, look inside "snapshots".
+     #    - if the specified revision is a branch or tag, look inside "refs".
+     # => if local_dir is not None, we will return the path to the local folder if it exists.
+     if repo_info is None:
+         # Try to get which commit hash corresponds to the specified revision
+         commit_hash = None
+         if REGEX_COMMIT_HASH.match(revision):
+             commit_hash = revision
+         else:
+             ref_path = os.path.join(storage_folder, "refs", revision)
+             if os.path.exists(ref_path):
+                 # retrieve commit_hash from refs file
+                 with open(ref_path) as f:
+                     commit_hash = f.read()
+ 
+         # Try to locate snapshot folder for this commit hash
+         if commit_hash is not None and local_dir is None:
+             snapshot_folder = os.path.join(storage_folder, "snapshots", commit_hash)
+             if os.path.exists(snapshot_folder):
+                 # Snapshot folder exists => let's return it
+                 # (but we can't check if all the files are actually there)
+                 return snapshot_folder
+ 
+         # If local_dir is not None, return it if it exists and is not empty
+         if local_dir is not None:
+             local_dir = Path(local_dir)
+             if local_dir.is_dir() and any(local_dir.iterdir()):
+                 logger.warning(
+                     f"Returning existing local_dir `{local_dir}` as remote repo cannot be accessed in `snapshot_download` ({api_call_error})."
+                 )
+                 return str(local_dir.resolve())
+         # If we couldn't find the appropriate folder on disk, raise an error.
+         if local_files_only:
+             raise LocalEntryNotFoundError(
+                 "Cannot find an appropriate cached snapshot folder for the specified revision on the local disk and "
+                 "outgoing traffic has been disabled. To enable repo look-ups and downloads online, pass "
+                 "'local_files_only=False' as input."
+             )
+         elif isinstance(api_call_error, OfflineModeIsEnabled):
+             raise LocalEntryNotFoundError(
+                 "Cannot find an appropriate cached snapshot folder for the specified revision on the local disk and "
+                 "outgoing traffic has been disabled. To enable repo look-ups and downloads online, set "
+                 "'HF_HUB_OFFLINE=0' as environment variable."
+             ) from api_call_error
+         elif isinstance(api_call_error, (RepositoryNotFoundError, GatedRepoError)) or (
+             isinstance(api_call_error, HfHubHTTPError) and api_call_error.response.status_code == 401
+         ):
+             # Repo not found, gated, or specific authentication error => let's raise the actual error
+             raise api_call_error
+         else:
+             # Otherwise: most likely a connection issue or Hub downtime => let's warn the user
+             raise LocalEntryNotFoundError(
+                 "An error happened while trying to locate the files on the Hub and we cannot find the appropriate"
+                 " snapshot folder for the specified revision on the local disk. Please check your internet connection"
+                 " and try again."
+             ) from api_call_error
+ 
+     # At this stage, internet connection is up and running
+     # => let's download the files!
+     assert repo_info.sha is not None, "Repo info returned from server must have a revision sha."
+ 
+     # Corner case: on very large repos, the siblings list in `repo_info` might not contain all files.
+     # In that case, we need to use the `list_repo_tree` method to prevent caching issues.
+     repo_files: Iterable[str] = [f.rfilename for f in repo_info.siblings] if repo_info.siblings is not None else []
+     unreliable_nb_files = (
+         repo_info.siblings is None
+         or len(repo_info.siblings) == 0
+         or len(repo_info.siblings) > VERY_LARGE_REPO_THRESHOLD
+     )
+     if unreliable_nb_files:
+         logger.info(
+             "Number of files in the repo is unreliable. Using `list_repo_tree` to ensure all files are listed."
+         )
+         repo_files = (
+             f.rfilename
+             for f in api.list_repo_tree(repo_id=repo_id, recursive=True, revision=revision, repo_type=repo_type)
+             if isinstance(f, RepoFile)
+         )
+ 
+     filtered_repo_files: Iterable[str] = filter_repo_objects(
+         items=repo_files,
+         allow_patterns=allow_patterns,
+         ignore_patterns=ignore_patterns,
+     )
+ 
+     if not unreliable_nb_files:
+         filtered_repo_files = list(filtered_repo_files)
+         tqdm_desc = f"Fetching {len(filtered_repo_files)} files"
+     else:
+         tqdm_desc = "Fetching ... files"
+ 
+     commit_hash = repo_info.sha
+     snapshot_folder = os.path.join(storage_folder, "snapshots", commit_hash)
+     # if the passed revision is not identical to commit_hash,
+     # then revision has to be a branch name or tag name.
+     # In that case, store a ref.
+     if revision != commit_hash:
+         ref_path = os.path.join(storage_folder, "refs", revision)
+         try:
+             os.makedirs(os.path.dirname(ref_path), exist_ok=True)
+             with open(ref_path, "w") as f:
+                 f.write(commit_hash)
+         except OSError as e:
+             logger.warning(f"Ignored error while writing commit hash to {ref_path}: {e}.")
+ 
+     # we pass the commit_hash to hf_hub_download
+     # so no network call happens if we already
+     # have the file locally.
+     def _inner_hf_hub_download(repo_file: str):
+         return hf_hub_download(
+             repo_id,
+             filename=repo_file,
+             repo_type=repo_type,
+             revision=commit_hash,
+             endpoint=endpoint,
+             cache_dir=cache_dir,
+             local_dir=local_dir,
+             local_dir_use_symlinks=local_dir_use_symlinks,
+             library_name=library_name,
+             library_version=library_version,
+             user_agent=user_agent,
+             proxies=proxies,
+             etag_timeout=etag_timeout,
+             resume_download=resume_download,
+             force_download=force_download,
+             token=token,
+             headers=headers,
+         )
+ 
+     if constants.HF_HUB_ENABLE_HF_TRANSFER:
+         # when using hf_transfer we don't want extra parallelism
+         # on top of the one hf_transfer provides
+         for file in filtered_repo_files:
+             _inner_hf_hub_download(file)
+     else:
+         thread_map(
+             _inner_hf_hub_download,
+             filtered_repo_files,
+             desc=tqdm_desc,
+             max_workers=max_workers,
+             # User can use their own tqdm class or the default one from `huggingface_hub.utils`
+             tqdm_class=tqdm_class or hf_tqdm,
+         )
+ 
+     if local_dir is not None:
+         return str(os.path.realpath(local_dir))
+     return snapshot_folder
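
A minimal usage sketch of `snapshot_download` as defined above; the repo id and patterns are illustrative:

```py
from huggingface_hub import snapshot_download

# Fetch only the config and safetensors weights of a repo into the local cache.
# Any accessible model repo works here.
path = snapshot_download(
    repo_id="gpt2",
    allow_patterns=["config.json", "*.safetensors"],
)
print(path)  # .../hub/models--gpt2/snapshots/<commit-hash>
```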
venv/lib/python3.13/site-packages/huggingface_hub/_space_api.py ADDED
@@ -0,0 +1,168 @@
+ # coding=utf-8
+ # Copyright 2019-present, the HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from dataclasses import dataclass
+ from datetime import datetime
+ from enum import Enum
+ from typing import Dict, Optional
+ 
+ from huggingface_hub.utils import parse_datetime
+ 
+ 
+ class SpaceStage(str, Enum):
+     """
+     Enumeration of the possible stages of a Space on the Hub.
+ 
+     Value can be compared to a string:
+     ```py
+     assert SpaceStage.BUILDING == "BUILDING"
+     ```
+ 
+     Taken from https://github.com/huggingface/moon-landing/blob/main/server/repo_types/SpaceInfo.ts#L61 (private url).
+     """
+ 
+     # Copied from moon-landing > server > repo_types > SpaceInfo.ts (private repo)
+     NO_APP_FILE = "NO_APP_FILE"
+     CONFIG_ERROR = "CONFIG_ERROR"
+     BUILDING = "BUILDING"
+     BUILD_ERROR = "BUILD_ERROR"
+     RUNNING = "RUNNING"
+     RUNNING_BUILDING = "RUNNING_BUILDING"
+     RUNTIME_ERROR = "RUNTIME_ERROR"
+     DELETING = "DELETING"
+     STOPPED = "STOPPED"
+     PAUSED = "PAUSED"
+ 
+ 
+ class SpaceHardware(str, Enum):
+     """
+     Enumeration of the hardware available to run your Space on the Hub.
+ 
+     Value can be compared to a string:
+     ```py
+     assert SpaceHardware.CPU_BASIC == "cpu-basic"
+     ```
+ 
+     Taken from https://github.com/huggingface-internal/moon-landing/blob/main/server/repo_types/SpaceHardwareFlavor.ts (private url).
+     """
+ 
+     # CPU
+     CPU_BASIC = "cpu-basic"
+     CPU_UPGRADE = "cpu-upgrade"
+     CPU_XL = "cpu-xl"
+ 
+     # ZeroGPU
+     ZERO_A10G = "zero-a10g"
+ 
+     # GPU
+     T4_SMALL = "t4-small"
+     T4_MEDIUM = "t4-medium"
+     L4X1 = "l4x1"
+     L4X4 = "l4x4"
+     L40SX1 = "l40sx1"
+     L40SX4 = "l40sx4"
+     L40SX8 = "l40sx8"
+     A10G_SMALL = "a10g-small"
+     A10G_LARGE = "a10g-large"
+     A10G_LARGEX2 = "a10g-largex2"
+     A10G_LARGEX4 = "a10g-largex4"
+     A100_LARGE = "a100-large"
+     H100 = "h100"
+     H100X8 = "h100x8"
+ 
+ 
+ class SpaceStorage(str, Enum):
+     """
+     Enumeration of the persistent storage tiers available for your Space on the Hub.
+ 
+     Value can be compared to a string:
+     ```py
+     assert SpaceStorage.SMALL == "small"
+     ```
+ 
+     Taken from https://github.com/huggingface/moon-landing/blob/main/server/repo_types/SpaceHardwareFlavor.ts#L24 (private url).
+     """
+ 
+     SMALL = "small"
+     MEDIUM = "medium"
+     LARGE = "large"
+ 
+ 
+ @dataclass
+ class SpaceRuntime:
+     """
+     Contains information about the current runtime of a Space.
+ 
+     Args:
+         stage (`str`):
+             Current stage of the space. Example: RUNNING.
+         hardware (`str` or `None`):
+             Current hardware of the space. Example: "cpu-basic". Can be `None` if the Space
+             is `BUILDING` for the first time.
+         requested_hardware (`str` or `None`):
+             Requested hardware. Can be different from `hardware`, especially if the request
+             has just been made. Example: "t4-medium". Can be `None` if no hardware has
+             been requested yet.
+         sleep_time (`int` or `None`):
+             Number of seconds the Space will be kept alive after the last request. By default (if value is `None`), the
+             Space will never go to sleep if it's running on upgraded hardware, while it will go to sleep after 48
+             hours on the free 'cpu-basic' hardware. For more details, see https://huggingface.co/docs/hub/spaces-gpus#sleep-time.
+         storage (`str` or `None`):
+             Current persistent storage tier of the space, if any. Example: "small".
+         raw (`dict`):
+             Raw response from the server. Contains more information about the Space
+             runtime, such as the number of replicas, number of CPUs, memory size, etc.
+     """
+ 
+     stage: SpaceStage
+     hardware: Optional[SpaceHardware]
+     requested_hardware: Optional[SpaceHardware]
+     sleep_time: Optional[int]
+     storage: Optional[SpaceStorage]
+     raw: Dict
+ 
+     def __init__(self, data: Dict) -> None:
+         self.stage = data["stage"]
+         self.hardware = data.get("hardware", {}).get("current")
+         self.requested_hardware = data.get("hardware", {}).get("requested")
+         self.sleep_time = data.get("gcTimeout")
+         self.storage = data.get("storage")
+         self.raw = data
+ 
+ 
+ @dataclass
+ class SpaceVariable:
+     """
+     Contains information about the current variables of a Space.
+ 
+     Args:
+         key (`str`):
+             Variable key. Example: `"MODEL_REPO_ID"`
+         value (`str`):
+             Variable value. Example: `"the_model_repo_id"`.
+         description (`str` or `None`):
+             Description of the variable. Example: `"Model Repo ID of the implemented model"`.
+         updated_at (`datetime` or `None`):
+             datetime of the last update of the variable (if the variable has been updated at least once).
+     """
+ 
+     key: str
+     value: str
+     description: Optional[str]
+     updated_at: Optional[datetime]
+ 
+     def __init__(self, key: str, values: Dict) -> None:
+         self.key = key
+         self.value = values["value"]
+         self.description = values.get("description")
+         updated_at = values.get("updatedAt")
+         self.updated_at = parse_datetime(updated_at) if updated_at is not None else None
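
To see how a raw runtime payload maps onto the dataclass above, here is a small sketch; the dictionary mimics the shape consumed by `SpaceRuntime.__init__` and is illustrative, not a real API response:

```py
runtime = SpaceRuntime(
    {
        "stage": "RUNNING",
        "hardware": {"current": "cpu-basic", "requested": "t4-small"},
        "gcTimeout": 3600,
        "storage": "small",
    }
)
assert runtime.stage == SpaceStage.RUNNING         # str-enums compare equal to plain strings
assert runtime.hardware == SpaceHardware.CPU_BASIC
assert runtime.requested_hardware == SpaceHardware.T4_SMALL
assert runtime.sleep_time == 3600
```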
venv/lib/python3.13/site-packages/huggingface_hub/_tensorboard_logger.py ADDED
@@ -0,0 +1,190 @@
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains a logger to push training logs to the Hub, using Tensorboard."""
+ 
+ from pathlib import Path
+ from typing import List, Optional, Union
+ 
+ from ._commit_scheduler import CommitScheduler
+ from .errors import EntryNotFoundError
+ from .repocard import ModelCard
+ from .utils import experimental
+ 
+ 
+ # Depending on the user's setup, SummaryWriter can come either from 'tensorboardX'
+ # or from 'torch.utils.tensorboard'. Both are compatible so let's try to load
+ # from either of them.
+ try:
+     from tensorboardX import SummaryWriter as _RuntimeSummaryWriter
+ 
+     is_summary_writer_available = True
+ except ImportError:
+     try:
+         from torch.utils.tensorboard import SummaryWriter as _RuntimeSummaryWriter
+ 
+         is_summary_writer_available = True
+     except ImportError:
+         # Dummy class to avoid failing at import. Will raise on instance creation.
+         class _DummySummaryWriter:
+             pass
+ 
+         _RuntimeSummaryWriter = _DummySummaryWriter  # type: ignore[assignment]
+         is_summary_writer_available = False
+ 
+ 
+ class HFSummaryWriter(_RuntimeSummaryWriter):
+     """
+     Wrapper around tensorboard's `SummaryWriter` to push training logs to the Hub.
+ 
+     Data is logged locally and then pushed to the Hub asynchronously. Pushing data to the Hub is done in a separate
+     thread to avoid blocking the training script. In particular, if the upload fails for any reason (e.g. a connection
+     issue), the main script will not be interrupted. Data is automatically pushed to the Hub every `commit_every`
+     minutes (defaults to every 5 minutes).
+ 
+     > [!WARNING]
+     > `HFSummaryWriter` is experimental. Its API is subject to change in the future without prior notice.
+ 
+     Args:
+         repo_id (`str`):
+             The id of the repo to which the logs will be pushed.
+         logdir (`str`, *optional*):
+             The directory where the logs will be written. If not specified, a local directory will be created by the
+             underlying `SummaryWriter` object.
+         commit_every (`int` or `float`, *optional*):
+             The frequency (in minutes) at which the logs will be pushed to the Hub. Defaults to 5 minutes.
+         squash_history (`bool`, *optional*):
+             Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
+             useful to avoid degraded performance on the repo when it grows too large.
+         repo_type (`str`, *optional*):
+             The type of the repo to which the logs will be pushed. Defaults to "model".
+         repo_revision (`str`, *optional*):
+             The revision of the repo to which the logs will be pushed. Defaults to "main".
+         repo_private (`bool`, *optional*):
+             Whether to make the repo private. If `None` (default), the repo will be public unless the organization's
+             default is private. This value is ignored if the repo already exists.
+         path_in_repo (`str`, *optional*):
+             The path to the folder in the repo where the logs will be pushed. Defaults to "tensorboard/".
+         repo_allow_patterns (`List[str]` or `str`, *optional*):
+             A list of patterns to include in the upload. Defaults to `"*.tfevents.*"`. Check out the
+             [upload guide](https://huggingface.co/docs/huggingface_hub/guides/upload#upload-a-folder) for more details.
+         repo_ignore_patterns (`List[str]` or `str`, *optional*):
+             A list of patterns to exclude from the upload. Check out the
+             [upload guide](https://huggingface.co/docs/huggingface_hub/guides/upload#upload-a-folder) for more details.
+         token (`str`, *optional*):
+             Authentication token. Will default to the stored token. See https://huggingface.co/settings/token for more
+             details.
+         kwargs:
+             Additional keyword arguments passed to `SummaryWriter`.
+ 
+     Examples:
+     ```diff
+     # Taken from https://pytorch.org/docs/stable/tensorboard.html
+     - from torch.utils.tensorboard import SummaryWriter
+     + from huggingface_hub import HFSummaryWriter
+ 
+     import numpy as np
+ 
+     - writer = SummaryWriter()
+     + writer = HFSummaryWriter(repo_id="username/my-trained-model")
+ 
+     for n_iter in range(100):
+         writer.add_scalar('Loss/train', np.random.random(), n_iter)
+         writer.add_scalar('Loss/test', np.random.random(), n_iter)
+         writer.add_scalar('Accuracy/train', np.random.random(), n_iter)
+         writer.add_scalar('Accuracy/test', np.random.random(), n_iter)
+     ```
+ 
+     ```py
+     >>> from huggingface_hub import HFSummaryWriter
+ 
+     # Logs are automatically pushed every 15 minutes (5 by default) + when exiting the context manager
+     >>> with HFSummaryWriter(repo_id="test_hf_logger", commit_every=15) as logger:
+     ...     logger.add_scalar("a", 1)
+     ...     logger.add_scalar("b", 2)
+     ```
+     """
+ 
+     @experimental
+     def __new__(cls, *args, **kwargs) -> "HFSummaryWriter":
+         if not is_summary_writer_available:
+             raise ImportError(
+                 "You must have `tensorboard` installed to use `HFSummaryWriter`. Please run `pip install --upgrade"
+                 " tensorboardX` first."
+             )
+         return super().__new__(cls)
+ 
+     def __init__(
+         self,
+         repo_id: str,
+         *,
+         logdir: Optional[str] = None,
+         commit_every: Union[int, float] = 5,
+         squash_history: bool = False,
+         repo_type: Optional[str] = None,
+         repo_revision: Optional[str] = None,
+         repo_private: Optional[bool] = None,
+         path_in_repo: Optional[str] = "tensorboard",
+         repo_allow_patterns: Optional[Union[List[str], str]] = "*.tfevents.*",
+         repo_ignore_patterns: Optional[Union[List[str], str]] = None,
+         token: Optional[str] = None,
+         **kwargs,
+     ):
+         # Initialize SummaryWriter
+         super().__init__(logdir=logdir, **kwargs)
+ 
+         # Check logdir has been correctly initialized and fail early otherwise. In practice, SummaryWriter takes care of it.
+         if not isinstance(self.logdir, str):
+             raise ValueError(f"`self.logdir` must be a string. Got '{self.logdir}' of type {type(self.logdir)}.")
+ 
+         # Append logdir name to `path_in_repo`
+         if path_in_repo is None or path_in_repo == "":
+             path_in_repo = Path(self.logdir).name
+         else:
+             path_in_repo = path_in_repo.strip("/") + "/" + Path(self.logdir).name
+ 
+         # Initialize scheduler
+         self.scheduler = CommitScheduler(
+             folder_path=self.logdir,
+             path_in_repo=path_in_repo,
+             repo_id=repo_id,
+             repo_type=repo_type,
+             revision=repo_revision,
+             private=repo_private,
+             token=token,
+             allow_patterns=repo_allow_patterns,
+             ignore_patterns=repo_ignore_patterns,
+             every=commit_every,
+             squash_history=squash_history,
+         )
+ 
+         # Expose some high-level info at root level
+         self.repo_id = self.scheduler.repo_id
+         self.repo_type = self.scheduler.repo_type
+         self.repo_revision = self.scheduler.revision
+ 
+         # Add the `hf-summary-writer` tag to the model card metadata
+         try:
+             card = ModelCard.load(repo_id_or_path=self.repo_id, repo_type=self.repo_type)
+         except EntryNotFoundError:
+             card = ModelCard("")
+         tags = card.data.get("tags", [])
+         if "hf-summary-writer" not in tags:
+             tags.append("hf-summary-writer")
+             card.data["tags"] = tags
+             card.push_to_hub(repo_id=self.repo_id, repo_type=self.repo_type)
+ 
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Push to the Hub when exiting the logger's context manager, waiting for the push to complete."""
+         super().__exit__(exc_type, exc_val, exc_tb)
+         future = self.scheduler.trigger()
+         future.result()
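
Beyond the periodic background pushes, the `__exit__` implementation above shows that the underlying `CommitScheduler` can also be triggered manually. A minimal sketch, assuming `tensorboard` (or `tensorboardX`) is installed and with an illustrative repo id:

```py
from huggingface_hub import HFSummaryWriter

writer = HFSummaryWriter(repo_id="username/my-trained-model", commit_every=5)
writer.add_scalar("loss", 0.42, 1)
# Force an immediate push instead of waiting for the next scheduled commit;
# `trigger()` returns a Future, so `.result()` blocks until the upload is done.
writer.scheduler.trigger().result()
```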
venv/lib/python3.13/site-packages/huggingface_hub/_upload_large_folder.py ADDED
@@ -0,0 +1,755 @@
+ # coding=utf-8
+ # Copyright 2024-present, the HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import enum
+ import logging
+ import os
+ import queue
+ import shutil
+ import sys
+ import threading
+ import time
+ import traceback
+ from datetime import datetime
+ from pathlib import Path
+ from threading import Lock
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+ from urllib.parse import quote
+ 
+ from . import constants
+ from ._commit_api import CommitOperationAdd, UploadInfo, _fetch_upload_modes
+ from ._local_folder import LocalUploadFileMetadata, LocalUploadFilePaths, get_local_upload_paths, read_upload_metadata
+ from .constants import DEFAULT_REVISION, REPO_TYPES
+ from .utils import DEFAULT_IGNORE_PATTERNS, filter_repo_objects, tqdm
+ from .utils._cache_manager import _format_size
+ from .utils._runtime import is_xet_available
+ from .utils.sha import sha_fileobj
+ 
+ 
+ if TYPE_CHECKING:
+     from .hf_api import HfApi
+ 
+ logger = logging.getLogger(__name__)
+ 
+ WAITING_TIME_IF_NO_TASKS = 10  # seconds
+ MAX_NB_FILES_FETCH_UPLOAD_MODE = 100
+ COMMIT_SIZE_SCALE: List[int] = [20, 50, 75, 100, 125, 200, 250, 400, 600, 1000]
+ 
+ UPLOAD_BATCH_SIZE_XET = 256  # Max 256 files per upload batch for XET-enabled repos
+ UPLOAD_BATCH_SIZE_LFS = 1  # Otherwise, batches of 1 for regular LFS upload
+ 
+ # Repository limits (from https://huggingface.co/docs/hub/repositories-recommendations)
+ MAX_FILES_PER_REPO = 100_000  # Recommended maximum number of files per repository
+ MAX_FILES_PER_FOLDER = 10_000  # Recommended maximum number of files per folder
+ MAX_FILE_SIZE_GB = 50  # Hard limit for individual file size
+ RECOMMENDED_FILE_SIZE_GB = 20  # Recommended maximum for individual file size
+ 
+ 
+ def _validate_upload_limits(paths_list: List[LocalUploadFilePaths]) -> None:
+     """
+     Validate upload against repository limits and warn about potential issues.
+ 
+     Args:
+         paths_list: List of file paths to be uploaded
+ 
+     Warns about:
+     - Too many files in the repository (>100k)
+     - Too many entries (files or subdirectories) in a single folder (>10k)
+     - Files exceeding size limits (>20GB recommended, >50GB hard limit)
+     """
+     logger.info("Running validation checks on files to upload...")
+ 
+     # Check 1: Total file count
+     if len(paths_list) > MAX_FILES_PER_REPO:
+         logger.warning(
+             f"You are about to upload {len(paths_list):,} files. "
+             f"This exceeds the recommended limit of {MAX_FILES_PER_REPO:,} files per repository.\n"
+             f"Consider:\n"
+             f"  - Splitting your data into multiple repositories\n"
+             f"  - Using fewer, larger files (e.g., parquet files)\n"
+             f"  - See: https://huggingface.co/docs/hub/repositories-recommendations"
+         )
+ 
+     # Check 2: Files and subdirectories per folder
+     # Track immediate children (files and subdirs) for each folder
+     from collections import defaultdict
+ 
+     entries_per_folder: Dict[str, Any] = defaultdict(lambda: {"files": 0, "subdirs": set()})
+ 
+     for paths in paths_list:
+         path = Path(paths.path_in_repo)
+         parts = path.parts
+ 
+         # Count this file in its immediate parent directory
+         parent = str(path.parent) if str(path.parent) != "." else "."
+         entries_per_folder[parent]["files"] += 1
+ 
+         # Track immediate subdirectories for each parent folder
+         # Walk through the path components to track parent-child relationships
+         for i, child in enumerate(parts[:-1]):
+             parent = "." if i == 0 else "/".join(parts[:i])
+             entries_per_folder[parent]["subdirs"].add(child)
+ 
+     # Check limits for each folder
+     for folder, data in entries_per_folder.items():
+         file_count = data["files"]
+         subdir_count = len(data["subdirs"])
+         total_entries = file_count + subdir_count
+ 
+         if total_entries > MAX_FILES_PER_FOLDER:
+             folder_display = "root" if folder == "." else folder
+             logger.warning(
+                 f"Folder '{folder_display}' contains {total_entries:,} entries "
+                 f"({file_count:,} files and {subdir_count:,} subdirectories). "
+                 f"This exceeds the recommended {MAX_FILES_PER_FOLDER:,} entries per folder.\n"
+                 "Consider reorganising into sub-folders."
+             )
+ 
+     # Check 3: File sizes
+     large_files = []
+     very_large_files = []
+ 
+     for paths in paths_list:
+         size = paths.file_path.stat().st_size
+         size_gb = size / 1_000_000_000  # Use decimal GB as per Hub limits
+ 
+         if size_gb > MAX_FILE_SIZE_GB:
+             very_large_files.append((paths.path_in_repo, size_gb))
+         elif size_gb > RECOMMENDED_FILE_SIZE_GB:
+             large_files.append((paths.path_in_repo, size_gb))
+ 
+     # Warn about very large files (>50GB)
+     if very_large_files:
+         files_str = "\n  - ".join(f"{path}: {size:.1f}GB" for path, size in very_large_files[:5])
+         more_str = f"\n  ... and {len(very_large_files) - 5} more files" if len(very_large_files) > 5 else ""
+         logger.warning(
+             f"Found {len(very_large_files)} files exceeding the {MAX_FILE_SIZE_GB}GB hard limit:\n"
+             f"  - {files_str}{more_str}\n"
+             f"These files may fail to upload. Consider splitting them into smaller chunks."
+         )
+ 
+     # Warn about large files (>20GB)
+     if large_files:
+         files_str = "\n  - ".join(f"{path}: {size:.1f}GB" for path, size in large_files[:5])
+         more_str = f"\n  ... and {len(large_files) - 5} more files" if len(large_files) > 5 else ""
+         logger.warning(
+             f"Found {len(large_files)} files larger than {RECOMMENDED_FILE_SIZE_GB}GB (recommended limit):\n"
+             f"  - {files_str}{more_str}\n"
+             f"Large files may slow down loading and processing."
+         )
+ 
+     logger.info("Validation checks complete.")
+ 
+ 
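
To make the per-folder bookkeeping in `_validate_upload_limits` concrete, here is a self-contained sketch of the same counting scheme on made-up paths:

```py
from collections import defaultdict
from pathlib import Path

entries = defaultdict(lambda: {"files": 0, "subdirs": set()})
for rel in ["data/train/a.parquet", "data/train/b.parquet", "data/val/a.parquet", "README.md"]:
    path = Path(rel)
    # Count the file in its immediate parent ("." for repo-root files)
    entries[str(path.parent)]["files"] += 1
    # Record each intermediate directory as a subdir of its own parent
    parts = path.parts
    for i, child in enumerate(parts[:-1]):
        entries["." if i == 0 else "/".join(parts[:i])]["subdirs"].add(child)

# "data" holds 0 direct files and 2 subdirs; the root holds 1 file and 1 subdir.
assert entries["data"] == {"files": 0, "subdirs": {"train", "val"}}
assert entries["."]["files"] == 1 and entries["."]["subdirs"] == {"data"}
```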
155
+ def upload_large_folder_internal(
156
+ api: "HfApi",
157
+ repo_id: str,
158
+ folder_path: Union[str, Path],
159
+ *,
160
+ repo_type: str, # Repo type is required!
161
+ revision: Optional[str] = None,
162
+ private: Optional[bool] = None,
163
+ allow_patterns: Optional[Union[List[str], str]] = None,
164
+ ignore_patterns: Optional[Union[List[str], str]] = None,
165
+ num_workers: Optional[int] = None,
166
+ print_report: bool = True,
167
+ print_report_every: int = 60,
168
+ ):
169
+ """Upload a large folder to the Hub in the most resilient way possible.
170
+
171
+ See [`HfApi.upload_large_folder`] for the full documentation.
172
+ """
173
+ # 1. Check args and setup
174
+ if repo_type is None:
175
+ raise ValueError(
176
+ "For large uploads, `repo_type` is explicitly required. Please set it to `model`, `dataset` or `space`."
177
+ " If you are using the CLI, pass it as `--repo-type=model`."
178
+ )
179
+ if repo_type not in REPO_TYPES:
180
+ raise ValueError(f"Invalid repo type, must be one of {REPO_TYPES}")
181
+ if revision is None:
182
+ revision = DEFAULT_REVISION
183
+
184
+ folder_path = Path(folder_path).expanduser().resolve()
185
+ if not folder_path.is_dir():
186
+ raise ValueError(f"Provided path: '{folder_path}' is not a directory")
187
+
188
+ if ignore_patterns is None:
189
+ ignore_patterns = []
190
+ elif isinstance(ignore_patterns, str):
191
+ ignore_patterns = [ignore_patterns]
192
+ ignore_patterns += DEFAULT_IGNORE_PATTERNS
193
+
194
+ if num_workers is None:
195
+ nb_cores = os.cpu_count() or 1
196
+ num_workers = max(nb_cores - 2, 2) # Use all but 2 cores, or at least 2 cores
197
+
198
+ # 2. Create repo if missing
199
+ repo_url = api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private, exist_ok=True)
200
+ logger.info(f"Repo created: {repo_url}")
201
+ repo_id = repo_url.repo_id
202
+ # 2.1 Check if xet is enabled to set batch file upload size
203
+ is_xet_enabled = (
204
+ is_xet_available()
205
+ and api.repo_info(
206
+ repo_id=repo_id,
207
+ repo_type=repo_type,
208
+ revision=revision,
209
+ expand="xetEnabled",
210
+ ).xet_enabled
211
+ )
212
+ upload_batch_size = UPLOAD_BATCH_SIZE_XET if is_xet_enabled else UPLOAD_BATCH_SIZE_LFS
213
+
214
+ # 3. List files to upload
215
+ filtered_paths_list = filter_repo_objects(
216
+ (path.relative_to(folder_path).as_posix() for path in folder_path.glob("**/*") if path.is_file()),
217
+ allow_patterns=allow_patterns,
218
+ ignore_patterns=ignore_patterns,
219
+ )
220
+ paths_list = [get_local_upload_paths(folder_path, relpath) for relpath in filtered_paths_list]
221
+ logger.info(f"Found {len(paths_list)} candidate files to upload")
222
+
223
+ # Validate upload against repository limits
224
+ _validate_upload_limits(paths_list)
225
+
226
+ logger.info("Starting upload...")
227
+
228
+ # Read metadata for each file
229
+ items = [
230
+ (paths, read_upload_metadata(folder_path, paths.path_in_repo))
231
+ for paths in tqdm(paths_list, desc="Recovering from metadata files")
232
+ ]
233
+
234
+ # 4. Start workers
235
+ status = LargeUploadStatus(items, upload_batch_size)
236
+ threads = [
237
+ threading.Thread(
238
+ target=_worker_job,
239
+ kwargs={
240
+ "status": status,
241
+ "api": api,
242
+ "repo_id": repo_id,
243
+ "repo_type": repo_type,
244
+ "revision": revision,
245
+ },
246
+ )
247
+ for _ in range(num_workers)
248
+ ]
249
+
250
+ for thread in threads:
251
+ thread.start()
252
+
253
+ # 5. Print regular reports
254
+ if print_report:
255
+ print("\n\n" + status.current_report())
256
+ last_report_ts = time.time()
257
+ while True:
258
+ time.sleep(1)
259
+ if time.time() - last_report_ts >= print_report_every:
260
+ if print_report:
261
+ _print_overwrite(status.current_report())
262
+ last_report_ts = time.time()
263
+ if status.is_done():
264
+ logging.info("Is done: exiting main loop")
265
+ break
266
+
267
+ for thread in threads:
268
+ thread.join()
269
+
270
+ logger.info(status.current_report())
271
+ logging.info("Upload is complete!")
272
+
273
+
274
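A minimal usage sketch of the public entry point this function backs (the repo id and folder path below are hypothetical; the per-file metadata recovered above is what makes the call resumable after an interruption):

```python
from huggingface_hub import HfApi

api = HfApi()  # assumes a token is already configured (HF_TOKEN or `huggingface-cli login`)
api.upload_large_folder(
    repo_id="my-username/my-large-dataset",  # hypothetical repo
    repo_type="dataset",                     # explicitly required for large uploads
    folder_path="./data",                    # hypothetical local folder
)
```

Re-running the same call after a crash skips files that are already hashed, pre-uploaded, or committed.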
+####################
+# Logic to manage workers and synchronize tasks
+####################
+
+
+class WorkerJob(enum.Enum):
+    SHA256 = enum.auto()
+    GET_UPLOAD_MODE = enum.auto()
+    PREUPLOAD_LFS = enum.auto()
+    COMMIT = enum.auto()
+    WAIT = enum.auto()  # if no tasks are available but we don't want to exit
+
+
+JOB_ITEM_T = Tuple[LocalUploadFilePaths, LocalUploadFileMetadata]
+
+
+class LargeUploadStatus:
+    """Contains information, queues and tasks for a large upload process."""
+
+    def __init__(self, items: List[JOB_ITEM_T], upload_batch_size: int = 1):
+        self.items = items
+        self.queue_sha256: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
+        self.queue_get_upload_mode: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
+        self.queue_preupload_lfs: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
+        self.queue_commit: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
+        self.lock = Lock()
+
+        self.nb_workers_sha256: int = 0
+        self.nb_workers_get_upload_mode: int = 0
+        self.nb_workers_preupload_lfs: int = 0
+        self.upload_batch_size: int = upload_batch_size
+        self.nb_workers_commit: int = 0
+        self.nb_workers_waiting: int = 0
+        self.last_commit_attempt: Optional[float] = None
+
+        self._started_at = datetime.now()
+        self._chunk_idx: int = 1
+        self._chunk_lock: Lock = Lock()
+
+        # Setup queues
+        for item in self.items:
+            paths, metadata = item
+            if metadata.sha256 is None:
+                self.queue_sha256.put(item)
+            elif metadata.upload_mode is None:
+                self.queue_get_upload_mode.put(item)
+            elif metadata.upload_mode == "lfs" and not metadata.is_uploaded:
+                self.queue_preupload_lfs.put(item)
+            elif not metadata.is_committed:
+                self.queue_commit.put(item)
+            else:
+                logger.debug(f"Skipping file {paths.path_in_repo} (already uploaded and committed)")
+
+    def target_chunk(self) -> int:
+        with self._chunk_lock:
+            return COMMIT_SIZE_SCALE[self._chunk_idx]
+
+    def update_chunk(self, success: bool, nb_items: int, duration: float) -> None:
+        with self._chunk_lock:
+            if not success:
+                logger.warning(f"Failed to commit {nb_items} files at once. Will retry with fewer files in next batch.")
+                self._chunk_idx -= 1
+            elif nb_items >= COMMIT_SIZE_SCALE[self._chunk_idx] and duration < 40:
+                logger.info(f"Successfully committed {nb_items} files at once. Increasing the limit for next batch.")
+                self._chunk_idx += 1
+
+            self._chunk_idx = max(0, min(self._chunk_idx, len(COMMIT_SIZE_SCALE) - 1))
+
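The commit batch size moves one step at a time over a fixed scale: down after a failed commit, up after a fast full batch. A toy standalone sketch of the same rule (the `COMMIT_SIZE_SCALE` values here are illustrative, not necessarily the library's actual constants):

```python
COMMIT_SIZE_SCALE = [20, 50, 75, 100, 125, 200, 250, 400, 600]  # illustrative values

idx = 1  # start one step above the minimum, as __init__ does

def after_commit(success: bool, nb_items: int, duration: float) -> int:
    """Return the next commit batch size after a commit attempt."""
    global idx
    if not success:
        idx -= 1                                              # back off after a failure
    elif nb_items >= COMMIT_SIZE_SCALE[idx] and duration < 40:
        idx += 1                                              # fast, full batch: grow
    idx = max(0, min(idx, len(COMMIT_SIZE_SCALE) - 1))        # clamp to the scale
    return COMMIT_SIZE_SCALE[idx]

print(after_commit(True, 50, 12.0))   # 75 -> batch grows
print(after_commit(False, 75, 95.0))  # 50 -> back off after a failure
```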
+    def current_report(self) -> str:
+        """Generate a report of the current status of the large upload."""
+        nb_hashed = 0
+        size_hashed = 0
+        nb_preuploaded = 0
+        nb_lfs = 0
+        nb_lfs_unsure = 0
+        size_preuploaded = 0
+        nb_committed = 0
+        size_committed = 0
+        total_size = 0
+        ignored_files = 0
+        total_files = 0
+
+        with self.lock:
+            for _, metadata in self.items:
+                if metadata.should_ignore:
+                    ignored_files += 1
+                    continue
+                total_size += metadata.size
+                total_files += 1
+                if metadata.sha256 is not None:
+                    nb_hashed += 1
+                    size_hashed += metadata.size
+                if metadata.upload_mode == "lfs":
+                    nb_lfs += 1
+                if metadata.upload_mode is None:
+                    nb_lfs_unsure += 1
+                if metadata.is_uploaded:
+                    nb_preuploaded += 1
+                    size_preuploaded += metadata.size
+                if metadata.is_committed:
+                    nb_committed += 1
+                    size_committed += metadata.size
+        total_size_str = _format_size(total_size)
+
+        now = datetime.now()
+        now_str = now.strftime("%Y-%m-%d %H:%M:%S")
+        elapsed = now - self._started_at
+        elapsed_str = str(elapsed).split(".")[0]  # remove microseconds
+
+        message = "\n" + "-" * 10
+        message += f" {now_str} ({elapsed_str}) "
+        message += "-" * 10 + "\n"
+
+        message += "Files: "
+        message += f"hashed {nb_hashed}/{total_files} ({_format_size(size_hashed)}/{total_size_str}) | "
+        message += f"pre-uploaded: {nb_preuploaded}/{nb_lfs} ({_format_size(size_preuploaded)}/{total_size_str})"
+        if nb_lfs_unsure > 0:
+            message += f" (+{nb_lfs_unsure} unsure)"
+        message += f" | committed: {nb_committed}/{total_files} ({_format_size(size_committed)}/{total_size_str})"
+        message += f" | ignored: {ignored_files}\n"
+
+        message += "Workers: "
+        message += f"hashing: {self.nb_workers_sha256} | "
+        message += f"get upload mode: {self.nb_workers_get_upload_mode} | "
+        message += f"pre-uploading: {self.nb_workers_preupload_lfs} | "
+        message += f"committing: {self.nb_workers_commit} | "
+        message += f"waiting: {self.nb_workers_waiting}\n"
+        message += "-" * 51
+
+        return message
+
+    def is_done(self) -> bool:
+        with self.lock:
+            return all(metadata.is_committed or metadata.should_ignore for _, metadata in self.items)
+
+
+def _worker_job(
+    status: LargeUploadStatus,
+    api: "HfApi",
+    repo_id: str,
+    repo_type: str,
+    revision: str,
+):
+    """
+    Main process for a worker. The worker will perform tasks based on the priority list until all files are uploaded
+    and committed. If no tasks are available, the worker will wait for 10 seconds before checking again.
+
+    If a task fails for any reason, the item(s) are put back in the queue for another worker to pick up.
+
+    Read `upload_large_folder` docstring for more information on how tasks are prioritized.
+    """
+    while True:
+        next_job: Optional[Tuple[WorkerJob, List[JOB_ITEM_T]]] = None
+
+        # Determine next task
+        next_job = _determine_next_job(status)
+        if next_job is None:
+            return
+        job, items = next_job
+
+        # Perform task
+        if job == WorkerJob.SHA256:
+            item = items[0]  # single item
+            try:
+                _compute_sha256(item)
+                status.queue_get_upload_mode.put(item)
+            except KeyboardInterrupt:
+                raise
+            except Exception as e:
+                logger.error(f"Failed to compute sha256: {e}")
+                logger.error(traceback.format_exc())
+                status.queue_sha256.put(item)
+
+            with status.lock:
+                status.nb_workers_sha256 -= 1
+
+        elif job == WorkerJob.GET_UPLOAD_MODE:
+            try:
+                _get_upload_mode(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
+            except KeyboardInterrupt:
+                raise
+            except Exception as e:
+                logger.error(f"Failed to get upload mode: {e}")
+                logger.error(traceback.format_exc())
+
+            # Items are either:
+            # - dropped (if should_ignore)
+            # - put in LFS queue (if LFS)
+            # - put in commit queue (if regular)
+            # - or put back (if error occurred).
+            for item in items:
+                _, metadata = item
+                if metadata.should_ignore:
+                    continue
+                if metadata.upload_mode == "lfs":
+                    status.queue_preupload_lfs.put(item)
+                elif metadata.upload_mode == "regular":
+                    status.queue_commit.put(item)
+                else:
+                    status.queue_get_upload_mode.put(item)
+
+            with status.lock:
+                status.nb_workers_get_upload_mode -= 1
+
+        elif job == WorkerJob.PREUPLOAD_LFS:
+            try:
+                _preupload_lfs(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
+                for item in items:
+                    status.queue_commit.put(item)
+            except KeyboardInterrupt:
+                raise
+            except Exception as e:
+                logger.error(f"Failed to preupload LFS: {e}")
+                logger.error(traceback.format_exc())
+                for item in items:
+                    status.queue_preupload_lfs.put(item)
+
+            with status.lock:
+                status.nb_workers_preupload_lfs -= 1
+
+        elif job == WorkerJob.COMMIT:
+            start_ts = time.time()
+            success = True
+            try:
+                _commit(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
+            except KeyboardInterrupt:
+                raise
+            except Exception as e:
+                logger.error(f"Failed to commit: {e}")
+                logger.error(traceback.format_exc())
+                for item in items:
+                    status.queue_commit.put(item)
+                success = False
+            duration = time.time() - start_ts
+            status.update_chunk(success, len(items), duration)
+            with status.lock:
+                status.last_commit_attempt = time.time()
+                status.nb_workers_commit -= 1
+
+        elif job == WorkerJob.WAIT:
+            time.sleep(WAITING_TIME_IF_NO_TASKS)
+            with status.lock:
+                status.nb_workers_waiting -= 1
+
+
+def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob, List[JOB_ITEM_T]]]:
+    with status.lock:
+        # 1. Commit if more than 5 minutes since last commit attempt (and at least 1 file)
+        if (
+            status.nb_workers_commit == 0
+            and status.queue_commit.qsize() > 0
+            and status.last_commit_attempt is not None
+            and time.time() - status.last_commit_attempt > 5 * 60
+        ):
+            status.nb_workers_commit += 1
+            logger.debug("Job: commit (more than 5 minutes since last commit attempt)")
+            return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))
+
+        # 2. Commit if at least 150 files are ready to commit
+        elif status.nb_workers_commit == 0 and status.queue_commit.qsize() >= 150:
+            status.nb_workers_commit += 1
+            logger.debug("Job: commit (>150 files ready)")
+            return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))
+
+        # 3. Get upload mode if at least 100 files
+        elif status.queue_get_upload_mode.qsize() >= MAX_NB_FILES_FETCH_UPLOAD_MODE:
+            status.nb_workers_get_upload_mode += 1
+            logger.debug(f"Job: get upload mode (>{MAX_NB_FILES_FETCH_UPLOAD_MODE} files ready)")
+            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE))
+
+        # 4. Preupload LFS file if at least `status.upload_batch_size` files and no worker is preuploading LFS
+        elif status.queue_preupload_lfs.qsize() >= status.upload_batch_size and status.nb_workers_preupload_lfs == 0:
+            status.nb_workers_preupload_lfs += 1
+            logger.debug("Job: preupload LFS (no other worker preuploading LFS)")
+            return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, status.upload_batch_size))
+
+        # 5. Compute sha256 if at least 1 file and no worker is computing sha256
+        elif status.queue_sha256.qsize() > 0 and status.nb_workers_sha256 == 0:
+            status.nb_workers_sha256 += 1
+            logger.debug("Job: sha256 (no other worker computing sha256)")
+            return (WorkerJob.SHA256, _get_one(status.queue_sha256))
+
+        # 6. Get upload mode if at least 1 file and no worker is getting upload mode
+        elif status.queue_get_upload_mode.qsize() > 0 and status.nb_workers_get_upload_mode == 0:
+            status.nb_workers_get_upload_mode += 1
+            logger.debug("Job: get upload mode (no other worker getting upload mode)")
+            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE))
+
+        # 7. Preupload LFS file if at least `status.upload_batch_size` files
+        # Skip if hf_transfer is enabled and there is already a worker preuploading LFS
+        elif status.queue_preupload_lfs.qsize() >= status.upload_batch_size and (
+            status.nb_workers_preupload_lfs == 0 or not constants.HF_HUB_ENABLE_HF_TRANSFER
+        ):
+            status.nb_workers_preupload_lfs += 1
+            logger.debug("Job: preupload LFS")
+            return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, status.upload_batch_size))
+
+        # 8. Compute sha256 if at least 1 file
+        elif status.queue_sha256.qsize() > 0:
+            status.nb_workers_sha256 += 1
+            logger.debug("Job: sha256")
+            return (WorkerJob.SHA256, _get_one(status.queue_sha256))
+
+        # 9. Get upload mode if at least 1 file
+        elif status.queue_get_upload_mode.qsize() > 0:
+            status.nb_workers_get_upload_mode += 1
+            logger.debug("Job: get upload mode")
+            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE))
+
+        # 10. Preupload LFS file if at least 1 file
+        elif status.queue_preupload_lfs.qsize() > 0:
+            status.nb_workers_preupload_lfs += 1
+            logger.debug("Job: preupload LFS")
+            return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, status.upload_batch_size))
+
+        # 11. Commit if at least 1 file and 1 min since last commit attempt
+        elif (
+            status.nb_workers_commit == 0
+            and status.queue_commit.qsize() > 0
+            and status.last_commit_attempt is not None
+            and time.time() - status.last_commit_attempt > 1 * 60
+        ):
+            status.nb_workers_commit += 1
+            logger.debug("Job: commit (1 min since last commit attempt)")
+            return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))
+
+        # 12. Commit if at least 1 file, all other queues are empty and all workers are waiting
+        # e.g. when it's the last commit
+        elif (
+            status.nb_workers_commit == 0
+            and status.queue_commit.qsize() > 0
+            and status.queue_sha256.qsize() == 0
+            and status.queue_get_upload_mode.qsize() == 0
+            and status.queue_preupload_lfs.qsize() == 0
+            and status.nb_workers_sha256 == 0
+            and status.nb_workers_get_upload_mode == 0
+            and status.nb_workers_preupload_lfs == 0
+        ):
+            status.nb_workers_commit += 1
+            logger.debug("Job: commit")
+            return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))
+
+        # 13. If all files have been processed, exit
+        elif all(metadata.is_committed or metadata.should_ignore for _, metadata in status.items):
+            logger.info("All files have been processed! Exiting worker.")
+            return None
+
+        # 14. If no task is available, wait
+        else:
+            status.nb_workers_waiting += 1
+            logger.debug(f"No task available, waiting... ({WAITING_TIME_IF_NO_TASKS}s)")
+            return (WorkerJob.WAIT, [])
+
+
+####################
+# Atomic jobs (sha256, get_upload_mode, preupload_lfs, commit)
+####################
+
+
+def _compute_sha256(item: JOB_ITEM_T) -> None:
+    """Compute sha256 of a file and save it in metadata."""
+    paths, metadata = item
+    if metadata.sha256 is None:
+        with paths.file_path.open("rb") as f:
+            metadata.sha256 = sha_fileobj(f).hex()
+    metadata.save(paths)
+
+
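`sha_fileobj` (imported from the library's utils) streams the file through SHA-256 so that multi-gigabyte files never need to fit in memory. A minimal equivalent, shown only to clarify the behavior (the 1 MB chunk size is an assumption, not the library's exact value):

```python
import hashlib
from typing import BinaryIO

def sha_fileobj_sketch(fileobj: BinaryIO, chunk_size: int = 1024 * 1024) -> bytes:
    """Hash `fileobj` with SHA-256, reading one chunk at a time."""
    sha = hashlib.sha256()
    while True:
        chunk = fileobj.read(chunk_size)
        if not chunk:
            break
        sha.update(chunk)
    return sha.digest()  # `.hex()` is then called on the result, as above

with open("some_local_file.bin", "rb") as f:  # hypothetical file
    print(sha_fileobj_sketch(f).hex())
```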
+def _get_upload_mode(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
+    """Get upload mode for each file and update metadata.
+
+    Also receives info on whether the file should be ignored.
+    """
+    additions = [_build_hacky_operation(item) for item in items]
+    _fetch_upload_modes(
+        additions=additions,
+        repo_type=repo_type,
+        repo_id=repo_id,
+        headers=api._build_hf_headers(),
+        revision=quote(revision, safe=""),
+        endpoint=api.endpoint,
+    )
+    for item, addition in zip(items, additions):
+        paths, metadata = item
+        metadata.upload_mode = addition._upload_mode
+        metadata.should_ignore = addition._should_ignore
+        metadata.remote_oid = addition._remote_oid
+        metadata.save(paths)
+
+
+def _preupload_lfs(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
+    """Preupload LFS files and update metadata."""
+    additions = [_build_hacky_operation(item) for item in items]
+    api.preupload_lfs_files(
+        repo_id=repo_id,
+        repo_type=repo_type,
+        revision=revision,
+        additions=additions,
+    )
+
+    for paths, metadata in items:
+        metadata.is_uploaded = True
+        metadata.save(paths)
+
+
+def _commit(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
+    """Commit files to the repo."""
+    additions = [_build_hacky_operation(item) for item in items]
+    api.create_commit(
+        repo_id=repo_id,
+        repo_type=repo_type,
+        revision=revision,
+        operations=additions,
+        commit_message="Add files using upload-large-folder tool",
+    )
+    for paths, metadata in items:
+        metadata.is_committed = True
+        metadata.save(paths)
+
+
+####################
+# Hacks with CommitOperationAdd to bypass checks/sha256 calculation
+####################
+
+
+class HackyCommitOperationAdd(CommitOperationAdd):
+    def __post_init__(self) -> None:
+        if isinstance(self.path_or_fileobj, Path):
+            self.path_or_fileobj = str(self.path_or_fileobj)
+
+
+def _build_hacky_operation(item: JOB_ITEM_T) -> HackyCommitOperationAdd:
+    paths, metadata = item
+    operation = HackyCommitOperationAdd(path_in_repo=paths.path_in_repo, path_or_fileobj=paths.file_path)
+    with paths.file_path.open("rb") as file:
+        sample = file.peek(512)[:512]
+    if metadata.sha256 is None:
+        raise ValueError("sha256 must have been computed by now!")
+    operation.upload_info = UploadInfo(sha256=bytes.fromhex(metadata.sha256), size=metadata.size, sample=sample)
+    operation._upload_mode = metadata.upload_mode  # type: ignore[assignment]
+    operation._should_ignore = metadata.should_ignore
+    operation._remote_oid = metadata.remote_oid
+    return operation
+
+
+####################
+# Misc helpers
+####################
+
+
+def _get_one(queue: "queue.Queue[JOB_ITEM_T]") -> List[JOB_ITEM_T]:
+    return [queue.get()]
+
+
+def _get_n(queue: "queue.Queue[JOB_ITEM_T]", n: int) -> List[JOB_ITEM_T]:
+    return [queue.get() for _ in range(min(queue.qsize(), n))]
+
+
+def _print_overwrite(report: str) -> None:
+    """Print a report, overwriting the previous lines.
+
+    Since tqdm is using `sys.stderr` to (re-)write progress bars, we need to use `sys.stdout`
+    to print the report.
+
+    Note: works well only if no other process is writing to `sys.stdout`!
+    """
+    report += "\n"
+    # Get terminal width
+    terminal_width = shutil.get_terminal_size().columns
+
+    # Count number of lines that should be cleared
+    nb_lines = sum(len(line) // terminal_width + 1 for line in report.splitlines())
+
+    # Clear previous lines based on the number of lines in the report
+    for _ in range(nb_lines):
+        sys.stdout.write("\r\033[K")  # Clear line
+        sys.stdout.write("\033[F")  # Move cursor up one line
+
+    # Print the new report, filling remaining space with whitespace
+    sys.stdout.write(report)
+    sys.stdout.write(" " * (terminal_width - len(report.splitlines()[-1])))
+    sys.stdout.flush()
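A standalone sketch of the same overwrite trick, using the exact escapes above (`\033[K` clears a line, `\033[F` moves the cursor up; the clear/up order is simplified slightly here):

```python
import sys
import time

# Reserve two lines, then redraw them in place five times.
sys.stdout.write("line 1\nline 2\n")
sys.stdout.flush()
for i in range(5):
    time.sleep(1)
    for _ in range(2):
        sys.stdout.write("\033[F\r\033[K")  # move up one line, then clear it
    sys.stdout.write(f"line 1 (update {i})\nline 2 (update {i})\n")
    sys.stdout.flush()
```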
venv/lib/python3.13/site-packages/huggingface_hub/_webhooks_payload.py ADDED
@@ -0,0 +1,137 @@
+# coding=utf-8
+# Copyright 2023-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains data structures to parse the webhooks payload."""
+
+from typing import List, Literal, Optional
+
+from .utils import is_pydantic_available
+
+
+if is_pydantic_available():
+    from pydantic import BaseModel
+else:
+    # Define a dummy BaseModel to avoid import errors when pydantic is not installed
+    # Import error will be raised when trying to use the class
+
+    class BaseModel:  # type: ignore [no-redef]
+        def __init__(self, *args, **kwargs) -> None:
+            raise ImportError(
+                "You must have `pydantic` installed to use `WebhookPayload`. This is an optional dependency that"
+                " should be installed separately. Please run `pip install --upgrade pydantic` and retry."
+            )
+
+
+# This is an adaptation of the ReportV3 interface implemented in moon-landing. V0, V1 and V2 have been ignored as they
+# are not in use anymore. To be kept in sync when the format is updated in
+# https://github.com/huggingface/moon-landing/blob/main/server/lib/HFWebhooks.ts (internal link).
+
+
+WebhookEvent_T = Literal[
+    "create",
+    "delete",
+    "move",
+    "update",
+]
+RepoChangeEvent_T = Literal[
+    "add",
+    "move",
+    "remove",
+    "update",
+]
+RepoType_T = Literal[
+    "dataset",
+    "model",
+    "space",
+]
+DiscussionStatus_T = Literal[
+    "closed",
+    "draft",
+    "open",
+    "merged",
+]
+SupportedWebhookVersion = Literal[3]
+
+
+class ObjectId(BaseModel):
+    id: str
+
+
+class WebhookPayloadUrl(BaseModel):
+    web: str
+    api: Optional[str] = None
+
+
+class WebhookPayloadMovedTo(BaseModel):
+    name: str
+    owner: ObjectId
+
+
+class WebhookPayloadWebhook(ObjectId):
+    version: SupportedWebhookVersion
+
+
+class WebhookPayloadEvent(BaseModel):
+    action: WebhookEvent_T
+    scope: str
+
+
+class WebhookPayloadDiscussionChanges(BaseModel):
+    base: str
+    mergeCommitId: Optional[str] = None
+
+
+class WebhookPayloadComment(ObjectId):
+    author: ObjectId
+    hidden: bool
+    content: Optional[str] = None
+    url: WebhookPayloadUrl
+
+
+class WebhookPayloadDiscussion(ObjectId):
+    num: int
+    author: ObjectId
+    url: WebhookPayloadUrl
+    title: str
+    isPullRequest: bool
+    status: DiscussionStatus_T
+    changes: Optional[WebhookPayloadDiscussionChanges] = None
+    pinned: Optional[bool] = None
+
+
+class WebhookPayloadRepo(ObjectId):
+    owner: ObjectId
+    head_sha: Optional[str] = None
+    name: str
+    private: bool
+    subdomain: Optional[str] = None
+    tags: Optional[List[str]] = None
+    type: Literal["dataset", "model", "space"]
+    url: WebhookPayloadUrl
+
+
+class WebhookPayloadUpdatedRef(BaseModel):
+    ref: str
+    oldSha: Optional[str] = None
+    newSha: Optional[str] = None
+
+
+class WebhookPayload(BaseModel):
+    event: WebhookPayloadEvent
+    repo: WebhookPayloadRepo
+    discussion: Optional[WebhookPayloadDiscussion] = None
+    comment: Optional[WebhookPayloadComment] = None
+    webhook: WebhookPayloadWebhook
+    movedTo: Optional[WebhookPayloadMovedTo] = None
+    updatedRefs: Optional[List[WebhookPayloadUpdatedRef]] = None
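Since these are plain pydantic models, a decoded webhook body can be validated by constructing `WebhookPayload` from it directly; nested dicts are coerced into the sub-models. A minimal sketch (the payload below is a hand-written sample, not a captured event):

```python
payload = {  # hand-written sample, shaped after the models above
    "event": {"action": "update", "scope": "repo.content"},
    "repo": {
        "id": "xxxx", "owner": {"id": "yyyy"}, "name": "user/repo",
        "private": False, "type": "model",
        "url": {"web": "https://huggingface.co/user/repo"},
    },
    "webhook": {"id": "zzzz", "version": 3},
}
parsed = WebhookPayload(**payload)
print(parsed.event.action)  # "update"
```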
venv/lib/python3.13/site-packages/huggingface_hub/_webhooks_server.py ADDED
@@ -0,0 +1,376 @@
+# coding=utf-8
+# Copyright 2023-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains `WebhooksServer` and `webhook_endpoint` to create a webhook server easily."""
+
+import atexit
+import inspect
+import os
+from functools import wraps
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional
+
+from .utils import experimental, is_fastapi_available, is_gradio_available
+
+
+if TYPE_CHECKING:
+    import gradio as gr
+    from fastapi import Request
+
+if is_fastapi_available():
+    from fastapi import FastAPI, Request
+    from fastapi.responses import JSONResponse
+else:
+    # Will fail at runtime if FastAPI is not available
+    FastAPI = Request = JSONResponse = None  # type: ignore
+
+
+_global_app: Optional["WebhooksServer"] = None
+_is_local = os.environ.get("SPACE_ID") is None
+
+
+@experimental
+class WebhooksServer:
+    """
+    The [`WebhooksServer`] class lets you create an instance of a Gradio app that can receive Huggingface webhooks.
+    These webhooks can be registered using the [`~WebhooksServer.add_webhook`] decorator. Webhook endpoints are added
+    to the app as POST endpoints on the FastAPI router. Once all the webhooks are registered, the `launch` method has
+    to be called to start the app.
+
+    It is recommended to accept [`WebhookPayload`] as the first argument of the webhook function. It is a Pydantic
+    model that contains all the information about the webhook event. The data will be parsed automatically for you.
+
+    Check out the [webhooks guide](../guides/webhooks_server) for a step-by-step tutorial on how to set up your
+    WebhooksServer and deploy it on a Space.
+
+    > [!WARNING]
+    > `WebhooksServer` is experimental. Its API is subject to change in the future.
+
+    > [!WARNING]
+    > You must have `gradio` installed to use `WebhooksServer` (`pip install --upgrade gradio`).
+
+    Args:
+        ui (`gradio.Blocks`, optional):
+            A Gradio UI instance to be used as the Space landing page. If `None`, a UI displaying instructions
+            about the configured webhooks is created.
+        webhook_secret (`str`, optional):
+            A secret key to verify incoming webhook requests. You can set this value to any secret you want as long as
+            you also configure it in your [webhooks settings panel](https://huggingface.co/settings/webhooks). You
+            can also set this value as the `WEBHOOK_SECRET` environment variable. If no secret is provided, the
+            webhook endpoints are opened without any security.
+
+    Example:
+
+    ```python
+    import gradio as gr
+    from huggingface_hub import WebhooksServer, WebhookPayload
+
+    with gr.Blocks() as ui:
+        ...
+
+    app = WebhooksServer(ui=ui, webhook_secret="my_secret_key")
+
+    @app.add_webhook("/say_hello")
+    async def hello(payload: WebhookPayload):
+        return {"message": "hello"}
+
+    app.launch()
+    ```
+    """
+
+    def __new__(cls, *args, **kwargs) -> "WebhooksServer":
+        if not is_gradio_available():
+            raise ImportError(
+                "You must have `gradio` installed to use `WebhooksServer`. Please run `pip install --upgrade gradio`"
+                " first."
+            )
+        if not is_fastapi_available():
+            raise ImportError(
+                "You must have `fastapi` installed to use `WebhooksServer`. Please run `pip install --upgrade fastapi`"
+                " first."
+            )
+        return super().__new__(cls)
+
+    def __init__(
+        self,
+        ui: Optional["gr.Blocks"] = None,
+        webhook_secret: Optional[str] = None,
+    ) -> None:
+        self._ui = ui
+
+        self.webhook_secret = webhook_secret or os.getenv("WEBHOOK_SECRET")
+        self.registered_webhooks: Dict[str, Callable] = {}
+        _warn_on_empty_secret(self.webhook_secret)
+
+    def add_webhook(self, path: Optional[str] = None) -> Callable:
+        """
+        Decorator to add a webhook to the [`WebhooksServer`] server.
+
+        Args:
+            path (`str`, optional):
+                The URL path to register the webhook function. If not provided, the function name will be used as the
+                path. In any case, all webhooks are registered under `/webhooks`.
+
+        Raises:
+            ValueError: If the provided path is already registered as a webhook.
+
+        Example:
+            ```python
+            from huggingface_hub import WebhooksServer, WebhookPayload
+
+            app = WebhooksServer()
+
+            @app.add_webhook
+            async def trigger_training(payload: WebhookPayload):
+                if payload.repo.type == "dataset" and payload.event.action == "update":
+                    # Trigger a training job if a dataset is updated
+                    ...
+
+            app.launch()
+            ```
+        """
+        # Usage: directly as decorator. Example: `@app.add_webhook`
+        if callable(path):
+            # If path is a function, it means it was used as a decorator without arguments
+            return self.add_webhook()(path)
+
+        # Usage: provide a path. Example: `@app.add_webhook(...)`
+        @wraps(FastAPI.post)
+        def _inner_post(*args, **kwargs):
+            func = args[0]
+            abs_path = f"/webhooks/{(path or func.__name__).strip('/')}"
+            if abs_path in self.registered_webhooks:
+                raise ValueError(f"Webhook {abs_path} already exists.")
+            self.registered_webhooks[abs_path] = func
+
+        return _inner_post
+
+    def launch(self, prevent_thread_lock: bool = False, **launch_kwargs: Any) -> None:
+        """Launch the Gradio app and register webhooks to the underlying FastAPI server.
+
+        Input parameters are forwarded to Gradio when launching the app.
+        """
+        ui = self._ui or self._get_default_ui()
+
+        # Start Gradio App
+        #   - as non-blocking so that webhooks can be added afterwards
+        #   - as shared if launched locally (to debug webhooks)
+        launch_kwargs.setdefault("share", _is_local)
+        self.fastapi_app, _, _ = ui.launch(prevent_thread_lock=True, **launch_kwargs)
+
+        # Register webhooks to FastAPI app
+        for path, func in self.registered_webhooks.items():
+            # Add secret check if required
+            if self.webhook_secret is not None:
+                func = _wrap_webhook_to_check_secret(func, webhook_secret=self.webhook_secret)
+
+            # Add route to FastAPI app
+            self.fastapi_app.post(path)(func)
+
+        # Print instructions and block main thread
+        space_host = os.environ.get("SPACE_HOST")
+        url = "https://" + space_host if space_host is not None else (ui.share_url or ui.local_url)
+        if url is None:
+            raise ValueError("Cannot find the URL of the app. Please provide a valid `ui` or update `gradio` version.")
+        url = url.strip("/")
+        message = "\nWebhooks are correctly set up and ready to use:"
+        message += "\n" + "\n".join(f"  - POST {url}{webhook}" for webhook in self.registered_webhooks)
+        message += "\nGo to https://huggingface.co/settings/webhooks to set up your webhooks."
+        print(message)
+
+        if not prevent_thread_lock:
+            ui.block_thread()
+
+    def _get_default_ui(self) -> "gr.Blocks":
+        """Default UI if not provided (lists webhooks and provides basic instructions)."""
+        import gradio as gr
+
+        with gr.Blocks() as ui:
+            gr.Markdown("# This is an app to process 🤗 Webhooks")
+            gr.Markdown(
+                "Webhooks are a foundation for MLOps-related features. They allow you to listen for new changes on"
+                " specific repos or to all repos belonging to a particular set of users/organizations (not just your"
+                " repos, but any repo). Check out this [guide](https://huggingface.co/docs/hub/webhooks) to get to"
+                " know more about webhooks on the Huggingface Hub."
+            )
+            gr.Markdown(
+                f"{len(self.registered_webhooks)} webhook(s) are registered:"
+                + "\n\n"
+                + "\n ".join(
+                    f"- [{webhook_path}]({_get_webhook_doc_url(webhook.__name__, webhook_path)})"
+                    for webhook_path, webhook in self.registered_webhooks.items()
+                )
+            )
+            gr.Markdown(
+                "Go to https://huggingface.co/settings/webhooks to set up your webhooks."
+                + "\nYour app is running locally. Please look at the logs to check the full URL you need to set."
+                if _is_local
+                else (
+                    "\nThis app is running on a Space. You can find the corresponding URL in the options menu"
+                    " (top-right) > 'Embed the Space'. The URL looks like 'https://{username}-{repo_name}.hf.space'."
+                )
+            )
+        return ui
+
+
+@experimental
+def webhook_endpoint(path: Optional[str] = None) -> Callable:
+    """Decorator to start a [`WebhooksServer`] and register the decorated function as a webhook endpoint.
+
+    This is a helper to get started quickly. If you need more flexibility (custom landing page or webhook secret),
+    you can use [`WebhooksServer`] directly. You can register multiple webhook endpoints (to the same server) by using
+    this decorator multiple times.
+
+    Check out the [webhooks guide](../guides/webhooks_server) for a step-by-step tutorial on how to set up your
+    server and deploy it on a Space.
+
+    > [!WARNING]
+    > `webhook_endpoint` is experimental. Its API is subject to change in the future.
+
+    > [!WARNING]
+    > You must have `gradio` installed to use `webhook_endpoint` (`pip install --upgrade gradio`).
+
+    Args:
+        path (`str`, optional):
+            The URL path to register the webhook function. If not provided, the function name will be used as the path.
+            In any case, all webhooks are registered under `/webhooks`.
+
+    Examples:
+        The default usage is to register a function as a webhook endpoint. The function name will be used as the path.
+        The server will be started automatically at exit (i.e. at the end of the script).
+
+        ```python
+        from huggingface_hub import webhook_endpoint, WebhookPayload
+
+        @webhook_endpoint
+        async def trigger_training(payload: WebhookPayload):
+            if payload.repo.type == "dataset" and payload.event.action == "update":
+                # Trigger a training job if a dataset is updated
+                ...
+
+        # Server is automatically started at the end of the script.
+        ```
+
+        Advanced usage: register a function as a webhook endpoint and start the server manually. This is useful if you
+        are running it in a notebook.
+
+        ```python
+        from huggingface_hub import webhook_endpoint, WebhookPayload
+
+        @webhook_endpoint
+        async def trigger_training(payload: WebhookPayload):
+            if payload.repo.type == "dataset" and payload.event.action == "update":
+                # Trigger a training job if a dataset is updated
+                ...
+
+        # Start the server manually
+        trigger_training.launch()
+        ```
+    """
+    if callable(path):
+        # If path is a function, it means it was used as a decorator without arguments
+        return webhook_endpoint()(path)
+
+    @wraps(WebhooksServer.add_webhook)
+    def _inner(func: Callable) -> Callable:
+        app = _get_global_app()
+        app.add_webhook(path)(func)
+        if len(app.registered_webhooks) == 1:
+            # Register `app.launch` to run at exit (only once)
+            atexit.register(app.launch)
+
+        @wraps(app.launch)
+        def _launch_now():
+            # Run the app directly (without waiting atexit)
+            atexit.unregister(app.launch)
+            app.launch()
+
+        func.launch = _launch_now  # type: ignore
+        return func
+
+    return _inner
+
+
+def _get_global_app() -> WebhooksServer:
+    global _global_app
+    if _global_app is None:
+        _global_app = WebhooksServer()
+    return _global_app
+
+
+def _warn_on_empty_secret(webhook_secret: Optional[str]) -> None:
+    if webhook_secret is None:
+        print("Webhook secret is not defined. This means your webhook endpoints will be open to everyone.")
+        print(
+            "To add a secret, set `WEBHOOK_SECRET` as environment variable or pass it at initialization: "
+            "\n\t`app = WebhooksServer(webhook_secret='my_secret', ...)`"
+        )
+        print(
+            "For more details about webhook secrets, please refer to"
+            " https://huggingface.co/docs/hub/webhooks#webhook-secret."
+        )
+    else:
+        print("Webhook secret is correctly defined.")
+
+
+def _get_webhook_doc_url(webhook_name: str, webhook_path: str) -> str:
+    """Returns the anchor to a given webhook in the docs (experimental)"""
+    return "/docs#/default/" + webhook_name + webhook_path.replace("/", "_") + "_post"
+
+
+def _wrap_webhook_to_check_secret(func: Callable, webhook_secret: str) -> Callable:
+    """Wraps a webhook function to check the webhook secret before calling the function.
+
+    This is a hacky way to add the `request` parameter to the function signature. Since FastAPI relies on route
+    parameters to inject values into the function, we need to hack the function signature to retrieve the `Request`
+    object (and hence the headers). A far cleaner solution would be to use a middleware. However, since
+    `fastapi==0.90.1`, a middleware cannot be added once the app has started. And since the FastAPI app is started by
+    Gradio internals (and not by us), we cannot add a middleware.
+
+    This method is called only when a secret has been defined by the user. If a request is sent without the
+    "x-webhook-secret" header, the function will return a 401 error (unauthorized). If the header is sent but is
+    incorrect, the function will return a 403 error (forbidden).
+
+    Inspired by https://stackoverflow.com/a/33112180.
+    """
+    initial_sig = inspect.signature(func)
+
+    @wraps(func)
+    async def _protected_func(request: Request, **kwargs):
+        request_secret = request.headers.get("x-webhook-secret")
+        if request_secret is None:
+            return JSONResponse({"error": "x-webhook-secret header not set."}, status_code=401)
+        if request_secret != webhook_secret:
+            return JSONResponse({"error": "Invalid webhook secret."}, status_code=403)
+
+        # Inject `request` in kwargs if required
+        if "request" in initial_sig.parameters:
+            kwargs["request"] = request
+
+        # Handle both sync and async routes
+        if inspect.iscoroutinefunction(func):
+            return await func(**kwargs)
+        else:
+            return func(**kwargs)
+
+    # Update signature to include request
+    if "request" not in initial_sig.parameters:
+        _protected_func.__signature__ = initial_sig.replace(  # type: ignore
+            parameters=(
+                inspect.Parameter(name="request", kind=inspect.Parameter.POSITIONAL_OR_KEYWORD, annotation=Request),
+            )
+            + tuple(initial_sig.parameters.values())
+        )
+
+    # Return protected route
+    return _protected_func
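From the caller's side, the secret travels in the `x-webhook-secret` header checked above. A minimal sketch of hitting a secured endpoint (the Space URL and secret are hypothetical; `requests` is assumed to be installed):

```python
import requests

resp = requests.post(
    "https://my-username-my-space.hf.space/webhooks/say_hello",  # hypothetical Space URL
    headers={"x-webhook-secret": "my_secret_key"},  # must match the server's WEBHOOK_SECRET
    json={"event": {"action": "update", "scope": "repo.content"}},  # trimmed sample body
)
print(resp.status_code)  # 401 if the header is missing, 403 if it is wrong
```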
venv/lib/python3.13/site-packages/huggingface_hub/community.py ADDED
@@ -0,0 +1,363 @@
+"""
+Data structures to interact with Discussions and Pull Requests on the Hub.
+
+See [the Discussions and Pull Requests guide](https://huggingface.co/docs/hub/repositories-pull-requests-discussions)
+for more information on Pull Requests, Discussions, and the community tab.
+"""
+
+from dataclasses import dataclass
+from datetime import datetime
+from typing import List, Literal, Optional, TypedDict, Union
+
+from . import constants
+from .utils import parse_datetime
+
+
+DiscussionStatus = Literal["open", "closed", "merged", "draft"]
+
+
+@dataclass
+class Discussion:
+    """
+    A Discussion or Pull Request on the Hub.
+
+    This dataclass is not intended to be instantiated directly.
+
+    Attributes:
+        title (`str`):
+            The title of the Discussion / Pull Request.
+        status (`str`):
+            The status of the Discussion / Pull Request.
+            It must be one of:
+                * `"open"`
+                * `"closed"`
+                * `"merged"` (only for Pull Requests)
+                * `"draft"` (only for Pull Requests)
+        num (`int`):
+            The number of the Discussion / Pull Request.
+        repo_id (`str`):
+            The id (`"{namespace}/{repo_name}"`) of the repo on which
+            the Discussion / Pull Request was opened.
+        repo_type (`str`):
+            The type of the repo on which the Discussion / Pull Request was opened.
+            Possible values are: `"model"`, `"dataset"`, `"space"`.
+        author (`str`):
+            The username of the Discussion / Pull Request author.
+            Can be `"deleted"` if the user has been deleted since.
+        is_pull_request (`bool`):
+            Whether or not this is a Pull Request.
+        created_at (`datetime`):
+            The `datetime` of creation of the Discussion / Pull Request.
+        endpoint (`str`):
+            Endpoint of the Hub. Default is https://huggingface.co.
+        git_reference (`str`, *optional*):
+            (property) Git reference to which changes can be pushed if this is a Pull Request, `None` otherwise.
+        url (`str`):
+            (property) URL of the discussion on the Hub.
+    """
+
+    title: str
+    status: DiscussionStatus
+    num: int
+    repo_id: str
+    repo_type: str
+    author: str
+    is_pull_request: bool
+    created_at: datetime
+    endpoint: str
+
+    @property
+    def git_reference(self) -> Optional[str]:
+        """
+        If this is a Pull Request, returns the git reference to which changes can be pushed.
+        Returns `None` otherwise.
+        """
+        if self.is_pull_request:
+            return f"refs/pr/{self.num}"
+        return None
+
+    @property
+    def url(self) -> str:
+        """Returns the URL of the discussion on the Hub."""
+        if self.repo_type is None or self.repo_type == constants.REPO_TYPE_MODEL:
+            return f"{self.endpoint}/{self.repo_id}/discussions/{self.num}"
+        return f"{self.endpoint}/{self.repo_type}s/{self.repo_id}/discussions/{self.num}"
+
+
87
+ @dataclass
88
+ class DiscussionWithDetails(Discussion):
89
+ """
90
+ Subclass of [`Discussion`].
91
+
92
+ Attributes:
93
+ title (`str`):
94
+ The title of the Discussion / Pull Request
95
+ status (`str`):
96
+ The status of the Discussion / Pull Request.
97
+ It can be one of:
98
+ * `"open"`
99
+ * `"closed"`
100
+ * `"merged"` (only for Pull Requests )
101
+ * `"draft"` (only for Pull Requests )
102
+ num (`int`):
103
+ The number of the Discussion / Pull Request.
104
+ repo_id (`str`):
105
+ The id (`"{namespace}/{repo_name}"`) of the repo on which
106
+ the Discussion / Pull Request was open.
107
+ repo_type (`str`):
108
+ The type of the repo on which the Discussion / Pull Request was open.
109
+ Possible values are: `"model"`, `"dataset"`, `"space"`.
110
+ author (`str`):
111
+ The username of the Discussion / Pull Request author.
112
+ Can be `"deleted"` if the user has been deleted since.
113
+ is_pull_request (`bool`):
114
+ Whether or not this is a Pull Request.
115
+ created_at (`datetime`):
116
+ The `datetime` of creation of the Discussion / Pull Request.
117
+ events (`list` of [`DiscussionEvent`])
118
+ The list of [`DiscussionEvents`] in this Discussion or Pull Request.
119
+ conflicting_files (`Union[List[str], bool, None]`, *optional*):
120
+ A list of conflicting files if this is a Pull Request.
121
+ `None` if `self.is_pull_request` is `False`.
122
+ `True` if there are conflicting files but the list can't be retrieved.
123
+ target_branch (`str`, *optional*):
124
+ The branch into which changes are to be merged if this is a
125
+ Pull Request . `None` if `self.is_pull_request` is `False`.
126
+ merge_commit_oid (`str`, *optional*):
127
+ If this is a merged Pull Request , this is set to the OID / SHA of
128
+ the merge commit, `None` otherwise.
129
+ diff (`str`, *optional*):
130
+ The git diff if this is a Pull Request , `None` otherwise.
131
+ endpoint (`str`):
132
+ Endpoint of the Hub. Default is https://huggingface.co.
133
+ git_reference (`str`, *optional*):
134
+ (property) Git reference to which changes can be pushed if this is a Pull Request, `None` otherwise.
135
+ url (`str`):
136
+ (property) URL of the discussion on the Hub.
137
+ """
138
+
139
+ events: List["DiscussionEvent"]
140
+ conflicting_files: Union[List[str], bool, None]
141
+ target_branch: Optional[str]
142
+ merge_commit_oid: Optional[str]
143
+ diff: Optional[str]
144
+
145
+
146
+ class DiscussionEventArgs(TypedDict):
147
+ id: str
148
+ type: str
149
+ created_at: datetime
150
+ author: str
151
+ _event: dict
152
+
153
+
154
+ @dataclass
155
+ class DiscussionEvent:
156
+ """
157
+ An event in a Discussion or Pull Request.
158
+
159
+ Use concrete classes:
160
+ * [`DiscussionComment`]
161
+ * [`DiscussionStatusChange`]
162
+ * [`DiscussionCommit`]
163
+ * [`DiscussionTitleChange`]
164
+
165
+ Attributes:
166
+ id (`str`):
167
+ The ID of the event. An hexadecimal string.
168
+ type (`str`):
169
+ The type of the event.
170
+ created_at (`datetime`):
171
+ A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
172
+ object holding the creation timestamp for the event.
173
+ author (`str`):
174
+ The username of the Discussion / Pull Request author.
175
+ Can be `"deleted"` if the user has been deleted since.
176
+ """
177
+
178
+ id: str
179
+ type: str
180
+ created_at: datetime
181
+ author: str
182
+
183
+ _event: dict
184
+ """Stores the original event data, in case we need to access it later."""
185
+
186
+
187
+ @dataclass
188
+ class DiscussionComment(DiscussionEvent):
189
+ """A comment in a Discussion / Pull Request.
190
+
191
+ Subclass of [`DiscussionEvent`].
192
+
193
+
194
+ Attributes:
195
+ id (`str`):
196
+ The ID of the event. An hexadecimal string.
197
+ type (`str`):
198
+ The type of the event.
199
+ created_at (`datetime`):
200
+ A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
201
+ object holding the creation timestamp for the event.
202
+ author (`str`):
203
+ The username of the Discussion / Pull Request author.
204
+ Can be `"deleted"` if the user has been deleted since.
205
+ content (`str`):
206
+ The raw markdown content of the comment. Mentions, links and images are not rendered.
207
+ edited (`bool`):
208
+ Whether or not this comment has been edited.
209
+ hidden (`bool`):
210
+ Whether or not this comment has been hidden.
211
+ """
212
+
213
+ content: str
214
+ edited: bool
215
+ hidden: bool
216
+
217
+ @property
218
+ def rendered(self) -> str:
219
+ """The rendered comment, as a HTML string"""
220
+ return self._event["data"]["latest"]["html"]
221
+
222
+ @property
223
+ def last_edited_at(self) -> datetime:
224
+ """The last edit time, as a `datetime` object."""
225
+ return parse_datetime(self._event["data"]["latest"]["updatedAt"])
226
+
227
+ @property
228
+ def last_edited_by(self) -> str:
229
+ """The last edit time, as a `datetime` object."""
230
+ return self._event["data"]["latest"].get("author", {}).get("name", "deleted")
231
+
232
+ @property
233
+ def edit_history(self) -> List[dict]:
234
+ """The edit history of the comment"""
235
+ return self._event["data"]["history"]
236
+
237
+ @property
238
+ def number_of_edits(self) -> int:
239
+ return len(self.edit_history)
240
+
241
+
242
+ @dataclass
243
+ class DiscussionStatusChange(DiscussionEvent):
244
+ """A change of status in a Discussion / Pull Request.
245
+
246
+ Subclass of [`DiscussionEvent`].
247
+
248
+ Attributes:
249
+ id (`str`):
250
+ The ID of the event. An hexadecimal string.
251
+ type (`str`):
252
+ The type of the event.
253
+ created_at (`datetime`):
254
+ A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
255
+ object holding the creation timestamp for the event.
256
+ author (`str`):
257
+ The username of the Discussion / Pull Request author.
258
+ Can be `"deleted"` if the user has been deleted since.
259
+ new_status (`str`):
260
+ The status of the Discussion / Pull Request after the change.
261
+ It can be one of:
262
+ * `"open"`
263
+ * `"closed"`
264
+ * `"merged"` (only for Pull Requests )
265
+ """
266
+
267
+ new_status: str
268
+
269
+
270
+ @dataclass
271
+ class DiscussionCommit(DiscussionEvent):
272
+ """A commit in a Pull Request.
273
+
274
+ Subclass of [`DiscussionEvent`].
275
+
276
+ Attributes:
277
+ id (`str`):
278
+ The ID of the event. An hexadecimal string.
279
+ type (`str`):
280
+ The type of the event.
281
+ created_at (`datetime`):
282
+ A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
283
+ object holding the creation timestamp for the event.
284
+ author (`str`):
285
+ The username of the Discussion / Pull Request author.
286
+ Can be `"deleted"` if the user has been deleted since.
287
+ summary (`str`):
288
+ The summary of the commit.
289
+ oid (`str`):
290
+ The OID / SHA of the commit, as a hexadecimal string.
291
+ """
292
+
293
+ summary: str
294
+ oid: str
295
+
296
+
297
+ @dataclass
298
+ class DiscussionTitleChange(DiscussionEvent):
299
+ """A rename event in a Discussion / Pull Request.
300
+
301
+ Subclass of [`DiscussionEvent`].
302
+
303
+ Attributes:
304
+ id (`str`):
305
+ The ID of the event. An hexadecimal string.
306
+ type (`str`):
307
+ The type of the event.
308
+ created_at (`datetime`):
309
+ A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
310
+ object holding the creation timestamp for the event.
311
+ author (`str`):
312
+ The username of the Discussion / Pull Request author.
313
+ Can be `"deleted"` if the user has been deleted since.
314
+ old_title (`str`):
315
+ The previous title for the Discussion / Pull Request.
316
+ new_title (`str`):
317
+ The new title.
318
+ """
319
+
320
+ old_title: str
321
+ new_title: str
322
+
323
+
324
+ def deserialize_event(event: dict) -> DiscussionEvent:
325
+ """Instantiates a [`DiscussionEvent`] from a dict"""
326
+ event_id: str = event["id"]
327
+ event_type: str = event["type"]
328
+ created_at = parse_datetime(event["createdAt"])
329
+
330
+ common_args: DiscussionEventArgs = {
331
+ "id": event_id,
332
+ "type": event_type,
333
+ "created_at": created_at,
334
+ "author": event.get("author", {}).get("name", "deleted"),
335
+ "_event": event,
336
+ }
337
+
338
+ if event_type == "comment":
339
+ return DiscussionComment(
340
+ **common_args,
341
+ edited=event["data"]["edited"],
342
+ hidden=event["data"]["hidden"],
343
+ content=event["data"]["latest"]["raw"],
344
+ )
345
+ if event_type == "status-change":
346
+ return DiscussionStatusChange(
347
+ **common_args,
348
+ new_status=event["data"]["status"],
349
+ )
350
+ if event_type == "commit":
351
+ return DiscussionCommit(
352
+ **common_args,
353
+ summary=event["data"]["subject"],
354
+ oid=event["data"]["oid"],
355
+ )
356
+ if event_type == "title-change":
357
+ return DiscussionTitleChange(
358
+ **common_args,
359
+ old_title=event["data"]["from"],
360
+ new_title=event["data"]["to"],
361
+ )
362
+
363
+ return DiscussionEvent(**common_args)
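The dispatch above keys purely on the `type` field and falls back to the base class for unknown types. A hand-written sample (not a captured API response) to show the round trip:

```python
event = {  # hand-written sample, shaped like the dicts consumed above
    "id": "deadbeef",
    "type": "status-change",
    "createdAt": "2024-01-01T00:00:00.000Z",
    "author": {"name": "user"},
    "data": {"status": "closed"},
}
parsed = deserialize_event(event)
assert isinstance(parsed, DiscussionStatusChange)
print(parsed.new_status)  # "closed"
```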
venv/lib/python3.13/site-packages/huggingface_hub/constants.py ADDED
@@ -0,0 +1,294 @@
+import os
+import re
+import typing
+from typing import Literal, Optional, Tuple
+
+
+# Possible values for env variables
+
+
+ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
+ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
+
+
+def _is_true(value: Optional[str]) -> bool:
+    if value is None:
+        return False
+    return value.upper() in ENV_VARS_TRUE_VALUES
+
+
+def _as_int(value: Optional[str]) -> Optional[int]:
+    if value is None:
+        return None
+    return int(value)
+
+
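These two helpers normalize environment variables before any constant below is derived from them: truthiness is case-insensitive and anything outside the allow-list counts as false. A quick illustration:

```python
assert _is_true("yes") is True   # case-insensitive member of ENV_VARS_TRUE_VALUES
assert _is_true("0") is False    # not in the allow-list
assert _is_true(None) is False   # unset variable
assert _as_int(None) is None     # unset variable
assert _as_int("42") == 42       # note: a non-numeric string would raise ValueError
```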
+ # Constants for file downloads
27
+
28
+ PYTORCH_WEIGHTS_NAME = "pytorch_model.bin"
29
+ TF2_WEIGHTS_NAME = "tf_model.h5"
30
+ TF_WEIGHTS_NAME = "model.ckpt"
31
+ FLAX_WEIGHTS_NAME = "flax_model.msgpack"
32
+ CONFIG_NAME = "config.json"
33
+ REPOCARD_NAME = "README.md"
34
+ DEFAULT_ETAG_TIMEOUT = 10
35
+ DEFAULT_DOWNLOAD_TIMEOUT = 10
36
+ DEFAULT_REQUEST_TIMEOUT = 10
37
+ DOWNLOAD_CHUNK_SIZE = 10 * 1024 * 1024
38
+ HF_TRANSFER_CONCURRENCY = 100
39
+ MAX_HTTP_DOWNLOAD_SIZE = 50 * 1000 * 1000 * 1000 # 50 GB
40
+
41
+ # Constants for serialization
42
+
43
+ PYTORCH_WEIGHTS_FILE_PATTERN = "pytorch_model{suffix}.bin" # Unsafe pickle: use safetensors instead
44
+ SAFETENSORS_WEIGHTS_FILE_PATTERN = "model{suffix}.safetensors"
45
+ TF2_WEIGHTS_FILE_PATTERN = "tf_model{suffix}.h5"
46
+
47
+ # Constants for safetensors repos
48
+
49
+ SAFETENSORS_SINGLE_FILE = "model.safetensors"
50
+ SAFETENSORS_INDEX_FILE = "model.safetensors.index.json"
51
+ SAFETENSORS_MAX_HEADER_LENGTH = 25_000_000
52
+
53
+ # Timeout of aquiring file lock and logging the attempt
54
+ FILELOCK_LOG_EVERY_SECONDS = 10
55
+
56
+ # Git-related constants
57
+
58
+ DEFAULT_REVISION = "main"
59
+ REGEX_COMMIT_OID = re.compile(r"[A-Fa-f0-9]{5,40}")
60
+
61
+ HUGGINGFACE_CO_URL_HOME = "https://huggingface.co/"
62
+
63
+ _staging_mode = _is_true(os.environ.get("HUGGINGFACE_CO_STAGING"))
64
+
65
+ _HF_DEFAULT_ENDPOINT = "https://huggingface.co"
66
+ _HF_DEFAULT_STAGING_ENDPOINT = "https://hub-ci.huggingface.co"
67
+ ENDPOINT = os.getenv("HF_ENDPOINT", _HF_DEFAULT_ENDPOINT).rstrip("/")
68
+ HUGGINGFACE_CO_URL_TEMPLATE = ENDPOINT + "/{repo_id}/resolve/{revision}/{filename}"
69
+
70
+ if _staging_mode:
71
+ ENDPOINT = _HF_DEFAULT_STAGING_ENDPOINT
72
+ HUGGINGFACE_CO_URL_TEMPLATE = _HF_DEFAULT_STAGING_ENDPOINT + "/{repo_id}/resolve/{revision}/{filename}"
73
+
74
+ HUGGINGFACE_HEADER_X_REPO_COMMIT = "X-Repo-Commit"
75
+ HUGGINGFACE_HEADER_X_LINKED_ETAG = "X-Linked-Etag"
76
+ HUGGINGFACE_HEADER_X_LINKED_SIZE = "X-Linked-Size"
77
+ HUGGINGFACE_HEADER_X_BILL_TO = "X-HF-Bill-To"
78
+
79
+ INFERENCE_ENDPOINT = os.environ.get("HF_INFERENCE_ENDPOINT", "https://api-inference.huggingface.co")
80
+
81
+ # See https://huggingface.co/docs/inference-endpoints/index
82
+ INFERENCE_ENDPOINTS_ENDPOINT = "https://api.endpoints.huggingface.cloud/v2"
83
+ INFERENCE_CATALOG_ENDPOINT = "https://endpoints.huggingface.co/api/catalog"
84
+
85
+ # See https://api.endpoints.huggingface.cloud/#post-/v2/endpoint/-namespace-
86
+ INFERENCE_ENDPOINT_IMAGE_KEYS = [
87
+ "custom",
88
+ "huggingface",
89
+ "huggingfaceNeuron",
90
+ "llamacpp",
91
+ "tei",
92
+ "tgi",
93
+ "tgiNeuron",
94
+ ]
95
+
96
+ # Proxy for third-party providers
97
+ INFERENCE_PROXY_TEMPLATE = "https://router.huggingface.co/{provider}"
98
+
99
+ REPO_ID_SEPARATOR = "--"
100
+ # ^ this substring is not allowed in repo_ids on hf.co
101
+ # and is the canonical one we use for serialization of repo ids elsewhere.
102
+
103
+
104
+ REPO_TYPE_DATASET = "dataset"
105
+ REPO_TYPE_SPACE = "space"
106
+ REPO_TYPE_MODEL = "model"
107
+ REPO_TYPES = [None, REPO_TYPE_MODEL, REPO_TYPE_DATASET, REPO_TYPE_SPACE]
108
+ SPACES_SDK_TYPES = ["gradio", "streamlit", "docker", "static"]
109
+
110
+ REPO_TYPES_URL_PREFIXES = {
111
+ REPO_TYPE_DATASET: "datasets/",
112
+ REPO_TYPE_SPACE: "spaces/",
113
+ }
114
+ REPO_TYPES_MAPPING = {
115
+ "datasets": REPO_TYPE_DATASET,
116
+ "spaces": REPO_TYPE_SPACE,
117
+ "models": REPO_TYPE_MODEL,
118
+ }
119
+
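+ # Illustrative sketch (editor's addition, not upstream huggingface_hub code):
+ # how the two mappings above are typically combined. Models are the default repo
+ # type and carry no URL prefix; datasets and spaces are prefixed:
+ #   REPO_TYPES_URL_PREFIXES.get(REPO_TYPE_DATASET, "") + "squad" -> "datasets/squad"
+ #   REPO_TYPES_URL_PREFIXES.get(REPO_TYPE_MODEL, "") + "gpt2"    -> "gpt2"
+ #   REPO_TYPES_MAPPING["datasets"]                               -> "dataset"
+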
120
+ DiscussionTypeFilter = Literal["all", "discussion", "pull_request"]
121
+ DISCUSSION_TYPES: Tuple[DiscussionTypeFilter, ...] = typing.get_args(DiscussionTypeFilter)
122
+ DiscussionStatusFilter = Literal["all", "open", "closed"]
123
+ DISCUSSION_STATUS: Tuple[DiscussionStatusFilter, ...] = typing.get_args(DiscussionStatusFilter)
124
+
125
+ # Webhook subscription types
126
+ WEBHOOK_DOMAIN_T = Literal["repo", "discussions"]
127
+
128
+ # default cache
129
+ default_home = os.path.join(os.path.expanduser("~"), ".cache")
130
+ HF_HOME = os.path.expandvars(
131
+ os.path.expanduser(
132
+ os.getenv(
133
+ "HF_HOME",
134
+ os.path.join(os.getenv("XDG_CACHE_HOME", default_home), "huggingface"),
135
+ )
136
+ )
137
+ )
138
+ hf_cache_home = HF_HOME # for backward compatibility. TODO: remove this in 1.0.0
139
+
140
+ default_cache_path = os.path.join(HF_HOME, "hub")
141
+ default_assets_cache_path = os.path.join(HF_HOME, "assets")
142
+
143
+ # Legacy env variables
144
+ HUGGINGFACE_HUB_CACHE = os.getenv("HUGGINGFACE_HUB_CACHE", default_cache_path)
145
+ HUGGINGFACE_ASSETS_CACHE = os.getenv("HUGGINGFACE_ASSETS_CACHE", default_assets_cache_path)
146
+
147
+ # New env variables
148
+ HF_HUB_CACHE = os.path.expandvars(
149
+ os.path.expanduser(
150
+ os.getenv(
151
+ "HF_HUB_CACHE",
152
+ HUGGINGFACE_HUB_CACHE,
153
+ )
154
+ )
155
+ )
156
+ HF_ASSETS_CACHE = os.path.expandvars(
157
+ os.path.expanduser(
158
+ os.getenv(
159
+ "HF_ASSETS_CACHE",
160
+ HUGGINGFACE_ASSETS_CACHE,
161
+ )
162
+ )
163
+ )
164
+
165
+ HF_HUB_OFFLINE = _is_true(os.environ.get("HF_HUB_OFFLINE") or os.environ.get("TRANSFORMERS_OFFLINE"))
166
+
167
+ # If set, log level will be set to DEBUG and all requests made to the Hub will be logged
168
+ # as curl commands for reproducibility.
169
+ HF_DEBUG = _is_true(os.environ.get("HF_DEBUG"))
170
+
171
+ # Opt-out from telemetry requests
172
+ HF_HUB_DISABLE_TELEMETRY = (
173
+ _is_true(os.environ.get("HF_HUB_DISABLE_TELEMETRY")) # HF-specific env variable
174
+ or _is_true(os.environ.get("DISABLE_TELEMETRY"))
175
+ or _is_true(os.environ.get("DO_NOT_TRACK")) # https://consoledonottrack.com/
176
+ )
177
+
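+ # Illustrative note (editor's addition, not upstream huggingface_hub code):
+ # any one of the three variables opts out, e.g. running
+ #   DO_NOT_TRACK=1 python my_script.py
+ # makes HF_HUB_DISABLE_TELEMETRY evaluate to True even if the HF-specific
+ # variables are unset.
+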
178
+ HF_TOKEN_PATH = os.path.expandvars(
179
+ os.path.expanduser(
180
+ os.getenv(
181
+ "HF_TOKEN_PATH",
182
+ os.path.join(HF_HOME, "token"),
183
+ )
184
+ )
185
+ )
186
+ HF_STORED_TOKENS_PATH = os.path.join(os.path.dirname(HF_TOKEN_PATH), "stored_tokens")
187
+
188
+ if _staging_mode:
189
+ # In staging mode, we use a different cache to ensure we don't mix up production and staging data or tokens
190
+ # In practice in `huggingface_hub` tests, we monkeypatch these values with temporary directories. The following
191
+ # lines are only used in third-party libraries tests (e.g. `transformers`, `diffusers`, etc.).
192
+ _staging_home = os.path.join(os.path.expanduser("~"), ".cache", "huggingface_staging")
193
+ HUGGINGFACE_HUB_CACHE = os.path.join(_staging_home, "hub")
194
+ HF_TOKEN_PATH = os.path.join(_staging_home, "token")
195
+
196
+ # Here, `True` will disable progress bars globally without possibility of enabling it
197
+ # programmatically. `False` will enable them without possibility of disabling them.
198
+ # If environment variable is not set (None), then the user is free to enable/disable
199
+ # them programmatically.
200
+ # TL;DR: env variable has priority over code
201
+ __HF_HUB_DISABLE_PROGRESS_BARS = os.environ.get("HF_HUB_DISABLE_PROGRESS_BARS")
202
+ HF_HUB_DISABLE_PROGRESS_BARS: Optional[bool] = (
203
+ _is_true(__HF_HUB_DISABLE_PROGRESS_BARS) if __HF_HUB_DISABLE_PROGRESS_BARS is not None else None
204
+ )
205
+
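+ # Illustrative sketch (editor's addition, not upstream huggingface_hub code):
+ # the tri-state contract above, spelled out for a hypothetical consumer:
+ #   HF_HUB_DISABLE_PROGRESS_BARS is True  -> bars force-disabled, code cannot enable them
+ #   HF_HUB_DISABLE_PROGRESS_BARS is False -> bars force-enabled, code cannot disable them
+ #   HF_HUB_DISABLE_PROGRESS_BARS is None  -> env var unset, code may toggle them freely
+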
206
+ # Disable warning on machines that do not support symlinks (e.g. Windows non-developer)
207
+ HF_HUB_DISABLE_SYMLINKS_WARNING: bool = _is_true(os.environ.get("HF_HUB_DISABLE_SYMLINKS_WARNING"))
208
+
209
+ # Disable warning when using experimental features
210
+ HF_HUB_DISABLE_EXPERIMENTAL_WARNING: bool = _is_true(os.environ.get("HF_HUB_DISABLE_EXPERIMENTAL_WARNING"))
211
+
212
+ # Disable sending the cached token by default in all HTTP requests to the Hub
213
+ HF_HUB_DISABLE_IMPLICIT_TOKEN: bool = _is_true(os.environ.get("HF_HUB_DISABLE_IMPLICIT_TOKEN"))
214
+
215
+ # Enable fast-download using external dependency "hf_transfer"
216
+ # See:
217
+ # - https://pypi.org/project/hf-transfer/
218
+ # - https://github.com/huggingface/hf_transfer (private)
219
+ HF_HUB_ENABLE_HF_TRANSFER: bool = _is_true(os.environ.get("HF_HUB_ENABLE_HF_TRANSFER"))
220
+
221
+
222
+ # UNUSED
223
+ # We don't use symlinks in local dir anymore.
224
+ HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD: int = (
225
+ _as_int(os.environ.get("HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD")) or 5 * 1024 * 1024
226
+ )
227
+
228
+ # Used to override the etag timeout on a system level
229
+ HF_HUB_ETAG_TIMEOUT: int = _as_int(os.environ.get("HF_HUB_ETAG_TIMEOUT")) or DEFAULT_ETAG_TIMEOUT
230
+
231
+ # Used to override the download (GET) request timeout on a system level
232
+ HF_HUB_DOWNLOAD_TIMEOUT: int = _as_int(os.environ.get("HF_HUB_DOWNLOAD_TIMEOUT")) or DEFAULT_DOWNLOAD_TIMEOUT
233
+
234
+ # Allows adding information about the requester to the user-agent (e.g. partner name)
235
+ HF_HUB_USER_AGENT_ORIGIN: Optional[str] = os.environ.get("HF_HUB_USER_AGENT_ORIGIN")
236
+
237
+ # List frameworks that are handled by the InferenceAPI service. Useful to scan endpoints and check which models are
238
+ # deployed and running. Since 95% of the models are using the top 4 frameworks listed below, we scan only those by
239
+ # default. We still keep the full list of supported frameworks in case we want to scan all of them.
240
+ MAIN_INFERENCE_API_FRAMEWORKS = [
241
+ "diffusers",
242
+ "sentence-transformers",
243
+ "text-generation-inference",
244
+ "transformers",
245
+ ]
246
+
247
+ ALL_INFERENCE_API_FRAMEWORKS = MAIN_INFERENCE_API_FRAMEWORKS + [
248
+ "adapter-transformers",
249
+ "allennlp",
250
+ "asteroid",
251
+ "bertopic",
252
+ "doctr",
253
+ "espnet",
254
+ "fairseq",
255
+ "fastai",
256
+ "fasttext",
257
+ "flair",
258
+ "k2",
259
+ "keras",
260
+ "mindspore",
261
+ "nemo",
262
+ "open_clip",
263
+ "paddlenlp",
264
+ "peft",
265
+ "pyannote-audio",
266
+ "sklearn",
267
+ "spacy",
268
+ "span-marker",
269
+ "speechbrain",
270
+ "stanza",
271
+ "timm",
272
+ ]
273
+
274
+ # If OAuth didn't work after 2 redirects, there's likely a third-party cookie issue in the Space iframe view.
275
+ # In this case, we redirect the user to the non-iframe view.
276
+ OAUTH_MAX_REDIRECTS = 2
277
+
278
+ # OAuth-related environment variables injected by the Space
279
+ OAUTH_CLIENT_ID = os.environ.get("OAUTH_CLIENT_ID")
280
+ OAUTH_CLIENT_SECRET = os.environ.get("OAUTH_CLIENT_SECRET")
281
+ OAUTH_SCOPES = os.environ.get("OAUTH_SCOPES")
282
+ OPENID_PROVIDER_URL = os.environ.get("OPENID_PROVIDER_URL")
283
+
284
+ # Xet constants
285
+ HUGGINGFACE_HEADER_X_XET_ENDPOINT = "X-Xet-Cas-Url"
286
+ HUGGINGFACE_HEADER_X_XET_ACCESS_TOKEN = "X-Xet-Access-Token"
287
+ HUGGINGFACE_HEADER_X_XET_EXPIRATION = "X-Xet-Token-Expiration"
288
+ HUGGINGFACE_HEADER_X_XET_HASH = "X-Xet-Hash"
289
+ HUGGINGFACE_HEADER_X_XET_REFRESH_ROUTE = "X-Xet-Refresh-Route"
290
+ HUGGINGFACE_HEADER_LINK_XET_AUTH_KEY = "xet-auth"
291
+
292
+ default_xet_cache_path = os.path.join(HF_HOME, "xet")
293
+ HF_XET_CACHE = os.getenv("HF_XET_CACHE", default_xet_cache_path)
294
+ HF_HUB_DISABLE_XET: bool = _is_true(os.environ.get("HF_HUB_DISABLE_XET"))
venv/lib/python3.13/site-packages/huggingface_hub/dataclasses.py ADDED
@@ -0,0 +1,484 @@
1
+ import inspect
2
+ from dataclasses import _MISSING_TYPE, MISSING, Field, field, fields
3
+ from functools import wraps
4
+ from typing import (
5
+ Any,
6
+ Callable,
7
+ Dict,
8
+ ForwardRef,
9
+ List,
10
+ Literal,
11
+ Optional,
12
+ Tuple,
13
+ Type,
14
+ TypeVar,
15
+ Union,
16
+ get_args,
17
+ get_origin,
18
+ overload,
19
+ )
20
+
21
+ from .errors import (
22
+ StrictDataclassClassValidationError,
23
+ StrictDataclassDefinitionError,
24
+ StrictDataclassFieldValidationError,
25
+ )
26
+
27
+
28
+ Validator_T = Callable[[Any], None]
29
+ T = TypeVar("T")
30
+
31
+
32
+ # The overload decorator helps type checkers understand the different return types
33
+ @overload
34
+ def strict(cls: Type[T]) -> Type[T]: ...
35
+
36
+
37
+ @overload
38
+ def strict(*, accept_kwargs: bool = False) -> Callable[[Type[T]], Type[T]]: ...
39
+
40
+
41
+ def strict(
42
+ cls: Optional[Type[T]] = None, *, accept_kwargs: bool = False
43
+ ) -> Union[Type[T], Callable[[Type[T]], Type[T]]]:
44
+ """
45
+ Decorator to add strict validation to a dataclass.
46
+
47
+ This decorator must be used on top of `@dataclass` to ensure IDEs and static typing tools
48
+ recognize the class as a dataclass.
49
+
50
+ Can be used with or without arguments:
51
+ - `@strict`
52
+ - `@strict(accept_kwargs=True)`
53
+
54
+ Args:
55
+ cls:
56
+ The class to convert to a strict dataclass.
57
+ accept_kwargs (`bool`, *optional*):
58
+ If True, allows arbitrary keyword arguments in `__init__`. Defaults to False.
59
+
60
+ Returns:
61
+ The enhanced dataclass with strict validation on field assignment.
62
+
63
+ Example:
64
+ ```py
65
+ >>> from dataclasses import dataclass
66
+ >>> from huggingface_hub.dataclasses import as_validated_field, strict, validated_field
67
+
68
+ >>> @as_validated_field
69
+ >>> def positive_int(value: int):
70
+ ... if not value >= 0:
71
+ ... raise ValueError(f"Value must be positive, got {value}")
72
+
73
+ >>> @strict(accept_kwargs=True)
74
+ ... @dataclass
75
+ ... class User:
76
+ ... name: str
77
+ ... age: int = positive_int(default=10)
78
+
79
+ # Initialize
80
+ >>> User(name="John")
81
+ User(name='John', age=10)
82
+
83
+ # Extra kwargs are accepted
84
+ >>> User(name="John", age=30, lastname="Doe")
85
+ User(name='John', age=30, *lastname='Doe')
86
+
87
+ # Invalid type => raises
88
+ >>> User(name="John", age="30")
89
+ huggingface_hub.errors.StrictDataclassFieldValidationError: Validation error for field 'age':
90
+ TypeError: Field 'age' expected int, got str (value: '30')
91
+
92
+ # Invalid value => raises
93
+ >>> User(name="John", age=-1)
94
+ huggingface_hub.errors.StrictDataclassFieldValidationError: Validation error for field 'age':
95
+ ValueError: Value must be positive, got -1
96
+ ```
97
+ """
98
+
99
+ def wrap(cls: Type[T]) -> Type[T]:
100
+ if not hasattr(cls, "__dataclass_fields__"):
101
+ raise StrictDataclassDefinitionError(
102
+ f"Class '{cls.__name__}' must be a dataclass before applying @strict."
103
+ )
104
+
105
+ # List and store validators
106
+ field_validators: Dict[str, List[Validator_T]] = {}
107
+ for f in fields(cls): # type: ignore [arg-type]
108
+ validators = []
109
+ validators.append(_create_type_validator(f))
110
+ custom_validator = f.metadata.get("validator")
111
+ if custom_validator is not None:
112
+ if not isinstance(custom_validator, list):
113
+ custom_validator = [custom_validator]
114
+ for validator in custom_validator:
115
+ if not _is_validator(validator):
116
+ raise StrictDataclassDefinitionError(
117
+ f"Invalid validator for field '{f.name}': {validator}. Must be a callable taking a single argument."
118
+ )
119
+ validators.extend(custom_validator)
120
+ field_validators[f.name] = validators
121
+ cls.__validators__ = field_validators # type: ignore
122
+
123
+ # Override __setattr__ to validate fields on assignment
124
+ original_setattr = cls.__setattr__
125
+
126
+ def __strict_setattr__(self: Any, name: str, value: Any) -> None:
127
+ """Custom __setattr__ method for strict dataclasses."""
128
+ # Run all validators
129
+ for validator in self.__validators__.get(name, []):
130
+ try:
131
+ validator(value)
132
+ except (ValueError, TypeError) as e:
133
+ raise StrictDataclassFieldValidationError(field=name, cause=e) from e
134
+
135
+ # If validation passed, set the attribute
136
+ original_setattr(self, name, value)
137
+
138
+ cls.__setattr__ = __strict_setattr__ # type: ignore[method-assign]
139
+
140
+ if accept_kwargs:
141
+ # (optional) Override __init__ to accept arbitrary keyword arguments
142
+ original_init = cls.__init__
143
+
144
+ @wraps(original_init)
145
+ def __init__(self, **kwargs: Any) -> None:
146
+ # Extract only the fields that are part of the dataclass
147
+ dataclass_fields = {f.name for f in fields(cls)} # type: ignore [arg-type]
148
+ standard_kwargs = {k: v for k, v in kwargs.items() if k in dataclass_fields}
149
+
150
+ # Call the original __init__ with standard fields
151
+ original_init(self, **standard_kwargs)
152
+
153
+ # Add any additional kwargs as attributes
154
+ for name, value in kwargs.items():
155
+ if name not in dataclass_fields:
156
+ self.__setattr__(name, value)
157
+
158
+ cls.__init__ = __init__ # type: ignore[method-assign]
159
+
160
+ # (optional) Override __repr__ to include additional kwargs
161
+ original_repr = cls.__repr__
162
+
163
+ @wraps(original_repr)
164
+ def __repr__(self) -> str:
165
+ # Call the original __repr__ to get the standard fields
166
+ standard_repr = original_repr(self)
167
+
168
+ # Get additional kwargs
169
+ additional_kwargs = [
170
+ # add a '*' in front of additional kwargs to let the user know they are not part of the dataclass
171
+ f"*{k}={v!r}"
172
+ for k, v in self.__dict__.items()
173
+ if k not in cls.__dataclass_fields__ # type: ignore [attr-defined]
174
+ ]
175
+ additional_repr = ", ".join(additional_kwargs)
176
+
177
+ # Combine both representations
178
+ return f"{standard_repr[:-1]}, {additional_repr})" if additional_kwargs else standard_repr
179
+
180
+ cls.__repr__ = __repr__ # type: ignore [method-assign]
181
+
182
+ # List all public methods starting with `validate_` => class validators.
183
+ class_validators = []
184
+
185
+ for name in dir(cls):
186
+ if not name.startswith("validate_"):
187
+ continue
188
+ method = getattr(cls, name)
189
+ if not callable(method):
190
+ continue
191
+ if len(inspect.signature(method).parameters) != 1:
192
+ raise StrictDataclassDefinitionError(
193
+ f"Class '{cls.__name__}' has a class validator '{name}' that takes more than one argument."
194
+ " Class validators must take only 'self' as an argument. Methods starting with 'validate_'"
195
+ " are considered to be class validators."
196
+ )
197
+ class_validators.append(method)
198
+
199
+ cls.__class_validators__ = class_validators # type: ignore [attr-defined]
200
+
201
+ # Add `validate` method to the class, but first check if it already exists
202
+ def validate(self: T) -> None:
203
+ """Run class validators on the instance."""
204
+ for validator in cls.__class_validators__: # type: ignore [attr-defined]
205
+ try:
206
+ validator(self)
207
+ except (ValueError, TypeError) as e:
208
+ raise StrictDataclassClassValidationError(validator=validator.__name__, cause=e) from e
209
+
210
+ # Hack to be able to raise if `.validate()` already exists except if it was created by this decorator on a parent class
211
+ # (in which case we just override it)
212
+ validate.__is_defined_by_strict_decorator__ = True # type: ignore [attr-defined]
213
+
214
+ if hasattr(cls, "validate"):
215
+ if not getattr(cls.validate, "__is_defined_by_strict_decorator__", False): # type: ignore [attr-defined]
216
+ raise StrictDataclassDefinitionError(
217
+ f"Class '{cls.__name__}' already implements a method called 'validate'."
218
+ " This method name is reserved when using the @strict decorator on a dataclass."
219
+ " If you want to keep your own method, please rename it."
220
+ )
221
+
222
+ cls.validate = validate # type: ignore
223
+
224
+ # Run class validators after initialization
225
+ initial_init = cls.__init__
226
+
227
+ @wraps(initial_init)
228
+ def init_with_validate(self, *args, **kwargs) -> None:
229
+ """Run class validators after initialization."""
230
+ initial_init(self, *args, **kwargs) # type: ignore [call-arg]
231
+ cls.validate(self) # type: ignore [attr-defined]
232
+
233
+ setattr(cls, "__init__", init_with_validate)
234
+
235
+ return cls
236
+
237
+ # Return wrapped class or the decorator itself
238
+ return wrap(cls) if cls is not None else wrap
239
+
240
+
241
+ def validated_field(
242
+ validator: Union[List[Validator_T], Validator_T],
243
+ default: Union[Any, _MISSING_TYPE] = MISSING,
244
+ default_factory: Union[Callable[[], Any], _MISSING_TYPE] = MISSING,
245
+ init: bool = True,
246
+ repr: bool = True,
247
+ hash: Optional[bool] = None,
248
+ compare: bool = True,
249
+ metadata: Optional[Dict] = None,
250
+ **kwargs: Any,
251
+ ) -> Any:
252
+ """
253
+ Create a dataclass field with a custom validator.
254
+
255
+ Useful to apply several checks to a field. If only applying one rule, check out the [`as_validated_field`] decorator.
256
+
257
+ Args:
258
+ validator (`Callable` or `List[Callable]`):
259
+ A method that takes a value as input and raises ValueError/TypeError if the value is invalid.
260
+ Can be a list of validators to apply multiple checks.
261
+ **kwargs:
262
+ Additional arguments to pass to `dataclasses.field()`.
263
+
264
+ Returns:
265
+ A field with the validator attached in metadata
266
+ """
267
+ if not isinstance(validator, list):
268
+ validator = [validator]
269
+ if metadata is None:
270
+ metadata = {}
271
+ metadata["validator"] = validator
272
+ return field( # type: ignore
273
+ default=default, # type: ignore [arg-type]
274
+ default_factory=default_factory, # type: ignore [arg-type]
275
+ init=init,
276
+ repr=repr,
277
+ hash=hash,
278
+ compare=compare,
279
+ metadata=metadata,
280
+ **kwargs,
281
+ )
282
+
283
+
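+ # Illustrative sketch (editor's addition, not upstream huggingface_hub code):
+ # stacking several checks on one field. `_non_empty`, `_lowercase` and `Repo` are
+ # hypothetical names; `from dataclasses import dataclass` is assumed in scope.
+ #
+ #     def _non_empty(value: str) -> None:
+ #         if not value:
+ #             raise ValueError("must not be empty")
+ #
+ #     def _lowercase(value: str) -> None:
+ #         if value != value.lower():
+ #             raise ValueError("must be lowercase")
+ #
+ #     @strict
+ #     @dataclass
+ #     class Repo:
+ #         name: str = validated_field([_non_empty, _lowercase], default="repo")
+ #
+ #     Repo(name="MyRepo")  # raises StrictDataclassFieldValidationError (not lowercase)
+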
284
+ def as_validated_field(validator: Validator_T):
285
+ """
286
+ Decorates a validator function as a [`validated_field`] (i.e. a dataclass field with a custom validator).
287
+
288
+ Args:
289
+ validator (`Callable`):
290
+ A method that takes a value as input and raises ValueError/TypeError if the value is invalid.
291
+ """
292
+
293
+ def _inner(
294
+ default: Union[Any, _MISSING_TYPE] = MISSING,
295
+ default_factory: Union[Callable[[], Any], _MISSING_TYPE] = MISSING,
296
+ init: bool = True,
297
+ repr: bool = True,
298
+ hash: Optional[bool] = None,
299
+ compare: bool = True,
300
+ metadata: Optional[Dict] = None,
301
+ **kwargs: Any,
302
+ ):
303
+ return validated_field(
304
+ validator,
305
+ default=default,
306
+ default_factory=default_factory,
307
+ init=init,
308
+ repr=repr,
309
+ hash=hash,
310
+ compare=compare,
311
+ metadata=metadata,
312
+ **kwargs,
313
+ )
314
+
315
+ return _inner
316
+
317
+
318
+ def type_validator(name: str, value: Any, expected_type: Any) -> None:
319
+ """Validate that 'value' matches 'expected_type'."""
320
+ origin = get_origin(expected_type)
321
+ args = get_args(expected_type)
322
+
323
+ if expected_type is Any:
324
+ return
325
+ elif validator := _BASIC_TYPE_VALIDATORS.get(origin):
326
+ validator(name, value, args)
327
+ elif isinstance(expected_type, type): # simple types
328
+ _validate_simple_type(name, value, expected_type)
329
+ elif isinstance(expected_type, ForwardRef) or isinstance(expected_type, str):
330
+ return
331
+ else:
332
+ raise TypeError(f"Unsupported type for field '{name}': {expected_type}")
333
+
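+ # Illustrative sketch (editor's addition, not upstream huggingface_hub code):
+ # how dispatch works for a few hypothetical calls:
+ #   type_validator("ids", [1, 2], List[int])       -> returns None (valid)
+ #   type_validator("ids", [1, "a"], List[int])     -> raises TypeError (bad item at index 1)
+ #   type_validator("mode", "x", Literal["a", "b"]) -> raises TypeError (not an allowed value)
+ # `get_origin(List[int])` is `list`, so the check is routed to `_validate_list` via the
+ # `_BASIC_TYPE_VALIDATORS` table defined near the bottom of this module.
+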
334
+
335
+ def _validate_union(name: str, value: Any, args: Tuple[Any, ...]) -> None:
336
+ """Validate that value matches one of the types in a Union."""
337
+ errors = []
338
+ for t in args:
339
+ try:
340
+ type_validator(name, value, t)
341
+ return # Valid if any type matches
342
+ except TypeError as e:
343
+ errors.append(str(e))
344
+
345
+ raise TypeError(
346
+ f"Field '{name}' with value {repr(value)} doesn't match any type in {args}. Errors: {'; '.join(errors)}"
347
+ )
348
+
349
+
350
+ def _validate_literal(name: str, value: Any, args: Tuple[Any, ...]) -> None:
351
+ """Validate Literal type."""
352
+ if value not in args:
353
+ raise TypeError(f"Field '{name}' expected one of {args}, got {value}")
354
+
355
+
356
+ def _validate_list(name: str, value: Any, args: Tuple[Any, ...]) -> None:
357
+ """Validate List[T] type."""
358
+ if not isinstance(value, list):
359
+ raise TypeError(f"Field '{name}' expected a list, got {type(value).__name__}")
360
+
361
+ # Validate each item in the list
362
+ item_type = args[0]
363
+ for i, item in enumerate(value):
364
+ try:
365
+ type_validator(f"{name}[{i}]", item, item_type)
366
+ except TypeError as e:
367
+ raise TypeError(f"Invalid item at index {i} in list '{name}'") from e
368
+
369
+
370
+ def _validate_dict(name: str, value: Any, args: Tuple[Any, ...]) -> None:
371
+ """Validate Dict[K, V] type."""
372
+ if not isinstance(value, dict):
373
+ raise TypeError(f"Field '{name}' expected a dict, got {type(value).__name__}")
374
+
375
+ # Validate keys and values
376
+ key_type, value_type = args
377
+ for k, v in value.items():
378
+ try:
379
+ type_validator(f"{name}.key", k, key_type)
380
+ type_validator(f"{name}[{k!r}]", v, value_type)
381
+ except TypeError as e:
382
+ raise TypeError(f"Invalid key or value in dict '{name}'") from e
383
+
384
+
385
+ def _validate_tuple(name: str, value: Any, args: Tuple[Any, ...]) -> None:
386
+ """Validate Tuple type."""
387
+ if not isinstance(value, tuple):
388
+ raise TypeError(f"Field '{name}' expected a tuple, got {type(value).__name__}")
389
+
390
+ # Handle variable-length tuples: Tuple[T, ...]
391
+ if len(args) == 2 and args[1] is Ellipsis:
392
+ for i, item in enumerate(value):
393
+ try:
394
+ type_validator(f"{name}[{i}]", item, args[0])
395
+ except TypeError as e:
396
+ raise TypeError(f"Invalid item at index {i} in tuple '{name}'") from e
397
+ # Handle fixed-length tuples: Tuple[T1, T2, ...]
398
+ elif len(args) != len(value):
399
+ raise TypeError(f"Field '{name}' expected a tuple of length {len(args)}, got {len(value)}")
400
+ else:
401
+ for i, (item, expected) in enumerate(zip(value, args)):
402
+ try:
403
+ type_validator(f"{name}[{i}]", item, expected)
404
+ except TypeError as e:
405
+ raise TypeError(f"Invalid item at index {i} in tuple '{name}'") from e
406
+
407
+
408
+ def _validate_set(name: str, value: Any, args: Tuple[Any, ...]) -> None:
409
+ """Validate Set[T] type."""
410
+ if not isinstance(value, set):
411
+ raise TypeError(f"Field '{name}' expected a set, got {type(value).__name__}")
412
+
413
+ # Validate each item in the set
414
+ item_type = args[0]
415
+ for i, item in enumerate(value):
416
+ try:
417
+ type_validator(f"{name} item", item, item_type)
418
+ except TypeError as e:
419
+ raise TypeError(f"Invalid item in set '{name}'") from e
420
+
421
+
422
+ def _validate_simple_type(name: str, value: Any, expected_type: type) -> None:
423
+ """Validate simple type (int, str, etc.)."""
424
+ if not isinstance(value, expected_type):
425
+ raise TypeError(
426
+ f"Field '{name}' expected {expected_type.__name__}, got {type(value).__name__} (value: {repr(value)})"
427
+ )
428
+
429
+
430
+ def _create_type_validator(field: Field) -> Validator_T:
431
+ """Create a type validator function for a field."""
432
+ # Hacky: we cannot use a lambda here because of reference issues
433
+
434
+ def validator(value: Any) -> None:
435
+ type_validator(field.name, value, field.type)
436
+
437
+ return validator
438
+
439
+
440
+ def _is_validator(validator: Any) -> bool:
441
+ """Check if a function is a validator.
442
+
443
+ A validator is a Callable that can be called with a single positional argument.
444
+ The validator can have more arguments with default values.
445
+
446
+ Basically, returns True if `validator(value)` is possible.
447
+ """
448
+ if not callable(validator):
449
+ return False
450
+
451
+ signature = inspect.signature(validator)
452
+ parameters = list(signature.parameters.values())
453
+ if len(parameters) == 0:
454
+ return False
455
+ if parameters[0].kind not in (
456
+ inspect.Parameter.POSITIONAL_OR_KEYWORD,
457
+ inspect.Parameter.POSITIONAL_ONLY,
458
+ inspect.Parameter.VAR_POSITIONAL,
459
+ ):
460
+ return False
461
+ for parameter in parameters[1:]:
462
+ if parameter.default == inspect.Parameter.empty:
463
+ return False
464
+ return True
465
+
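+ # Illustrative sketch (editor's addition, not upstream huggingface_hub code):
+ # what `_is_validator` accepts and rejects:
+ #   _is_validator(lambda v: None)           -> True   (one positional argument)
+ #   _is_validator(lambda v, strict=True: 0) -> True   (extra args all have defaults)
+ #   _is_validator(lambda: None)             -> False  (cannot be called with a value)
+ #   _is_validator(lambda a, b: None)        -> False  (second argument lacks a default)
+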
466
+
467
+ _BASIC_TYPE_VALIDATORS = {
468
+ Union: _validate_union,
469
+ Literal: _validate_literal,
470
+ list: _validate_list,
471
+ dict: _validate_dict,
472
+ tuple: _validate_tuple,
473
+ set: _validate_set,
474
+ }
475
+
476
+
477
+ __all__ = [
478
+ "strict",
479
+ "validated_field",
480
+ "Validator_T",
481
+ "StrictDataclassClassValidationError",
482
+ "StrictDataclassDefinitionError",
483
+ "StrictDataclassFieldValidationError",
484
+ ]
venv/lib/python3.13/site-packages/huggingface_hub/errors.py ADDED
@@ -0,0 +1,377 @@
1
+ """Contains all custom errors."""
2
+
3
+ from pathlib import Path
4
+ from typing import Optional, Union
5
+
6
+ from requests import HTTPError, Response
7
+
8
+
9
+ # CACHE ERRORS
10
+
11
+
12
+ class CacheNotFound(Exception):
13
+ """Exception thrown when the Huggingface cache is not found."""
14
+
15
+ cache_dir: Union[str, Path]
16
+
17
+ def __init__(self, msg: str, cache_dir: Union[str, Path], *args, **kwargs):
18
+ super().__init__(msg, *args, **kwargs)
19
+ self.cache_dir = cache_dir
20
+
21
+
22
+ class CorruptedCacheException(Exception):
23
+ """Exception for any unexpected structure in the Huggingface cache-system."""
24
+
25
+
26
+ # HEADERS ERRORS
27
+
28
+
29
+ class LocalTokenNotFoundError(EnvironmentError):
30
+ """Raised if local token is required but not found."""
31
+
32
+
33
+ # HTTP ERRORS
34
+
35
+
36
+ class OfflineModeIsEnabled(ConnectionError):
37
+ """Raised when a request is made but `HF_HUB_OFFLINE=1` is set as environment variable."""
38
+
39
+
40
+ class HfHubHTTPError(HTTPError):
41
+ """
42
+ HTTPError to inherit from for any custom HTTP Error raised in HF Hub.
43
+
44
+ Any HTTPError is converted at least into a `HfHubHTTPError`. If some information is
45
+ sent back by the server, it will be added to the error message.
46
+
47
+ Added details:
48
+ - Request id from the "X-Request-Id" header if it exists, falling back to the "X-Amzn-Trace-Id" header otherwise.
49
+ - Server error message from the header "X-Error-Message".
50
+ - Server error message if one can be found in the response body.
51
+
52
+ Example:
53
+ ```py
54
+ import requests
55
+ from huggingface_hub.utils import get_session, hf_raise_for_status, HfHubHTTPError
56
+
57
+ response = get_session().post(...)
58
+ try:
59
+ hf_raise_for_status(response)
60
+ except HfHubHTTPError as e:
61
+ print(str(e)) # formatted message
62
+ e.request_id, e.server_message # details returned by server
63
+
64
+ # Complete the error message with additional information once it's raised
65
+ e.append_to_message("\n`create_commit` expects the repository to exist.")
66
+ raise
67
+ ```
68
+ """
69
+
70
+ def __init__(self, message: str, response: Optional[Response] = None, *, server_message: Optional[str] = None):
71
+ self.request_id = (
72
+ response.headers.get("x-request-id") or response.headers.get("X-Amzn-Trace-Id")
73
+ if response is not None
74
+ else None
75
+ )
76
+ self.server_message = server_message
77
+
78
+ super().__init__(
79
+ message,
80
+ response=response, # type: ignore [arg-type]
81
+ request=response.request if response is not None else None, # type: ignore [arg-type]
82
+ )
83
+
84
+ def append_to_message(self, additional_message: str) -> None:
85
+ """Append additional information to the `HfHubHTTPError` initial message."""
86
+ self.args = (self.args[0] + additional_message,) + self.args[1:]
87
+
88
+
89
+ # INFERENCE CLIENT ERRORS
90
+
91
+
92
+ class InferenceTimeoutError(HTTPError, TimeoutError):
93
+ """Error raised when a model is unavailable or the request times out."""
94
+
95
+
96
+ # INFERENCE ENDPOINT ERRORS
97
+
98
+
99
+ class InferenceEndpointError(Exception):
100
+ """Generic exception when dealing with Inference Endpoints."""
101
+
102
+
103
+ class InferenceEndpointTimeoutError(InferenceEndpointError, TimeoutError):
104
+ """Exception for timeouts while waiting for Inference Endpoint."""
105
+
106
+
107
+ # SAFETENSORS ERRORS
108
+
109
+
110
+ class SafetensorsParsingError(Exception):
111
+ """Raised when failing to parse a safetensors file metadata.
112
+
113
+ This can be the case if the file is not a safetensors file or does not respect the specification.
114
+ """
115
+
116
+
117
+ class NotASafetensorsRepoError(Exception):
118
+ """Raised when a repo is not a Safetensors repo i.e. doesn't have either a `model.safetensors` or a
119
+ `model.safetensors.index.json` file.
120
+ """
121
+
122
+
123
+ # TEXT GENERATION ERRORS
124
+
125
+
126
+ class TextGenerationError(HTTPError):
127
+ """Generic error raised if text-generation went wrong."""
128
+
129
+
130
+ # Text Generation Inference Errors
131
+ class ValidationError(TextGenerationError):
132
+ """Server-side validation error."""
133
+
134
+
135
+ class GenerationError(TextGenerationError):
136
+ pass
137
+
138
+
139
+ class OverloadedError(TextGenerationError):
140
+ pass
141
+
142
+
143
+ class IncompleteGenerationError(TextGenerationError):
144
+ pass
145
+
146
+
147
+ class UnknownError(TextGenerationError):
148
+ pass
149
+
150
+
151
+ # VALIDATION ERRORS
152
+
153
+
154
+ class HFValidationError(ValueError):
155
+ """Generic exception thrown by `huggingface_hub` validators.
156
+
157
+ Inherits from [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError).
158
+ """
159
+
160
+
161
+ # FILE METADATA ERRORS
162
+
163
+
164
+ class FileMetadataError(OSError):
165
+ """Error triggered when the metadata of a file on the Hub cannot be retrieved (missing ETag or commit_hash).
166
+
167
+ Inherits from `OSError` for backward compatibility.
168
+ """
169
+
170
+
171
+ # REPOSITORY ERRORS
172
+
173
+
174
+ class RepositoryNotFoundError(HfHubHTTPError):
175
+ """
176
+ Raised when trying to access a hf.co URL with an invalid repository name, or
177
+ with a private repo name the user does not have access to.
178
+
179
+ Example:
180
+
181
+ ```py
182
+ >>> from huggingface_hub import model_info
183
+ >>> model_info("<non_existent_repository>")
184
+ (...)
185
+ huggingface_hub.utils._errors.RepositoryNotFoundError: 401 Client Error. (Request ID: PvMw_VjBMjVdMz53WKIzP)
186
+
187
+ Repository Not Found for url: https://huggingface.co/api/models/%3Cnon_existent_repository%3E.
188
+ Please make sure you specified the correct `repo_id` and `repo_type`.
189
+ If the repo is private, make sure you are authenticated.
190
+ Invalid username or password.
191
+ ```
192
+ """
193
+
194
+
195
+ class GatedRepoError(RepositoryNotFoundError):
196
+ """
197
+ Raised when trying to access a gated repository for which the user is not on the
198
+ authorized list.
199
+
200
+ Note: derives from `RepositoryNotFoundError` to ensure backward compatibility.
201
+
202
+ Example:
203
+
204
+ ```py
205
+ >>> from huggingface_hub import model_info
206
+ >>> model_info("<gated_repository>")
207
+ (...)
208
+ huggingface_hub.utils._errors.GatedRepoError: 403 Client Error. (Request ID: ViT1Bf7O_026LGSQuVqfa)
209
+
210
+ Cannot access gated repo for url https://huggingface.co/api/models/ardent-figment/gated-model.
211
+ Access to model ardent-figment/gated-model is restricted and you are not in the authorized list.
212
+ Visit https://huggingface.co/ardent-figment/gated-model to ask for access.
213
+ ```
214
+ """
215
+
216
+
217
+ class DisabledRepoError(HfHubHTTPError):
218
+ """
219
+ Raised when trying to access a repository that has been disabled by its author.
220
+
221
+ Example:
222
+
223
+ ```py
224
+ >>> from huggingface_hub import dataset_info
225
+ >>> dataset_info("laion/laion-art")
226
+ (...)
227
+ huggingface_hub.utils._errors.DisabledRepoError: 403 Client Error. (Request ID: Root=1-659fc3fa-3031673e0f92c71a2260dbe2;bc6f4dfb-b30a-4862-af0a-5cfe827610d8)
228
+
229
+ Cannot access repository for url https://huggingface.co/api/datasets/laion/laion-art.
230
+ Access to this resource is disabled.
231
+ ```
232
+ """
233
+
234
+
235
+ # REVISION ERROR
236
+
237
+
238
+ class RevisionNotFoundError(HfHubHTTPError):
239
+ """
240
+ Raised when trying to access a hf.co URL with a valid repository but an invalid
241
+ revision.
242
+
243
+ Example:
244
+
245
+ ```py
246
+ >>> from huggingface_hub import hf_hub_download
247
+ >>> hf_hub_download('bert-base-cased', 'config.json', revision='<non-existent-revision>')
248
+ (...)
249
+ huggingface_hub.utils._errors.RevisionNotFoundError: 404 Client Error. (Request ID: Mwhe_c3Kt650GcdKEFomX)
250
+
251
+ Revision Not Found for url: https://huggingface.co/bert-base-cased/resolve/%3Cnon-existent-revision%3E/config.json.
252
+ ```
253
+ """
254
+
255
+
256
+ # ENTRY ERRORS
257
+ class EntryNotFoundError(HfHubHTTPError):
258
+ """
259
+ Raised when trying to access a hf.co URL with a valid repository and revision
260
+ but an invalid filename.
261
+
262
+ Example:
263
+
264
+ ```py
265
+ >>> from huggingface_hub import hf_hub_download
266
+ >>> hf_hub_download('bert-base-cased', '<non-existent-file>')
267
+ (...)
268
+ huggingface_hub.utils._errors.EntryNotFoundError: 404 Client Error. (Request ID: 53pNl6M0MxsnG5Sw8JA6x)
269
+
270
+ Entry Not Found for url: https://huggingface.co/bert-base-cased/resolve/main/%3Cnon-existent-file%3E.
271
+ ```
272
+ """
273
+
274
+
275
+ class LocalEntryNotFoundError(EntryNotFoundError, FileNotFoundError, ValueError):
276
+ """
277
+ Raised when trying to access a file or snapshot that is not on the disk when network is
278
+ disabled or unavailable (connection issue). The entry may exist on the Hub.
279
+
280
+ Note: `ValueError` type is to ensure backward compatibility.
281
+ Note: `LocalEntryNotFoundError` derives from `HTTPError` because of `EntryNotFoundError`
282
+ even when it is not a network issue.
283
+
284
+ Example:
285
+
286
+ ```py
287
+ >>> from huggingface_hub import hf_hub_download
288
+ >>> hf_hub_download('bert-base-cased', '<non-cached-file>', local_files_only=True)
289
+ (...)
290
+ huggingface_hub.utils._errors.LocalEntryNotFoundError: Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable hf.co look-ups and downloads online, set 'local_files_only' to False.
291
+ ```
292
+ """
293
+
294
+ def __init__(self, message: str):
295
+ super().__init__(message, response=None)
296
+
297
+
298
+ # REQUEST ERROR
299
+ class BadRequestError(HfHubHTTPError, ValueError):
300
+ """
301
+ Raised by `hf_raise_for_status` when the server returns a HTTP 400 error.
302
+
303
+ Example:
304
+
305
+ ```py
306
+ >>> resp = requests.post("hf.co/api/check", ...)
307
+ >>> hf_raise_for_status(resp, endpoint_name="check")
308
+ huggingface_hub.utils._errors.BadRequestError: Bad request for check endpoint: {details} (Request ID: XXX)
309
+ ```
310
+ """
311
+
312
+
313
+ # DDUF file format ERROR
314
+
315
+
316
+ class DDUFError(Exception):
317
+ """Base exception for errors related to the DDUF format."""
318
+
319
+
320
+ class DDUFCorruptedFileError(DDUFError):
321
+ """Exception thrown when the DDUF file is corrupted."""
322
+
323
+
324
+ class DDUFExportError(DDUFError):
325
+ """Base exception for errors during DDUF export."""
326
+
327
+
328
+ class DDUFInvalidEntryNameError(DDUFExportError):
329
+ """Exception thrown when the entry name is invalid."""
330
+
331
+
332
+ # STRICT DATACLASSES ERRORS
333
+
334
+
335
+ class StrictDataclassError(Exception):
336
+ """Base exception for strict dataclasses."""
337
+
338
+
339
+ class StrictDataclassDefinitionError(StrictDataclassError):
340
+ """Exception thrown when a strict dataclass is defined incorrectly."""
341
+
342
+
343
+ class StrictDataclassFieldValidationError(StrictDataclassError):
344
+ """Exception thrown when a strict dataclass fails validation for a given field."""
345
+
346
+ def __init__(self, field: str, cause: Exception):
347
+ error_message = f"Validation error for field '{field}':"
348
+ error_message += f"\n {cause.__class__.__name__}: {cause}"
349
+ super().__init__(error_message)
350
+
351
+
352
+ class StrictDataclassClassValidationError(StrictDataclassError):
353
+ """Exception thrown when a strict dataclass fails validation on a class validator."""
354
+
355
+ def __init__(self, validator: str, cause: Exception):
356
+ error_message = f"Class validation error for validator '{validator}':"
357
+ error_message += f"\n {cause.__class__.__name__}: {cause}"
358
+ super().__init__(error_message)
359
+
360
+
361
+ # XET ERRORS
362
+
363
+
364
+ class XetError(Exception):
365
+ """Base exception for errors related to Xet Storage."""
366
+
367
+
368
+ class XetAuthorizationError(XetError):
369
+ """Exception thrown when the user does not have the right authorization to use Xet Storage."""
370
+
371
+
372
+ class XetRefreshTokenError(XetError):
373
+ """Exception thrown when the refresh token is invalid."""
374
+
375
+
376
+ class XetDownloadError(Exception):
377
+ """Exception thrown when the download from Xet Storage fails."""
venv/lib/python3.13/site-packages/huggingface_hub/fastai_utils.py ADDED
@@ -0,0 +1,415 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+ from pickle import DEFAULT_PROTOCOL, PicklingError
5
+ from typing import Any, Dict, List, Optional, Union
6
+
7
+ from packaging import version
8
+
9
+ from huggingface_hub import constants, snapshot_download
10
+ from huggingface_hub.hf_api import HfApi
11
+ from huggingface_hub.utils import (
12
+ SoftTemporaryDirectory,
13
+ get_fastai_version,
14
+ get_fastcore_version,
15
+ get_python_version,
16
+ )
17
+
18
+ from .utils import logging, validate_hf_hub_args
19
+ from .utils._runtime import _PY_VERSION # noqa: F401 # for backward compatibility...
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ def _check_fastai_fastcore_versions(
26
+ fastai_min_version: str = "2.4",
27
+ fastcore_min_version: str = "1.3.27",
28
+ ):
29
+ """
30
+ Checks that the installed fastai and fastcore versions are compatible for pickle serialization.
31
+
32
+ Args:
33
+ fastai_min_version (`str`, *optional*):
34
+ The minimum fastai version supported.
35
+ fastcore_min_version (`str`, *optional*):
36
+ The minimum fastcore version supported.
37
+
38
+ > [!TIP]
39
+ > Raises the following error:
40
+ >
41
+ > - [`ImportError`](https://docs.python.org/3/library/exceptions.html#ImportError)
42
+ > if the fastai or fastcore libraries are not available or are of an invalid version.
43
+ """
44
+
45
+ if get_fastai_version() == "N/A" or get_fastcore_version() == "N/A":
46
+ raise ImportError(
47
+ f"fastai>={fastai_min_version} and fastcore>={fastcore_min_version} are"
48
+ f" required. Currently using fastai=={get_fastai_version()} and"
49
+ f" fastcore=={get_fastcore_version()}."
50
+ )
51
+
52
+ current_fastai_version = version.Version(get_fastai_version())
53
+ current_fastcore_version = version.Version(get_fastcore_version())
54
+
55
+ if current_fastai_version < version.Version(fastai_min_version):
56
+ raise ImportError(
57
+ "`push_to_hub_fastai` and `from_pretrained_fastai` require a"
58
+ f" fastai>={fastai_min_version} version, but you are using fastai version"
59
+ f" {get_fastai_version()} which is incompatible. Upgrade with `pip install"
60
+ " fastai==2.5.6`."
61
+ )
62
+
63
+ if current_fastcore_version < version.Version(fastcore_min_version):
64
+ raise ImportError(
65
+ "`push_to_hub_fastai` and `from_pretrained_fastai` require a"
66
+ f" fastcore>={fastcore_min_version} version, but you are using fastcore"
67
+ f" version {get_fastcore_version()} which is incompatible. Upgrade with"
68
+ " `pip install fastcore==1.3.27`."
69
+ )
70
+
71
+
72
+ def _check_fastai_fastcore_pyproject_versions(
73
+ storage_folder: str,
74
+ fastai_min_version: str = "2.4",
75
+ fastcore_min_version: str = "1.3.27",
76
+ ):
77
+ """
78
+ Checks that the `pyproject.toml` file in the directory `storage_folder` has fastai and fastcore versions
79
+ that are compatible with `from_pretrained_fastai` and `push_to_hub_fastai`. If `pyproject.toml` does not exist
80
+ or does not contain versions for fastai and fastcore, then it logs a warning.
81
+
82
+ Args:
83
+ storage_folder (`str`):
84
+ Folder to look for the `pyproject.toml` file.
85
+ fastai_min_version (`str`, *optional*):
86
+ The minimum fastai version supported.
87
+ fastcore_min_version (`str`, *optional*):
88
+ The minimum fastcore version supported.
89
+
90
+ > [!TIP]
91
+ > Raises the following errors:
92
+ >
93
+ > - [`ImportError`](https://docs.python.org/3/library/exceptions.html#ImportError)
94
+ > if the `toml` module is not installed.
95
+ > - [`ImportError`](https://docs.python.org/3/library/exceptions.html#ImportError)
96
+ > if the `pyproject.toml` indicates a lower than minimum supported version of fastai or fastcore.
97
+ """
98
+
99
+ try:
100
+ import toml
101
+ except ModuleNotFoundError:
102
+ raise ImportError(
103
+ "`push_to_hub_fastai` and `from_pretrained_fastai` require the toml module."
104
+ " Install it with `pip install toml`."
105
+ )
106
+
107
+ # Checks that a `pyproject.toml`, with `build-system` and `requires` sections, exists in the repository. If so, get a list of required packages.
108
+ if not os.path.isfile(f"{storage_folder}/pyproject.toml"):
109
+ logger.warning(
110
+ "There is no `pyproject.toml` in the repository that contains the fastai"
111
+ " `Learner`. The `pyproject.toml` would allow us to verify that your fastai"
112
+ " and fastcore versions are compatible with those of the model you want to"
113
+ " load."
114
+ )
115
+ return
116
+ pyproject_toml = toml.load(f"{storage_folder}/pyproject.toml")
117
+
118
+ if "build-system" not in pyproject_toml.keys():
119
+ logger.warning(
120
+ "There is no `build-system` section in the pyproject.toml of the repository"
121
+ " that contains the fastai `Learner`. The `build-system` would allow us to"
122
+ " verify that your fastai and fastcore versions are compatible with those"
123
+ " of the model you want to load."
124
+ )
125
+ return
126
+ build_system_toml = pyproject_toml["build-system"]
127
+
128
+ if "requires" not in build_system_toml.keys():
129
+ logger.warning(
130
+ "There is no `requires` section in the pyproject.toml of the repository"
131
+ " that contains the fastai `Learner`. The `requires` would allow us to"
132
+ " verify that your fastai and fastcore versions are compatible with those"
133
+ " of the model you want to load."
134
+ )
135
+ return
136
+ package_versions = build_system_toml["requires"]
137
+
138
+ # Extracts the fastai and fastcore versions from `pyproject.toml` if available.
139
+ # If the package is specified but not the version (e.g. "fastai" instead of "fastai=2.4"), the default versions are the highest.
140
+ fastai_packages = [pck for pck in package_versions if pck.startswith("fastai")]
141
+ if len(fastai_packages) == 0:
142
+ logger.warning("The repository does not have a fastai version specified in the `pyproject.toml`.")
143
+ # fastai_version is an empty string if not specified
144
+ else:
145
+ fastai_version = str(fastai_packages[0]).partition("=")[2]
146
+ if fastai_version != "" and version.Version(fastai_version) < version.Version(fastai_min_version):
147
+ raise ImportError(
148
+ "`from_pretrained_fastai` requires"
149
+ f" fastai>={fastai_min_version} version but the model to load uses"
150
+ f" {fastai_version} which is incompatible."
151
+ )
152
+
153
+ fastcore_packages = [pck for pck in package_versions if pck.startswith("fastcore")]
154
+ if len(fastcore_packages) == 0:
155
+ logger.warning("The repository does not have a fastcore version specified in the `pyproject.toml`.")
156
+ # fastcore_version is an empty string if not specified
157
+ else:
158
+ fastcore_version = str(fastcore_packages[0]).partition("=")[2]
159
+ if fastcore_version != "" and version.Version(fastcore_version) < version.Version(fastcore_min_version):
160
+ raise ImportError(
161
+ "`from_pretrained_fastai` requires"
162
+ f" fastcore>={fastcore_min_version} version, but you are using fastcore"
163
+ f" version {fastcore_version} which is incompatible."
164
+ )
165
+
166
+
167
+ README_TEMPLATE = """---
168
+ tags:
169
+ - fastai
170
+ ---
171
+
172
+ # Amazing!
173
+
174
+ 🥳 Congratulations on hosting your fastai model on the Hugging Face Hub!
175
+
176
+ # Some next steps
177
+ 1. Fill out this model card with more information (see the template below and the [documentation here](https://huggingface.co/docs/hub/model-repos))!
178
+
179
+ 2. Create a demo in Gradio or Streamlit using 🤗 Spaces ([documentation here](https://huggingface.co/docs/hub/spaces)).
180
+
181
+ 3. Join the fastai community on the [Fastai Discord](https://discord.com/invite/YKrxeNn)!
182
+
183
+ Greetings fellow fastlearner 🤝! Don't forget to delete this content from your model card.
184
+
185
+
186
+ ---
187
+
188
+
189
+ # Model card
190
+
191
+ ## Model description
192
+ More information needed
193
+
194
+ ## Intended uses & limitations
195
+ More information needed
196
+
197
+ ## Training and evaluation data
198
+ More information needed
199
+ """
200
+
201
+ PYPROJECT_TEMPLATE = f"""[build-system]
202
+ requires = ["setuptools>=40.8.0", "wheel", "python={get_python_version()}", "fastai={get_fastai_version()}", "fastcore={get_fastcore_version()}"]
203
+ build-backend = "setuptools.build_meta:__legacy__"
204
+ """
205
+
206
+
207
+ def _create_model_card(repo_dir: Path):
208
+ """
209
+ Creates a model card for the repository.
210
+
211
+ Args:
212
+ repo_dir (`Path`):
213
+ Directory where model card is created.
214
+ """
215
+ readme_path = repo_dir / "README.md"
216
+
217
+ if not readme_path.exists():
218
+ with readme_path.open("w", encoding="utf-8") as f:
219
+ f.write(README_TEMPLATE)
220
+
221
+
222
+ def _create_model_pyproject(repo_dir: Path):
223
+ """
224
+ Creates a `pyproject.toml` for the repository.
225
+
226
+ Args:
227
+ repo_dir (`Path`):
228
+ Directory where `pyproject.toml` is created.
229
+ """
230
+ pyproject_path = repo_dir / "pyproject.toml"
231
+
232
+ if not pyproject_path.exists():
233
+ with pyproject_path.open("w", encoding="utf-8") as f:
234
+ f.write(PYPROJECT_TEMPLATE)
235
+
236
+
237
+ def _save_pretrained_fastai(
238
+ learner,
239
+ save_directory: Union[str, Path],
240
+ config: Optional[Dict[str, Any]] = None,
241
+ ):
242
+ """
243
+ Saves a fastai learner to `save_directory` in pickle format using the default pickle protocol for the version of python used.
244
+
245
+ Args:
246
+ learner (`Learner`):
247
+ The `fastai.Learner` you'd like to save.
248
+ save_directory (`str` or `Path`):
249
+ Specific directory in which you want to save the fastai learner.
250
+ config (`dict`, *optional*):
251
+ Configuration object. Will be uploaded as a .json file. Example: 'https://huggingface.co/espejelomar/fastai-pet-breeds-classification/blob/main/config.json'.
252
+
253
+ > [!TIP]
254
+ > Raises the following error:
255
+ >
256
+ > - [`RuntimeError`](https://docs.python.org/3/library/exceptions.html#RuntimeError)
257
+ > if the config file provided is not a dictionary.
258
+ """
259
+ _check_fastai_fastcore_versions()
260
+
261
+ os.makedirs(save_directory, exist_ok=True)
262
+
263
+ # if the user provides config then we update it with the fastai and fastcore versions in CONFIG_TEMPLATE.
264
+ if config is not None:
265
+ if not isinstance(config, dict):
266
+ raise RuntimeError(f"Provided config should be a dict. Got: '{type(config)}'")
267
+ path = os.path.join(save_directory, constants.CONFIG_NAME)
268
+ with open(path, "w") as f:
269
+ json.dump(config, f)
270
+
271
+ _create_model_card(Path(save_directory))
272
+ _create_model_pyproject(Path(save_directory))
273
+
274
+ # learner.export saves the model in `self.path`.
275
+ learner.path = Path(save_directory)
276
+ os.makedirs(save_directory, exist_ok=True)
277
+ try:
278
+ learner.export(
279
+ fname="model.pkl",
280
+ pickle_protocol=DEFAULT_PROTOCOL,
281
+ )
282
+ except PicklingError:
283
+ raise PicklingError(
284
+ "You are using a lambda function, i.e., an anonymous function. `pickle`"
285
+ " cannot pickle function objects and requires that all functions have"
286
+ " names. One possible solution is to name the function."
287
+ )
288
+
289
+
290
+ @validate_hf_hub_args
291
+ def from_pretrained_fastai(
292
+ repo_id: str,
293
+ revision: Optional[str] = None,
294
+ ):
295
+ """
296
+ Load pretrained fastai model from the Hub or from a local directory.
297
+
298
+ Args:
299
+ repo_id (`str`):
300
+ The location where the pickled fastai.Learner is. It can be either of the two:
301
+ - Hosted on the Hugging Face Hub. E.g.: 'espejelomar/fastai-pet-breeds-classification' or 'distilgpt2'.
302
+ You can add a `revision` by appending `@` at the end of `repo_id`. E.g.: `dbmdz/bert-base-german-cased@main`.
303
+ Revision is the specific model version to use. Since we use a git-based system for storing models and other
304
+ artifacts on the Hugging Face Hub, it can be a branch name, a tag name, or a commit id.
305
+ - Hosted locally. `repo_id` would be a directory containing the pickle and a pyproject.toml
306
+ indicating the fastai and fastcore versions used to build the `fastai.Learner`. E.g.: `./my_model_directory/`.
307
+ revision (`str`, *optional*):
308
+ Revision at which the repo's files are downloaded. See documentation of `snapshot_download`.
309
+
310
+ Returns:
311
+ The `fastai.Learner` model in the `repo_id` repo.
312
+ """
313
+ _check_fastai_fastcore_versions()
314
+
315
+ # Load the `repo_id` repo.
316
+ # `snapshot_download` returns the folder where the model was stored.
317
+ # `cache_dir` will be the default '/root/.cache/huggingface/hub'
318
+ if not os.path.isdir(repo_id):
319
+ storage_folder = snapshot_download(
320
+ repo_id=repo_id,
321
+ revision=revision,
322
+ library_name="fastai",
323
+ library_version=get_fastai_version(),
324
+ )
325
+ else:
326
+ storage_folder = repo_id
327
+
328
+ _check_fastai_fastcore_pyproject_versions(storage_folder)
329
+
330
+ from fastai.learner import load_learner # type: ignore
331
+
332
+ return load_learner(os.path.join(storage_folder, "model.pkl"))
333
+
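+ # Illustrative sketch (editor's addition, not upstream huggingface_hub code):
+ # typical usage; the repo id is hypothetical and fastai/fastcore must be installed.
+ #
+ #     learner = from_pretrained_fastai("espejelomar/fastai-pet-breeds-classification")
+ #     prediction = learner.predict(some_input)  # `some_input` is data you provide
+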
334
+
335
+ @validate_hf_hub_args
336
+ def push_to_hub_fastai(
337
+ learner,
338
+ *,
339
+ repo_id: str,
340
+ commit_message: str = "Push FastAI model using huggingface_hub.",
341
+ private: Optional[bool] = None,
342
+ token: Optional[str] = None,
343
+ config: Optional[dict] = None,
344
+ branch: Optional[str] = None,
345
+ create_pr: Optional[bool] = None,
346
+ allow_patterns: Optional[Union[List[str], str]] = None,
347
+ ignore_patterns: Optional[Union[List[str], str]] = None,
348
+ delete_patterns: Optional[Union[List[str], str]] = None,
349
+ api_endpoint: Optional[str] = None,
350
+ ):
351
+ """
352
+ Upload learner checkpoint files to the Hub.
353
+
354
+ Use `allow_patterns` and `ignore_patterns` to precisely filter which files should be pushed to the hub. Use
355
+ `delete_patterns` to delete existing remote files in the same commit. See [`upload_folder`] reference for more
356
+ details.
357
+
358
+ Args:
359
+ learner (`Learner`):
360
+ The `fastai.Learner` you'd like to push to the Hub.
361
+ repo_id (`str`):
362
+ The repository id for your model on the Hub, in the format "namespace/repo_name". The namespace can be your individual account or an organization to which you have write access (for example, 'stanfordnlp/stanza-de').
363
+ commit_message (`str`, *optional*):
364
+ Message to commit while pushing. Defaults to `"Push FastAI model using huggingface_hub."`.
365
+ private (`bool`, *optional*):
366
+ Whether or not the repository created should be private.
367
+ If `None` (default), the repo will be public unless the organization's default is private.
368
+ token (`str`, *optional*):
369
+ The Hugging Face account token to use as HTTP bearer authorization for remote files. If `None`, you will be prompted for a token.
370
+ config (`dict`, *optional*):
371
+ Configuration object to be saved alongside the model weights.
372
+ branch (`str`, *optional*):
373
+ The git branch on which to push the model. This defaults to
374
+ the default branch as specified in your repository, which
375
+ defaults to `"main"`.
376
+ create_pr (`boolean`, *optional*):
377
+ Whether or not to create a Pull Request from `branch` with that commit.
378
+ Defaults to `False`.
379
+ api_endpoint (`str`, *optional*):
380
+ The API endpoint to use when pushing the model to the hub.
381
+ allow_patterns (`List[str]` or `str`, *optional*):
382
+ If provided, only files matching at least one pattern are pushed.
383
+ ignore_patterns (`List[str]` or `str`, *optional*):
384
+ If provided, files matching any of the patterns are not pushed.
385
+ delete_patterns (`List[str]` or `str`, *optional*):
386
+ If provided, remote files matching any of the patterns will be deleted from the repo.
387
+
388
+ Returns:
389
+ The url of the commit of your model in the given repository.
390
+
391
+ > [!TIP]
392
+ > Raises the following error:
393
+ >
394
+ > - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
395
+ > if the user is not logged in to the Hugging Face Hub.
396
+ """
397
+ _check_fastai_fastcore_versions()
398
+ api = HfApi(endpoint=api_endpoint)
399
+ repo_id = api.create_repo(repo_id=repo_id, token=token, private=private, exist_ok=True).repo_id
400
+
401
+ # Push the files to the repo in a single commit
402
+ with SoftTemporaryDirectory() as tmp:
403
+ saved_path = Path(tmp) / repo_id
404
+ _save_pretrained_fastai(learner, saved_path, config=config)
405
+ return api.upload_folder(
406
+ repo_id=repo_id,
407
+ token=token,
408
+ folder_path=saved_path,
409
+ commit_message=commit_message,
410
+ revision=branch,
411
+ create_pr=create_pr,
412
+ allow_patterns=allow_patterns,
413
+ ignore_patterns=ignore_patterns,
414
+ delete_patterns=delete_patterns,
415
+ )
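+
+ # Illustrative sketch (editor's addition, not upstream huggingface_hub code):
+ # pushing a trained learner; repo id and token are hypothetical placeholders.
+ #
+ #     url = push_to_hub_fastai(
+ #         learner,
+ #         repo_id="my-username/my-fastai-model",
+ #         commit_message="Initial model",
+ #         token="hf_xxx",
+ #     )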
venv/lib/python3.13/site-packages/huggingface_hub/file_download.py ADDED
@@ -0,0 +1,1813 @@
1
+ import copy
2
+ import errno
3
+ import inspect
4
+ import os
5
+ import re
6
+ import shutil
7
+ import stat
8
+ import time
9
+ import uuid
10
+ import warnings
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+ from typing import Any, BinaryIO, Dict, Literal, NoReturn, Optional, Tuple, Union
14
+ from urllib.parse import quote, urlparse
15
+
16
+ import requests
17
+
18
+ from . import (
19
+ __version__, # noqa: F401 # for backward compatibility
20
+ constants,
21
+ )
22
+ from ._local_folder import get_local_download_paths, read_download_metadata, write_download_metadata
23
+ from .constants import (
24
+ HUGGINGFACE_CO_URL_TEMPLATE, # noqa: F401 # for backward compatibility
25
+ HUGGINGFACE_HUB_CACHE, # noqa: F401 # for backward compatibility
26
+ )
27
+ from .errors import (
28
+ EntryNotFoundError,
29
+ FileMetadataError,
30
+ GatedRepoError,
31
+ HfHubHTTPError,
32
+ LocalEntryNotFoundError,
33
+ RepositoryNotFoundError,
34
+ RevisionNotFoundError,
35
+ )
36
+ from .utils import (
37
+ OfflineModeIsEnabled,
38
+ SoftTemporaryDirectory,
39
+ WeakFileLock,
40
+ XetFileData,
41
+ build_hf_headers,
42
+ get_fastai_version, # noqa: F401 # for backward compatibility
43
+ get_fastcore_version, # noqa: F401 # for backward compatibility
44
+ get_graphviz_version, # noqa: F401 # for backward compatibility
45
+ get_jinja_version, # noqa: F401 # for backward compatibility
46
+ get_pydot_version, # noqa: F401 # for backward compatibility
47
+ get_tf_version, # noqa: F401 # for backward compatibility
48
+ get_torch_version, # noqa: F401 # for backward compatibility
49
+ hf_raise_for_status,
50
+ is_fastai_available, # noqa: F401 # for backward compatibility
51
+ is_fastcore_available, # noqa: F401 # for backward compatibility
52
+ is_graphviz_available, # noqa: F401 # for backward compatibility
53
+ is_jinja_available, # noqa: F401 # for backward compatibility
54
+ is_pydot_available, # noqa: F401 # for backward compatibility
55
+ is_tf_available, # noqa: F401 # for backward compatibility
56
+ is_torch_available, # noqa: F401 # for backward compatibility
57
+ logging,
58
+ parse_xet_file_data_from_response,
59
+ refresh_xet_connection_info,
60
+ reset_sessions,
61
+ tqdm,
62
+ validate_hf_hub_args,
63
+ )
64
+ from .utils._http import _adjust_range_header, http_backoff
65
+ from .utils._runtime import _PY_VERSION, is_xet_available # noqa: F401 # for backward compatibility
66
+ from .utils._typing import HTTP_METHOD_T
67
+ from .utils.sha import sha_fileobj
68
+ from .utils.tqdm import _get_progress_bar_context
69
+
70
+
71
+ logger = logging.get_logger(__name__)
72
+
73
+ # Return value when trying to load a file from cache but the file does not exist in the remote repo.
74
+ _CACHED_NO_EXIST = object()
75
+ _CACHED_NO_EXIST_T = Any
76
+
77
+ # Regex to get filename from a "Content-Disposition" header for CDN-served files
78
+ HEADER_FILENAME_PATTERN = re.compile(r'filename="(?P<filename>.*?)";')
79
+
80
+ # Regex to check if the revision IS directly a commit_hash
81
+ REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
82
+
83
+ # Regex to check if the file etag IS a valid sha256
84
+ REGEX_SHA256 = re.compile(r"^[0-9a-f]{64}$")
85
+
86
+ _are_symlinks_supported_in_dir: Dict[str, bool] = {}
87
+
88
+
89
+ def are_symlinks_supported(cache_dir: Union[str, Path, None] = None) -> bool:
90
+ """Return whether the symlinks are supported on the machine.
91
+
92
+ Since symlinks support can change depending on the mounted disk, we need to check
93
+ on the precise cache folder. By default, the default HF cache directory is checked.
94
+
95
+ Args:
96
+ cache_dir (`str`, `Path`, *optional*):
97
+ Path to the folder where cached files are stored.
98
+
99
+ Returns: `bool`: Whether symlinks are supported in the directory.
100
+ """
101
+ # Defaults to HF cache
102
+ if cache_dir is None:
103
+ cache_dir = constants.HF_HUB_CACHE
104
+ cache_dir = str(Path(cache_dir).expanduser().resolve()) # make it unique
105
+
106
+ # Check symlink compatibility only once (per cache directory) at first time use
107
+ if cache_dir not in _are_symlinks_supported_in_dir:
108
+ _are_symlinks_supported_in_dir[cache_dir] = True
109
+
110
+ os.makedirs(cache_dir, exist_ok=True)
111
+ with SoftTemporaryDirectory(dir=cache_dir) as tmpdir:
112
+ src_path = Path(tmpdir) / "dummy_file_src"
113
+ src_path.touch()
114
+ dst_path = Path(tmpdir) / "dummy_file_dst"
115
+
116
+ # Relative source path as in `_create_symlink`
117
+ relative_src = os.path.relpath(src_path, start=os.path.dirname(dst_path))
118
+ try:
119
+ os.symlink(relative_src, dst_path)
120
+ except OSError:
121
+ # Likely running on Windows
122
+ _are_symlinks_supported_in_dir[cache_dir] = False
123
+
124
+ if not constants.HF_HUB_DISABLE_SYMLINKS_WARNING:
125
+ message = (
126
+ "`huggingface_hub` cache-system uses symlinks by default to"
127
+ " efficiently store duplicated files but your machine does not"
128
+ f" support them in {cache_dir}. Caching files will still work"
129
+ " but in a degraded version that might require more space on"
130
+ " your disk. This warning can be disabled by setting the"
131
+ " `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For"
132
+ " more details, see"
133
+ " https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations."
134
+ )
135
+ if os.name == "nt":
136
+ message += (
137
+ "\nTo support symlinks on Windows, you either need to"
138
+ " activate Developer Mode or to run Python as an"
139
+ " administrator. In order to activate developer mode,"
140
+ " see this article:"
141
+ " https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development"
142
+ )
143
+ warnings.warn(message)
144
+
145
+ return _are_symlinks_supported_in_dir[cache_dir]
146
+
147
+
148
+ @dataclass(frozen=True)
149
+ class HfFileMetadata:
150
+ """Data structure containing information about a file versioned on the Hub.
151
+
152
+ Returned by [`get_hf_file_metadata`] based on a URL.
153
+
154
+ Args:
155
+ commit_hash (`str`, *optional*):
156
+ The commit_hash related to the file.
157
+ etag (`str`, *optional*):
158
+ Etag of the file on the server.
159
+ location (`str`):
160
+ Location from which to download the file. Can be a Hub URL or a CDN URL.
161
+ size (`int`, *optional*):
162
+ Size of the file. In case of an LFS file, contains the size of the actual
163
+ LFS file, not the pointer.
164
+ xet_file_data (`XetFileData`, *optional*):
165
+ Xet information for the file. This is only set if the file is stored using Xet storage.
166
+ """
167
+
168
+ commit_hash: Optional[str]
169
+ etag: Optional[str]
170
+ location: str
171
+ size: Optional[int]
172
+ xet_file_data: Optional[XetFileData]
173
+
174
+
175
+ @validate_hf_hub_args
176
+ def hf_hub_url(
177
+ repo_id: str,
178
+ filename: str,
179
+ *,
180
+ subfolder: Optional[str] = None,
181
+ repo_type: Optional[str] = None,
182
+ revision: Optional[str] = None,
183
+ endpoint: Optional[str] = None,
184
+ ) -> str:
185
+ """Construct the URL of a file from the given information.
186
+
187
+ The resolved address can either be a huggingface.co-hosted URL, or a link to
188
+ Cloudfront (a Content Delivery Network, or CDN) for large files which are
189
+ more than a few MBs.
190
+
191
+ Args:
192
+ repo_id (`str`):
193
+ A namespace (user or an organization) name and a repo name separated
194
+ by a `/`.
195
+ filename (`str`):
196
+ The name of the file in the repo.
197
+ subfolder (`str`, *optional*):
198
+ An optional value corresponding to a folder inside the repo.
199
+ repo_type (`str`, *optional*):
200
+ Set to `"dataset"` or `"space"` if downloading from a dataset or space,
201
+ `None` or `"model"` if downloading from a model. Default is `None`.
202
+ revision (`str`, *optional*):
203
+ An optional Git revision id which can be a branch name, a tag, or a
204
+ commit hash.
205
+
206
+ Example:
207
+
208
+ ```python
209
+ >>> from huggingface_hub import hf_hub_url
210
+
211
+ >>> hf_hub_url(
212
+ ... repo_id="julien-c/EsperBERTo-small", filename="pytorch_model.bin"
213
+ ... )
214
+ 'https://huggingface.co/julien-c/EsperBERTo-small/resolve/main/pytorch_model.bin'
215
+ ```
216
+
217
+ > [!TIP]
218
+ > Notes:
219
+ >
220
+ > Cloudfront is replicated over the globe so downloads are way faster for
221
+ > the end user (and it also lowers our bandwidth costs).
222
+ >
223
+ > Cloudfront aggressively caches files by default (default TTL is 24
224
+ > hours), however this is not an issue here because we implement a
225
+ > git-based versioning system on huggingface.co, which means that we store
226
+ > the files on S3/Cloudfront in a content-addressable way (i.e., the file
227
+ > name is its hash). Using content-addressable filenames means cache can't
228
+ > ever be stale.
229
+ >
230
+ > In terms of client-side caching from this library, we base our caching
231
+ > on the objects' entity tag (`ETag`), which is an identifier of a
232
+ > specific version of a resource [1]_. An object's ETag is: its git-sha1
233
+ > if stored in git, or its sha256 if stored in git-lfs.
234
+
235
+ References:
236
+
237
+ - [1] https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/ETag
238
+ """
239
+ if subfolder == "":
240
+ subfolder = None
241
+ if subfolder is not None:
242
+ filename = f"{subfolder}/{filename}"
243
+
244
+ if repo_type not in constants.REPO_TYPES:
245
+ raise ValueError("Invalid repo type")
246
+
247
+ if repo_type in constants.REPO_TYPES_URL_PREFIXES:
248
+ repo_id = constants.REPO_TYPES_URL_PREFIXES[repo_type] + repo_id
249
+
250
+ if revision is None:
251
+ revision = constants.DEFAULT_REVISION
252
+ url = HUGGINGFACE_CO_URL_TEMPLATE.format(
253
+ repo_id=repo_id, revision=quote(revision, safe=""), filename=quote(filename)
254
+ )
255
+ # Update endpoint if provided
256
+ if endpoint is not None and url.startswith(constants.ENDPOINT):
257
+ url = endpoint + url[len(constants.ENDPOINT) :]
258
+ return url
259
+
260
+
261
+ def _request_wrapper(
262
+ method: HTTP_METHOD_T, url: str, *, follow_relative_redirects: bool = False, **params
263
+ ) -> requests.Response:
264
+ """Wrapper around requests methods to follow relative redirects if `follow_relative_redirects=True` even when
265
+ `allow_redirection=False`.
266
+
267
+ A backoff mechanism retries the HTTP call on 5xx errors and network errors.
268
+
269
+ Args:
270
+ method (`str`):
271
+ HTTP method, such as 'GET' or 'HEAD'.
272
+ url (`str`):
273
+ The URL of the resource to fetch.
274
+ follow_relative_redirects (`bool`, *optional*, defaults to `False`):
275
+ If True, relative redirection (redirection to the same site) will be resolved even when `allow_redirection`
276
+ kwarg is set to False. Useful when we want to follow a redirection to a renamed repository without
277
+ following redirection to a CDN.
278
+ **params (`dict`, *optional*):
279
+ Params to pass to `requests.request`.
280
+ """
281
+ # Recursively follow relative redirects
282
+ if follow_relative_redirects:
283
+ response = _request_wrapper(
284
+ method=method,
285
+ url=url,
286
+ follow_relative_redirects=False,
287
+ **params,
288
+ )
289
+
290
+ # If redirection, we redirect only relative paths.
291
+ # This is useful in case of a renamed repository.
292
+ if 300 <= response.status_code <= 399:
293
+ parsed_target = urlparse(response.headers["Location"])
294
+ if parsed_target.netloc == "":
295
+ # This means it is a relative 'Location' header, as allowed by RFC 7231.
296
+ # (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource')
297
+ # We want to follow this relative redirect!
298
+ #
299
+ # Highly inspired by `resolve_redirects` from requests library.
300
+ # See https://github.com/psf/requests/blob/main/requests/sessions.py#L159
301
+ next_url = urlparse(url)._replace(path=parsed_target.path).geturl()
302
+ return _request_wrapper(method=method, url=next_url, follow_relative_redirects=True, **params)
303
+ return response
304
+
305
+ # Perform request and return if status_code is not in the retry list.
306
+ response = http_backoff(method=method, url=url, **params)
307
+ hf_raise_for_status(response)
308
+ return response
309
+
310
+
311
+ def _get_file_length_from_http_response(response: requests.Response) -> Optional[int]:
312
+ """
313
+ Get the length of the file from the HTTP response headers.
314
+
315
+ This function extracts the file size from the HTTP response headers, either from the
316
+ `Content-Range` or `Content-Length` header, if available (in that order).
317
+
318
+ Args:
319
+ response (`requests.Response`):
320
+ The HTTP response object.
321
+
322
+ Returns:
323
+ `int` or `None`: The length of the file in bytes, or None if not available.
324
+ """
325
+
326
+ # If HTTP response contains compressed body (e.g. gzip), the `Content-Length` header will
327
+ # contain the length of the compressed body, not the uncompressed file size.
328
+ # And at the start of transmission there's no way to know the uncompressed file size for gzip,
329
+ # thus we return None in that case.
330
+ content_encoding = response.headers.get("Content-Encoding", "identity").lower()
331
+ if content_encoding != "identity":
332
+ # gzip/br/deflate/zstd etc
333
+ return None
334
+
335
+ content_range = response.headers.get("Content-Range")
336
+ if content_range is not None:
337
+ return int(content_range.rsplit("/")[-1])
338
+
339
+ content_length = response.headers.get("Content-Length")
340
+ if content_length is not None:
341
+ return int(content_length)
342
+
343
+ return None
344
+
345
+
346
+ def http_get(
347
+ url: str,
348
+ temp_file: BinaryIO,
349
+ *,
350
+ proxies: Optional[Dict] = None,
351
+ resume_size: int = 0,
352
+ headers: Optional[Dict[str, Any]] = None,
353
+ expected_size: Optional[int] = None,
354
+ displayed_filename: Optional[str] = None,
355
+ _nb_retries: int = 5,
356
+ _tqdm_bar: Optional[tqdm] = None,
357
+ ) -> None:
358
+ """
359
+ Download a remote file. Errors are not swallowed; they are raised tailored to the Hugging Face Hub.
360
+
361
+ If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely a
362
+ transient error (network outage?). We log a warning message and try to resume the download a few times before
363
+ giving up. The method gives up after 5 attempts if no new data has been received from the server.
364
+
365
+ Args:
366
+ url (`str`):
367
+ The URL of the file to download.
368
+ temp_file (`BinaryIO`):
369
+ The file-like object where to save the file.
370
+ proxies (`dict`, *optional*):
371
+ Dictionary mapping protocol to the URL of the proxy passed to `requests.request`.
372
+ resume_size (`int`, *optional*):
373
+ The number of bytes already downloaded. If set to 0 (default), the whole file is downloaded. If set to a
374
+ positive number, the download will resume at the given position.
375
+ headers (`dict`, *optional*):
376
+ Dictionary of HTTP Headers to send with the request.
377
+ expected_size (`int`, *optional*):
378
+ The expected size of the file to download. If set, the download will raise an error if the size of the
379
+ received content is different from the expected one.
380
+ displayed_filename (`str`, *optional*):
381
+ The filename of the file that is being downloaded. Value is used only to display a nice progress bar. If
382
+ not set, the filename is guessed from the URL or the `Content-Disposition` header.
383
+ """
384
+ if expected_size is not None and resume_size == expected_size:
385
+ # If the file is already fully downloaded, we don't need to download it again.
386
+ return
387
+
388
+ has_custom_range_header = headers is not None and any(h.lower() == "range" for h in headers)
389
+ hf_transfer = None
390
+ if constants.HF_HUB_ENABLE_HF_TRANSFER:
391
+ if resume_size != 0:
392
+ warnings.warn("'hf_transfer' does not support `resume_size`: falling back to regular download method")
393
+ elif proxies is not None:
394
+ warnings.warn("'hf_transfer' does not support `proxies`: falling back to regular download method")
395
+ elif has_custom_range_header:
396
+ warnings.warn("'hf_transfer' ignores custom 'Range' headers; falling back to regular download method")
397
+ else:
398
+ try:
399
+ import hf_transfer # type: ignore[no-redef]
400
+ except ImportError:
401
+ raise ValueError(
402
+ "Fast download using 'hf_transfer' is enabled"
403
+ " (HF_HUB_ENABLE_HF_TRANSFER=1) but 'hf_transfer' package is not"
404
+ " available in your environment. Try `pip install hf_transfer`."
405
+ )
406
+
407
+ initial_headers = headers
408
+ headers = copy.deepcopy(headers) or {}
409
+ if resume_size > 0:
410
+ headers["Range"] = _adjust_range_header(headers.get("Range"), resume_size)
411
+ elif expected_size and expected_size > constants.MAX_HTTP_DOWNLOAD_SIZE:
412
+ # Any file over 50GB will not be available through a basic HTTP request.
413
+ # Setting the range header to 0-0 will force the server to return the file size in the Content-Range header.
414
+ # Since hf_transfer splits the download into chunks, the process will succeed afterwards.
415
+ if hf_transfer:
416
+ headers["Range"] = "bytes=0-0"
417
+ else:
418
+ raise ValueError(
419
+ "The file is too large to be downloaded using the regular download method. Use `hf_transfer` or `hf_xet` instead."
420
+ " Try `pip install hf_transfer` or `pip install hf_xet`."
421
+ )
422
+
423
+ r = _request_wrapper(
424
+ method="GET", url=url, stream=True, proxies=proxies, headers=headers, timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT
425
+ )
426
+
427
+ hf_raise_for_status(r)
428
+ total: Optional[int] = _get_file_length_from_http_response(r)
429
+
430
+ if displayed_filename is None:
431
+ displayed_filename = url
432
+ content_disposition = r.headers.get("Content-Disposition")
433
+ if content_disposition is not None:
434
+ match = HEADER_FILENAME_PATTERN.search(content_disposition)
435
+ if match is not None:
436
+ # Means file is on CDN
437
+ displayed_filename = match.groupdict()["filename"]
438
+
439
+ # Truncate filename if too long to display
440
+ if len(displayed_filename) > 40:
441
+ displayed_filename = f"(…){displayed_filename[-40:]}"
442
+
443
+ consistency_error_message = (
444
+ f"Consistency check failed: file should be of size {expected_size} but has size"
445
+ f" {{actual_size}} ({displayed_filename}).\nThis is usually due to network issues while downloading the file."
446
+ " Please retry with `force_download=True`."
447
+ )
448
+ progress_cm = _get_progress_bar_context(
449
+ desc=displayed_filename,
450
+ log_level=logger.getEffectiveLevel(),
451
+ total=total,
452
+ initial=resume_size,
453
+ name="huggingface_hub.http_get",
454
+ _tqdm_bar=_tqdm_bar,
455
+ )
456
+
457
+ with progress_cm as progress:
458
+ if hf_transfer and total is not None and total > 5 * constants.DOWNLOAD_CHUNK_SIZE:
459
+ supports_callback = "callback" in inspect.signature(hf_transfer.download).parameters
460
+ if not supports_callback:
461
+ warnings.warn(
462
+ "You are using an outdated version of `hf_transfer`. "
463
+ "Consider upgrading to latest version to enable progress bars "
464
+ "using `pip install -U hf_transfer`."
465
+ )
466
+ try:
467
+ hf_transfer.download(
468
+ url=url,
469
+ filename=temp_file.name,
470
+ max_files=constants.HF_TRANSFER_CONCURRENCY,
471
+ chunk_size=constants.DOWNLOAD_CHUNK_SIZE,
472
+ headers=initial_headers,
473
+ parallel_failures=3,
474
+ max_retries=5,
475
+ **({"callback": progress.update} if supports_callback else {}),
476
+ )
477
+ except Exception as e:
478
+ raise RuntimeError(
479
+ "An error occurred while downloading using `hf_transfer`. Consider"
480
+ " disabling HF_HUB_ENABLE_HF_TRANSFER for better error handling."
481
+ ) from e
482
+ if not supports_callback:
483
+ progress.update(total)
484
+ if expected_size is not None and expected_size != os.path.getsize(temp_file.name):
485
+ raise EnvironmentError(
486
+ consistency_error_message.format(
487
+ actual_size=os.path.getsize(temp_file.name),
488
+ )
489
+ )
490
+ return
491
+ new_resume_size = resume_size
492
+ try:
493
+ for chunk in r.iter_content(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):
494
+ if chunk: # filter out keep-alive new chunks
495
+ progress.update(len(chunk))
496
+ temp_file.write(chunk)
497
+ new_resume_size += len(chunk)
498
+ # Some data has been downloaded from the server so we reset the number of retries.
499
+ _nb_retries = 5
500
+ except (requests.ConnectionError, requests.ReadTimeout) as e:
501
+ # If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely
502
+ # a transient error (network outage?). We log a warning message and try to resume the download a few times
503
+ # before giving up. The retry mechanism is basic but should be enough in most cases.
504
+ if _nb_retries <= 0:
505
+ logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e))
506
+ raise
507
+ logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e))
508
+ time.sleep(1)
509
+ reset_sessions() # In case of SSLError it's best to reset the shared requests.Session objects
510
+ return http_get(
511
+ url=url,
512
+ temp_file=temp_file,
513
+ proxies=proxies,
514
+ resume_size=new_resume_size,
515
+ headers=initial_headers,
516
+ expected_size=expected_size,
517
+ _nb_retries=_nb_retries - 1,
518
+ _tqdm_bar=_tqdm_bar,
519
+ )
520
+
521
+ if expected_size is not None and expected_size != temp_file.tell():
522
+ raise EnvironmentError(
523
+ consistency_error_message.format(
524
+ actual_size=temp_file.tell(),
525
+ )
526
+ )
527
+
528
+
529
+ def xet_get(
530
+ *,
531
+ incomplete_path: Path,
532
+ xet_file_data: XetFileData,
533
+ headers: Dict[str, str],
534
+ expected_size: Optional[int] = None,
535
+ displayed_filename: Optional[str] = None,
536
+ _tqdm_bar: Optional[tqdm] = None,
537
+ ) -> None:
538
+ """
539
+ Download a file using Xet storage service.
540
+
541
+ Args:
542
+ incomplete_path (`Path`):
543
+ The path to the file to download.
544
+ xet_file_data (`XetFileData`):
545
+ The file metadata needed to make the request to the xet storage service.
546
+ headers (`Dict[str, str]`):
547
+ The headers to send to the xet storage service.
548
+ expected_size (`int`, *optional*):
549
+ The expected size of the file to download. If set, the download will raise an error if the size of the
550
+ received content is different from the expected one.
551
+ displayed_filename (`str`, *optional*):
552
+ The filename of the file that is being downloaded. Value is used only to display a nice progress bar. If
553
+ not set, the filename defaults to the name of the destination path.
554
+
555
+ **How it works:**
556
+ The file download system uses Xet storage, which is a content-addressable storage system that breaks files into chunks
557
+ for efficient storage and transfer.
558
+
559
+ `hf_xet.download_files` manages downloading files by:
560
+ - Taking a list of files to download (each with its unique content hash)
561
+ - Connecting to a storage server (CAS server) that knows how files are chunked
562
+ - Using authentication to ensure secure access
563
+ - Providing progress updates during download
564
+
565
+ Authentication works by regularly refreshing access tokens through `refresh_xet_connection_info` to maintain a valid
566
+ connection to the storage server.
567
+
568
+ The download process works like this:
569
+ 1. Create a local cache folder at `~/.cache/huggingface/xet/chunk-cache` to store reusable file chunks
570
+ 2. Download files in parallel:
571
+ 2.1. Prepare to write the file to disk
572
+ 2.2. Ask the server "how is this file split into chunks?" using the file's unique hash
573
+ The server responds with:
574
+ - Which chunks make up the complete file
575
+ - Where each chunk can be downloaded from
576
+ 2.3. For each needed chunk:
577
+ - Check if we already have it in our local cache
578
+ - If not, download it from cloud storage (S3)
579
+ - Save it to cache for future use
580
+ - Assemble the chunks in order to recreate the original file
581
+
582
+ """
583
+ try:
584
+ from hf_xet import PyXetDownloadInfo, download_files # type: ignore[no-redef]
585
+ except ImportError:
586
+ raise ValueError(
587
+ "To use optimized download using Xet storage, you need to install the hf_xet package. "
588
+ 'Try `pip install "huggingface_hub[hf_xet]"` or `pip install hf_xet`.'
589
+ )
590
+
591
+ connection_info = refresh_xet_connection_info(file_data=xet_file_data, headers=headers)
592
+
593
+ def token_refresher() -> Tuple[str, int]:
594
+ connection_info = refresh_xet_connection_info(file_data=xet_file_data, headers=headers)
595
+ if connection_info is None:
596
+ raise ValueError("Failed to refresh token using xet metadata.")
597
+ return connection_info.access_token, connection_info.expiration_unix_epoch
598
+
599
+ xet_download_info = [
600
+ PyXetDownloadInfo(
601
+ destination_path=str(incomplete_path.absolute()), hash=xet_file_data.file_hash, file_size=expected_size
602
+ )
603
+ ]
604
+
605
+ if not displayed_filename:
606
+ displayed_filename = incomplete_path.name
607
+
608
+ # Truncate filename if too long to display
609
+ if len(displayed_filename) > 40:
610
+ displayed_filename = f"{displayed_filename[:40]}(…)"
611
+
612
+ progress_cm = _get_progress_bar_context(
613
+ desc=displayed_filename,
614
+ log_level=logger.getEffectiveLevel(),
615
+ total=expected_size,
616
+ initial=0,
617
+ name="huggingface_hub.xet_get",
618
+ _tqdm_bar=_tqdm_bar,
619
+ )
620
+
621
+ with progress_cm as progress:
622
+
623
+ def progress_updater(progress_bytes: float):
624
+ progress.update(progress_bytes)
625
+
626
+ download_files(
627
+ xet_download_info,
628
+ endpoint=connection_info.endpoint,
629
+ token_info=(connection_info.access_token, connection_info.expiration_unix_epoch),
630
+ token_refresher=token_refresher,
631
+ progress_updater=[progress_updater],
632
+ )
633
+
634
+
635
+ def _normalize_etag(etag: Optional[str]) -> Optional[str]:
636
+ """Normalize ETag HTTP header, so it can be used to create nice filepaths.
637
+
638
+ The HTTP spec allows two forms of ETag:
639
+ ETag: W/"<etag_value>"
640
+ ETag: "<etag_value>"
641
+
642
+ For now, we only expect the second form from the server, but we want to be future-proof so we support both. For
643
+ more context, see `TestNormalizeEtag` tests and https://github.com/huggingface/huggingface_hub/pull/1428.
644
+
645
+ Args:
646
+ etag (`str`, *optional*): HTTP header
647
+
648
+ Returns:
649
+ `str` or `None`: string that can be used as a nice directory name.
650
+ Returns `None` if input is None.
651
+ """
652
+ if etag is None:
653
+ return None
654
+ return etag.lstrip("W/").strip('"')
655
+
656
+
657
+ def _create_relative_symlink(src: str, dst: str, new_blob: bool = False) -> None:
658
+ """Alias method used in `transformers` conversion script."""
659
+ return _create_symlink(src=src, dst=dst, new_blob=new_blob)
660
+
661
+
662
+ def _create_symlink(src: str, dst: str, new_blob: bool = False) -> None:
663
+ """Create a symbolic link named dst pointing to src.
664
+
665
+ By default, it will try to create a symlink using a relative path. Relative paths have 2 advantages:
666
+ - If the cache_folder is moved (example: back-up on a shared drive), relative paths within the cache folder will
667
+ not break.
668
+ - Relative paths seem to be better handled on Windows. The issue was reported 3 times in less than a week when
669
+ changing from relative to absolute paths. See https://github.com/huggingface/huggingface_hub/issues/1398,
670
+ https://github.com/huggingface/diffusers/issues/2729 and https://github.com/huggingface/transformers/pull/22228.
671
+ NOTE: The issue with absolute paths doesn't happen in admin mode.
672
+ When creating a symlink from the cache to a local folder, it is possible that a relative path cannot be created.
673
+ This happens when paths are not on the same volume. In that case, we use absolute paths.
674
+
675
+
676
+ The result layout looks something like
677
+ └── [ 128] snapshots
678
+ ├── [ 128] 2439f60ef33a0d46d85da5001d52aeda5b00ce9f
679
+ │ ├── [ 52] README.md -> ../../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812
680
+ │ └── [ 76] pytorch_model.bin -> ../../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
681
+
682
+ If symlinks cannot be created on this platform (most likely to be Windows), the workaround is to avoid symlinks by
683
+ having the actual file in `dst`. If it is a new file (`new_blob=True`), we move it to `dst`. If it is not a new file
684
+ (`new_blob=False`), we don't know if the blob file is already referenced elsewhere. To avoid breaking existing
685
+ cache, the file is duplicated on the disk.
686
+
687
+ In case symlinks are not supported, a warning message is displayed to the user once when loading `huggingface_hub`.
688
+ The warning message can be disabled with the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable.
689
+ """
690
+ try:
691
+ os.remove(dst)
692
+ except OSError:
693
+ pass
694
+
695
+ abs_src = os.path.abspath(os.path.expanduser(src))
696
+ abs_dst = os.path.abspath(os.path.expanduser(dst))
697
+ abs_dst_folder = os.path.dirname(abs_dst)
698
+
699
+ # Use relative_dst in priority
700
+ try:
701
+ relative_src = os.path.relpath(abs_src, abs_dst_folder)
702
+ except ValueError:
703
+ # Raised on Windows if src and dst are not on the same volume. This is the case when creating a symlink to a
704
+ # local_dir instead of within the cache directory.
705
+ # See https://docs.python.org/3/library/os.path.html#os.path.relpath
706
+ relative_src = None
707
+
708
+ try:
709
+ commonpath = os.path.commonpath([abs_src, abs_dst])
710
+ _support_symlinks = are_symlinks_supported(commonpath)
711
+ except ValueError:
712
+ # Raised if src and dst are not on the same volume. Symlinks will still work on Linux/Macos.
713
+ # See https://docs.python.org/3/library/os.path.html#os.path.commonpath
714
+ _support_symlinks = os.name != "nt"
715
+ except PermissionError:
716
+ # Permission error means src and dst are not in the same volume (e.g. destination path has been provided
717
+ # by the user via `local_dir`. Let's test symlink support there)
718
+ _support_symlinks = are_symlinks_supported(abs_dst_folder)
719
+ except OSError as e:
720
+ # OS error (errno=30) means that the commonpath is readonly on Linux/MacOS.
721
+ if e.errno == errno.EROFS:
722
+ _support_symlinks = are_symlinks_supported(abs_dst_folder)
723
+ else:
724
+ raise
725
+
726
+ # Symlinks are supported => let's create a symlink.
727
+ if _support_symlinks:
728
+ src_rel_or_abs = relative_src or abs_src
729
+ logger.debug(f"Creating pointer from {src_rel_or_abs} to {abs_dst}")
730
+ try:
731
+ os.symlink(src_rel_or_abs, abs_dst)
732
+ return
733
+ except FileExistsError:
734
+ if os.path.islink(abs_dst) and os.path.realpath(abs_dst) == os.path.realpath(abs_src):
735
+ # `abs_dst` already exists and is a symlink to the `abs_src` blob. It is most likely that the file has
736
+ # been cached twice concurrently (exactly between `os.remove` and `os.symlink`). Do nothing.
737
+ return
738
+ else:
739
+ # Very unlikely to happen. Means a file `dst` has been created exactly between `os.remove` and
740
+ # `os.symlink` and is not a symlink to the `abs_src` blob file. Raise exception.
741
+ raise
742
+ except PermissionError:
743
+ # Permission error means src and dst are not in the same volume (e.g. download to local dir) and symlink
744
+ # is supported on both volumes but not between them. Let's just make a hard copy in that case.
745
+ pass
746
+
747
+ # Symlinks are not supported => let's move or copy the file.
748
+ if new_blob:
749
+ logger.info(f"Symlink not supported. Moving file from {abs_src} to {abs_dst}")
750
+ shutil.move(abs_src, abs_dst, copy_function=_copy_no_matter_what)
751
+ else:
752
+ logger.info(f"Symlink not supported. Copying file from {abs_src} to {abs_dst}")
753
+ shutil.copyfile(abs_src, abs_dst)
754
+
755
+
756
+ def _cache_commit_hash_for_specific_revision(storage_folder: str, revision: str, commit_hash: str) -> None:
757
+ """Cache reference between a revision (tag, branch or truncated commit hash) and the corresponding commit hash.
758
+
759
+ Does nothing if `revision` is already a proper `commit_hash` or reference is already cached.
760
+ """
761
+ if revision != commit_hash:
762
+ ref_path = Path(storage_folder) / "refs" / revision
763
+ ref_path.parent.mkdir(parents=True, exist_ok=True)
764
+ if not ref_path.exists() or commit_hash != ref_path.read_text():
765
+ # Update ref only if it has changed. Writing unconditionally could cause a useless error in case
766
+ # repo is already cached and user doesn't have write access to cache folder.
767
+ # See https://github.com/huggingface/huggingface_hub/issues/1216.
768
+ ref_path.write_text(commit_hash)
769
+
770
+
771
+ @validate_hf_hub_args
772
+ def repo_folder_name(*, repo_id: str, repo_type: str) -> str:
773
+ """Return a serialized version of a hf.co repo name and type, safe for disk storage
774
+ as a single non-nested folder.
775
+
776
+ Example: models--julien-c--EsperBERTo-small
777
+ """
778
+ # remove all `/` occurrences to correctly convert repo to directory name
779
+ parts = [f"{repo_type}s", *repo_id.split("/")]
780
+ return constants.REPO_ID_SEPARATOR.join(parts)
781
+
782
+
783
+ def _check_disk_space(expected_size: int, target_dir: Union[str, Path]) -> None:
784
+ """Check disk usage and log a warning if there is not enough disk space to download the file.
785
+
786
+ Args:
787
+ expected_size (`int`):
788
+ The expected size of the file in bytes.
789
+ target_dir (`str`):
790
+ The directory where the file will be stored after downloading.
791
+ """
792
+
793
+ target_dir = Path(target_dir) # format as `Path`
794
+ for path in [target_dir] + list(target_dir.parents): # first check target_dir, then each parents one by one
795
+ try:
796
+ target_dir_free = shutil.disk_usage(path).free
797
+ if target_dir_free < expected_size:
798
+ warnings.warn(
799
+ "Not enough free disk space to download the file. "
800
+ f"The expected file size is: {expected_size / 1e6:.2f} MB. "
801
+ f"The target location {target_dir} only has {target_dir_free / 1e6:.2f} MB free disk space."
802
+ )
803
+ return
804
+ except OSError: # ignore error: file does not exist or disk space cannot be checked
805
+ pass
806
+
807
+
808
+ @validate_hf_hub_args
809
+ def hf_hub_download(
810
+ repo_id: str,
811
+ filename: str,
812
+ *,
813
+ subfolder: Optional[str] = None,
814
+ repo_type: Optional[str] = None,
815
+ revision: Optional[str] = None,
816
+ library_name: Optional[str] = None,
817
+ library_version: Optional[str] = None,
818
+ cache_dir: Union[str, Path, None] = None,
819
+ local_dir: Union[str, Path, None] = None,
820
+ user_agent: Union[Dict, str, None] = None,
821
+ force_download: bool = False,
822
+ proxies: Optional[Dict] = None,
823
+ etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
824
+ token: Union[bool, str, None] = None,
825
+ local_files_only: bool = False,
826
+ headers: Optional[Dict[str, str]] = None,
827
+ endpoint: Optional[str] = None,
828
+ resume_download: Optional[bool] = None,
829
+ force_filename: Optional[str] = None,
830
+ local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
831
+ ) -> str:
832
+ """Download a given file if it's not already present in the local cache.
833
+
834
+ The new cache file layout looks like this:
835
+ - The cache directory contains one subfolder per repo_id (namespaced by repo type)
836
+ - inside each repo folder:
837
+ - refs is a list of the latest known revision => commit_hash pairs
838
+ - blobs contains the actual file blobs (identified by their git-sha or sha256, depending on
839
+ whether they're LFS files or not)
840
+ - snapshots contains one subfolder per commit, each "commit" contains the subset of the files
841
+ that have been resolved at that particular commit. Each filename is a symlink to the blob
842
+ at that particular commit.
843
+
844
+ ```
845
+ [ 96] .
846
+ └── [ 160] models--julien-c--EsperBERTo-small
847
+ ├── [ 160] blobs
848
+ │ ├── [321M] 403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
849
+ │ ├── [ 398] 7cb18dc9bafbfcf74629a4b760af1b160957a83e
850
+ │ └── [1.4K] d7edf6bd2a681fb0175f7735299831ee1b22b812
851
+ ├── [ 96] refs
852
+ │ └── [ 40] main
853
+ └── [ 128] snapshots
854
+ ├── [ 128] 2439f60ef33a0d46d85da5001d52aeda5b00ce9f
855
+ │ ├── [ 52] README.md -> ../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812
856
+ │ └── [ 76] pytorch_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
857
+ └── [ 128] bbc77c8132af1cc5cf678da3f1ddf2de43606d48
858
+ ├── [ 52] README.md -> ../../blobs/7cb18dc9bafbfcf74629a4b760af1b160957a83e
859
+ └── [ 76] pytorch_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
860
+ ```
861
+
862
+ If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this
863
+ option, the `cache_dir` will not be used and a `.cache/huggingface/` folder will be created at the root of `local_dir`
864
+ to store some metadata related to the downloaded files. While this mechanism is not as robust as the main
865
+ cache-system, it's optimized for regularly pulling the latest version of a repository.
866
+
867
+ Args:
868
+ repo_id (`str`):
869
+ A user or an organization name and a repo name separated by a `/`.
870
+ filename (`str`):
871
+ The name of the file in the repo.
872
+ subfolder (`str`, *optional*):
873
+ An optional value corresponding to a folder inside the model repo.
874
+ repo_type (`str`, *optional*):
875
+ Set to `"dataset"` or `"space"` if downloading from a dataset or space,
876
+ `None` or `"model"` if downloading from a model. Default is `None`.
877
+ revision (`str`, *optional*):
878
+ An optional Git revision id which can be a branch name, a tag, or a
879
+ commit hash.
880
+ library_name (`str`, *optional*):
881
+ The name of the library to which the object corresponds.
882
+ library_version (`str`, *optional*):
883
+ The version of the library.
884
+ cache_dir (`str`, `Path`, *optional*):
885
+ Path to the folder where cached files are stored.
886
+ local_dir (`str` or `Path`, *optional*):
887
+ If provided, the downloaded file will be placed under this directory.
888
+ user_agent (`dict`, `str`, *optional*):
889
+ The user-agent info in the form of a dictionary or a string.
890
+ force_download (`bool`, *optional*, defaults to `False`):
891
+ Whether the file should be downloaded even if it already exists in
892
+ the local cache.
893
+ proxies (`dict`, *optional*):
894
+ Dictionary mapping protocol to the URL of the proxy passed to
895
+ `requests.request`.
896
+ etag_timeout (`float`, *optional*, defaults to `10`):
897
+ When fetching ETag, how many seconds to wait for the server to send
898
+ data before giving up which is passed to `requests.request`.
899
+ token (`str`, `bool`, *optional*):
900
+ A token to be used for the download.
901
+ - If `True`, the token is read from the HuggingFace config
902
+ folder.
903
+ - If a string, it's used as the authentication token.
904
+ local_files_only (`bool`, *optional*, defaults to `False`):
905
+ If `True`, avoid downloading the file and return the path to the
906
+ local cached file if it exists.
907
+ headers (`dict`, *optional*):
908
+ Additional headers to be sent with the request.
909
+
910
+ Returns:
911
+ `str`: Local path of file or if networking is off, last version of file cached on disk.
912
+
913
+ Raises:
914
+ [`~utils.RepositoryNotFoundError`]
915
+ If the repository to download from cannot be found. This may be because it doesn't exist,
916
+ or because it is set to `private` and you do not have access.
917
+ [`~utils.RevisionNotFoundError`]
918
+ If the revision to download from cannot be found.
919
+ [`~utils.EntryNotFoundError`]
920
+ If the file to download cannot be found.
921
+ [`~utils.LocalEntryNotFoundError`]
922
+ If network is disabled or unavailable and file is not found in cache.
923
+ [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
924
+ If `token=True` but the token cannot be found.
925
+ [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError)
926
+ If ETag cannot be determined.
927
+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
928
+ If some parameter value is invalid.
929
+
930
+ """
931
+ if constants.HF_HUB_ETAG_TIMEOUT != constants.DEFAULT_ETAG_TIMEOUT:
932
+ # Respect environment variable above user value
933
+ etag_timeout = constants.HF_HUB_ETAG_TIMEOUT
934
+
935
+ if force_filename is not None:
936
+ warnings.warn(
937
+ "The `force_filename` parameter is deprecated as a new caching system, "
938
+ "which keeps the filenames as they are on the Hub, is now in place.",
939
+ FutureWarning,
940
+ )
941
+ if resume_download is not None:
942
+ warnings.warn(
943
+ "`resume_download` is deprecated and will be removed in version 1.0.0. "
944
+ "Downloads always resume when possible. "
945
+ "If you want to force a new download, use `force_download=True`.",
946
+ FutureWarning,
947
+ )
948
+
949
+ if cache_dir is None:
950
+ cache_dir = constants.HF_HUB_CACHE
951
+ if revision is None:
952
+ revision = constants.DEFAULT_REVISION
953
+ if isinstance(cache_dir, Path):
954
+ cache_dir = str(cache_dir)
955
+ if isinstance(local_dir, Path):
956
+ local_dir = str(local_dir)
957
+
958
+ if subfolder == "":
959
+ subfolder = None
960
+ if subfolder is not None:
961
+ # This is used to create a URL, and not a local path, hence the forward slash.
962
+ filename = f"{subfolder}/{filename}"
963
+
964
+ if repo_type is None:
965
+ repo_type = "model"
966
+ if repo_type not in constants.REPO_TYPES:
967
+ raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(constants.REPO_TYPES)}")
968
+
969
+ hf_headers = build_hf_headers(
970
+ token=token,
971
+ library_name=library_name,
972
+ library_version=library_version,
973
+ user_agent=user_agent,
974
+ headers=headers,
975
+ )
976
+
977
+ if local_dir is not None:
978
+ if local_dir_use_symlinks != "auto":
979
+ warnings.warn(
980
+ "`local_dir_use_symlinks` parameter is deprecated and will be ignored. "
981
+ "The process to download files to a local folder has been updated and do "
982
+ "not rely on symlinks anymore. You only need to pass a destination folder "
983
+ "as`local_dir`.\n"
984
+ "For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder."
985
+ )
986
+
987
+ return _hf_hub_download_to_local_dir(
988
+ # Destination
989
+ local_dir=local_dir,
990
+ # File info
991
+ repo_id=repo_id,
992
+ repo_type=repo_type,
993
+ filename=filename,
994
+ revision=revision,
995
+ # HTTP info
996
+ endpoint=endpoint,
997
+ etag_timeout=etag_timeout,
998
+ headers=hf_headers,
999
+ proxies=proxies,
1000
+ token=token,
1001
+ # Additional options
1002
+ cache_dir=cache_dir,
1003
+ force_download=force_download,
1004
+ local_files_only=local_files_only,
1005
+ )
1006
+ else:
1007
+ return _hf_hub_download_to_cache_dir(
1008
+ # Destination
1009
+ cache_dir=cache_dir,
1010
+ # File info
1011
+ repo_id=repo_id,
1012
+ filename=filename,
1013
+ repo_type=repo_type,
1014
+ revision=revision,
1015
+ # HTTP info
1016
+ endpoint=endpoint,
1017
+ etag_timeout=etag_timeout,
1018
+ headers=hf_headers,
1019
+ proxies=proxies,
1020
+ token=token,
1021
+ # Additional options
1022
+ local_files_only=local_files_only,
1023
+ force_download=force_download,
1024
+ )
1025
+
1026
+
1027
+ def _hf_hub_download_to_cache_dir(
1028
+ *,
1029
+ # Destination
1030
+ cache_dir: str,
1031
+ # File info
1032
+ repo_id: str,
1033
+ filename: str,
1034
+ repo_type: str,
1035
+ revision: str,
1036
+ # HTTP info
1037
+ endpoint: Optional[str],
1038
+ etag_timeout: float,
1039
+ headers: Dict[str, str],
1040
+ proxies: Optional[Dict],
1041
+ token: Optional[Union[bool, str]],
1042
+ # Additional options
1043
+ local_files_only: bool,
1044
+ force_download: bool,
1045
+ ) -> str:
1046
+ """Download a given file to a cache folder, if not already present.
1047
+
1048
+ Method should not be called directly. Please use `hf_hub_download` instead.
1049
+ """
1050
+ locks_dir = os.path.join(cache_dir, ".locks")
1051
+ storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
1052
+
1053
+ # cross platform transcription of filename, to be used as a local file path.
1054
+ relative_filename = os.path.join(*filename.split("/"))
1055
+ if os.name == "nt":
1056
+ if relative_filename.startswith("..\\") or "\\..\\" in relative_filename:
1057
+ raise ValueError(
1058
+ f"Invalid filename: cannot handle filename '{relative_filename}' on Windows. Please ask the repository"
1059
+ " owner to rename this file."
1060
+ )
1061
+
1062
+ # if user provides a commit_hash and they already have the file on disk, shortcut everything.
1063
+ if REGEX_COMMIT_HASH.match(revision):
1064
+ pointer_path = _get_pointer_path(storage_folder, revision, relative_filename)
1065
+ if os.path.exists(pointer_path) and not force_download:
1066
+ return pointer_path
1067
+
1068
+ # Try to get metadata (etag, commit_hash, url, size) from the server.
1069
+ # If we can't, a HEAD request error is returned.
1070
+ (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_error(
1071
+ repo_id=repo_id,
1072
+ filename=filename,
1073
+ repo_type=repo_type,
1074
+ revision=revision,
1075
+ endpoint=endpoint,
1076
+ proxies=proxies,
1077
+ etag_timeout=etag_timeout,
1078
+ headers=headers,
1079
+ token=token,
1080
+ local_files_only=local_files_only,
1081
+ storage_folder=storage_folder,
1082
+ relative_filename=relative_filename,
1083
+ )
1084
+
1085
+ # etag can be None for several reasons:
1086
+ # 1. we passed local_files_only.
1087
+ # 2. we don't have a connection
1088
+ # 3. Hub is down (HTTP 500, 503, 504)
1089
+ # 4. repo is not found (for example private or gated) and an invalid/missing token was sent
1090
+ # 5. Hub is blocked by a firewall or proxy is not set correctly.
1091
+ # => Try to get the last downloaded one from the specified revision.
1092
+ #
1093
+ # If the specified revision is a commit hash, look inside "snapshots".
1094
+ # If the specified revision is a branch or tag, look inside "refs".
1095
+ if head_call_error is not None:
1096
+ # Couldn't make a HEAD call => let's try to find a local file
1097
+ if not force_download:
1098
+ commit_hash = None
1099
+ if REGEX_COMMIT_HASH.match(revision):
1100
+ commit_hash = revision
1101
+ else:
1102
+ ref_path = os.path.join(storage_folder, "refs", revision)
1103
+ if os.path.isfile(ref_path):
1104
+ with open(ref_path) as f:
1105
+ commit_hash = f.read()
1106
+
1107
+ # Return pointer file if exists
1108
+ if commit_hash is not None:
1109
+ pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
1110
+ if os.path.exists(pointer_path) and not force_download:
1111
+ return pointer_path
1112
+
1113
+ # Otherwise, raise appropriate error
1114
+ _raise_on_head_call_error(head_call_error, force_download, local_files_only)
1115
+
1116
+ # From now on, etag, commit_hash, url and size are not None.
1117
+ assert etag is not None, "etag must have been retrieved from server"
1118
+ assert commit_hash is not None, "commit_hash must have been retrieved from server"
1119
+ assert url_to_download is not None, "file location must have been retrieved from server"
1120
+ assert expected_size is not None, "expected_size must have been retrieved from server"
1121
+ blob_path = os.path.join(storage_folder, "blobs", etag)
1122
+ pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
1123
+
1124
+ os.makedirs(os.path.dirname(blob_path), exist_ok=True)
1125
+ os.makedirs(os.path.dirname(pointer_path), exist_ok=True)
1126
+
1127
+ # if passed revision is not identical to commit_hash
1128
+ # then revision has to be a branch name or tag name.
1129
+ # In that case store a ref.
1130
+ _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
1131
+
1132
+ # Prevent parallel downloads of the same file with a lock.
1133
+ # etag could be duplicated across repos, so the lock is namespaced by repo folder.
1134
+ lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock")
1135
+
1136
+ # Some Windows versions do not allow for paths longer than 255 characters.
1137
+ # In this case, we must specify it as an extended path by using the "\\?\" prefix.
1138
+ if (
1139
+ os.name == "nt"
1140
+ and len(os.path.abspath(lock_path)) > 255
1141
+ and not os.path.abspath(lock_path).startswith("\\\\?\\")
1142
+ ):
1143
+ lock_path = "\\\\?\\" + os.path.abspath(lock_path)
1144
+
1145
+ if (
1146
+ os.name == "nt"
1147
+ and len(os.path.abspath(blob_path)) > 255
1148
+ and not os.path.abspath(blob_path).startswith("\\\\?\\")
1149
+ ):
1150
+ blob_path = "\\\\?\\" + os.path.abspath(blob_path)
1151
+
1152
+ Path(lock_path).parent.mkdir(parents=True, exist_ok=True)
1153
+
1154
+ # pointer already exists -> immediate return
1155
+ if not force_download and os.path.exists(pointer_path):
1156
+ return pointer_path
1157
+
1158
+ # Blob exists but pointer must be (safely) created -> take the lock
1159
+ if not force_download and os.path.exists(blob_path):
1160
+ with WeakFileLock(lock_path):
1161
+ if not os.path.exists(pointer_path):
1162
+ _create_symlink(blob_path, pointer_path, new_blob=False)
1163
+ return pointer_path
1164
+
1165
+ # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)
1166
+
1167
+ with WeakFileLock(lock_path):
1168
+ _download_to_tmp_and_move(
1169
+ incomplete_path=Path(blob_path + ".incomplete"),
1170
+ destination_path=Path(blob_path),
1171
+ url_to_download=url_to_download,
1172
+ proxies=proxies,
1173
+ headers=headers,
1174
+ expected_size=expected_size,
1175
+ filename=filename,
1176
+ force_download=force_download,
1177
+ etag=etag,
1178
+ xet_file_data=xet_file_data,
1179
+ )
1180
+ if not os.path.exists(pointer_path):
1181
+ _create_symlink(blob_path, pointer_path, new_blob=True)
1182
+
1183
+ return pointer_path
1184
+
1185
+
1186
+ def _hf_hub_download_to_local_dir(
+     *,
+     # Destination
+     local_dir: Union[str, Path],
+     # File info
+     repo_id: str,
+     repo_type: str,
+     filename: str,
+     revision: str,
+     # HTTP info
+     endpoint: Optional[str],
+     etag_timeout: float,
+     headers: Dict[str, str],
+     proxies: Optional[Dict],
+     token: Union[bool, str, None],
+     # Additional options
+     cache_dir: str,
+     force_download: bool,
+     local_files_only: bool,
+ ) -> str:
+     """Download a given file to a local folder, if not already present.
+
+     Method should not be called directly. Please use `hf_hub_download` instead.
+     """
+     # Some Windows versions do not allow for paths longer than 255 characters.
+     # In this case, we must specify it as an extended path by using the "\\?\" prefix.
+     if os.name == "nt" and len(os.path.abspath(local_dir)) > 255:
+         local_dir = "\\\\?\\" + os.path.abspath(local_dir)
+     local_dir = Path(local_dir)
+     paths = get_local_download_paths(local_dir=local_dir, filename=filename)
+     local_metadata = read_download_metadata(local_dir=local_dir, filename=filename)
+
+     # Local file exists + metadata exists + commit_hash matches => return file
+     if (
+         not force_download
+         and REGEX_COMMIT_HASH.match(revision)
+         and paths.file_path.is_file()
+         and local_metadata is not None
+         and local_metadata.commit_hash == revision
+     ):
+         return str(paths.file_path)
+
+     # Local file doesn't exist or commit_hash doesn't match => we need the etag
+     (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_error(
+         repo_id=repo_id,
+         filename=filename,
+         repo_type=repo_type,
+         revision=revision,
+         endpoint=endpoint,
+         proxies=proxies,
+         etag_timeout=etag_timeout,
+         headers=headers,
+         token=token,
+         local_files_only=local_files_only,
+     )
+
+     if head_call_error is not None:
+         # No HEAD call but local file exists => default to local file
+         if not force_download and paths.file_path.is_file():
+             logger.warning(
+                 f"Couldn't access the Hub to check for update but local file already exists. Defaulting to existing file. (error: {head_call_error})"
+             )
+             return str(paths.file_path)
+         # Otherwise => raise
+         _raise_on_head_call_error(head_call_error, force_download, local_files_only)
+
+     # From now on, etag, commit_hash, url and size are not None.
+     assert etag is not None, "etag must have been retrieved from server"
+     assert commit_hash is not None, "commit_hash must have been retrieved from server"
+     assert url_to_download is not None, "file location must have been retrieved from server"
+     assert expected_size is not None, "expected_size must have been retrieved from server"
+
+     # Local file exists => check if it's up-to-date
+     if not force_download and paths.file_path.is_file():
+         # etag matches => update metadata and return file
+         if local_metadata is not None and local_metadata.etag == etag:
+             write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+             return str(paths.file_path)
+
+         # metadata is outdated + etag is a sha256
+         # => means it's an LFS file (large)
+         # => let's compute local hash and compare
+         # => if match, update metadata and return file
+         if local_metadata is None and REGEX_SHA256.match(etag) is not None:
+             with open(paths.file_path, "rb") as f:
+                 file_hash = sha_fileobj(f).hex()
+             if file_hash == etag:
+                 write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+                 return str(paths.file_path)
+
+     # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)
+
+     # If we are lucky enough, the file is already in the cache => copy it
+     if not force_download:
+         cached_path = try_to_load_from_cache(
+             repo_id=repo_id,
+             filename=filename,
+             cache_dir=cache_dir,
+             revision=commit_hash,
+             repo_type=repo_type,
+         )
+         if isinstance(cached_path, str):
+             with WeakFileLock(paths.lock_path):
+                 paths.file_path.parent.mkdir(parents=True, exist_ok=True)
+                 shutil.copyfile(cached_path, paths.file_path)
+             write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+             return str(paths.file_path)
+
+     # Otherwise, let's download the file!
+     with WeakFileLock(paths.lock_path):
+         paths.file_path.unlink(missing_ok=True)  # delete outdated file first
+         _download_to_tmp_and_move(
+             incomplete_path=paths.incomplete_path(etag),
+             destination_path=paths.file_path,
+             url_to_download=url_to_download,
+             proxies=proxies,
+             headers=headers,
+             expected_size=expected_size,
+             filename=filename,
+             force_download=force_download,
+             etag=etag,
+             xet_file_data=xet_file_data,
+         )
+
+     write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+     return str(paths.file_path)
+
+
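This helper is only reached through the public `hf_hub_download` entry point. A minimal sketch of how that looks from user code, assuming the current public API; the repo id, filename, and destination folder are illustrative placeholders:

```python
from huggingface_hub import hf_hub_download

# Downloads into ./my-model/config.json and records commit_hash/etag metadata
# in a hidden cache folder inside local_dir, so later calls can skip the
# network when the file is already up to date.
path = hf_hub_download(
    repo_id="gpt2",          # placeholder repo
    filename="config.json",  # placeholder file
    local_dir="./my-model",  # placeholder destination
)
print(path)
```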
+ @validate_hf_hub_args
+ def try_to_load_from_cache(
+     repo_id: str,
+     filename: str,
+     cache_dir: Union[str, Path, None] = None,
+     revision: Optional[str] = None,
+     repo_type: Optional[str] = None,
+ ) -> Union[str, _CACHED_NO_EXIST_T, None]:
+     """
+     Explores the cache to return the latest cached file for a given revision if found.
+
+     This function will not raise any exception if the file is not cached.
+
+     Args:
+         cache_dir (`str` or `os.PathLike`):
+             The folder where the cached files lie.
+         repo_id (`str`):
+             The ID of the repo on huggingface.co.
+         filename (`str`):
+             The filename to look for inside `repo_id`.
+         revision (`str`, *optional*):
+             The specific model version to use. Will default to `"main"` if it's not provided and no `commit_hash` is
+             provided either.
+         repo_type (`str`, *optional*):
+             The type of the repository. Will default to `"model"`.
+
+     Returns:
+         `Optional[str]` or `_CACHED_NO_EXIST`:
+             Will return `None` if the file was not cached. Otherwise:
+             - The exact path to the cached file if it's found in the cache
+             - A special value `_CACHED_NO_EXIST` if the file does not exist at the given commit hash and this fact was
+               cached.
+
+     Example:
+
+     ```python
+     from huggingface_hub import try_to_load_from_cache, _CACHED_NO_EXIST
+
+     filepath = try_to_load_from_cache(repo_id="gpt2", filename="config.json")
+     if isinstance(filepath, str):
+         # file exists and is cached
+         ...
+     elif filepath is _CACHED_NO_EXIST:
+         # non-existence of file is cached
+         ...
+     else:
+         # file is not cached
+         ...
+     ```
+     """
+     if revision is None:
+         revision = "main"
+     if repo_type is None:
+         repo_type = "model"
+     if repo_type not in constants.REPO_TYPES:
+         raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(constants.REPO_TYPES)}")
+     if cache_dir is None:
+         cache_dir = constants.HF_HUB_CACHE
+
+     object_id = repo_id.replace("/", "--")
+     repo_cache = os.path.join(cache_dir, f"{repo_type}s--{object_id}")
+     if not os.path.isdir(repo_cache):
+         # No cache for this model
+         return None
+
+     refs_dir = os.path.join(repo_cache, "refs")
+     snapshots_dir = os.path.join(repo_cache, "snapshots")
+     no_exist_dir = os.path.join(repo_cache, ".no_exist")
+
+     # Resolve refs (for instance to convert main to the associated commit sha)
+     if os.path.isdir(refs_dir):
+         revision_file = os.path.join(refs_dir, revision)
+         if os.path.isfile(revision_file):
+             with open(revision_file) as f:
+                 revision = f.read()
+
+     # Check if file is cached as "no_exist"
+     if os.path.isfile(os.path.join(no_exist_dir, revision, filename)):
+         return _CACHED_NO_EXIST
+
+     # Check if revision folder exists
+     if not os.path.exists(snapshots_dir):
+         return None
+     cached_shas = os.listdir(snapshots_dir)
+     if revision not in cached_shas:
+         # No cache for this revision and we won't try to return a random revision
+         return None
+
+     # Check if file exists in cache
+     cached_file = os.path.join(snapshots_dir, revision, filename)
+     return cached_file if os.path.isfile(cached_file) else None
+
+
+ @validate_hf_hub_args
+ def get_hf_file_metadata(
+     url: str,
+     token: Union[bool, str, None] = None,
+     proxies: Optional[Dict] = None,
+     timeout: Optional[float] = constants.DEFAULT_REQUEST_TIMEOUT,
+     library_name: Optional[str] = None,
+     library_version: Optional[str] = None,
+     user_agent: Union[Dict, str, None] = None,
+     headers: Optional[Dict[str, str]] = None,
+     endpoint: Optional[str] = None,
+ ) -> HfFileMetadata:
+     """Fetch metadata of a file versioned on the Hub for a given url.
+
+     Args:
+         url (`str`):
+             File url, for example returned by [`hf_hub_url`].
+         token (`str` or `bool`, *optional*):
+             A token to be used for the download.
+                 - If `True`, the token is read from the HuggingFace config folder.
+                 - If `False` or `None`, no token is provided.
+                 - If a string, it's used as the authentication token.
+         proxies (`dict`, *optional*):
+             Dictionary mapping protocol to the URL of the proxy passed to `requests.request`.
+         timeout (`float`, *optional*, defaults to 10):
+             How many seconds to wait for the server to send metadata before giving up.
+         library_name (`str`, *optional*):
+             The name of the library to which the object corresponds.
+         library_version (`str`, *optional*):
+             The version of the library.
+         user_agent (`dict`, `str`, *optional*):
+             The user-agent info in the form of a dictionary or a string.
+         headers (`dict`, *optional*):
+             Additional headers to be sent with the request.
+         endpoint (`str`, *optional*):
+             Endpoint of the Hub. Defaults to <https://huggingface.co>.
+
+     Returns:
+         A [`HfFileMetadata`] object containing metadata such as location, etag, size and
+         commit_hash.
+     """
+     hf_headers = build_hf_headers(
+         token=token,
+         library_name=library_name,
+         library_version=library_version,
+         user_agent=user_agent,
+         headers=headers,
+     )
+     hf_headers["Accept-Encoding"] = "identity"  # prevent any compression => we want to know the real size of the file
+
+     # Retrieve metadata
+     r = _request_wrapper(
+         method="HEAD",
+         url=url,
+         headers=hf_headers,
+         allow_redirects=False,
+         follow_relative_redirects=True,
+         proxies=proxies,
+         timeout=timeout,
+     )
+     hf_raise_for_status(r)
+
+     # Return
+     return HfFileMetadata(
+         commit_hash=r.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT),
+         # We favor a custom header indicating the etag of the linked resource, and
+         # we fall back to the regular etag header.
+         etag=_normalize_etag(r.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_ETAG) or r.headers.get("ETag")),
+         # Either from response headers (if redirected) or defaults to request url.
+         # Do not use `url` directly, as `_request_wrapper` might have followed relative redirects.
+         location=r.headers.get("Location") or r.request.url,  # type: ignore
+         size=_int_or_none(
+             r.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_SIZE) or r.headers.get("Content-Length")
+         ),
+         xet_file_data=parse_xet_file_data_from_response(r, endpoint=endpoint),  # type: ignore
+     )
+
+
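A minimal usage sketch for this function, pairing it with `hf_hub_url` as the docstring suggests; the repo and filename are illustrative:

```python
from huggingface_hub import get_hf_file_metadata, hf_hub_url

# Build the resolve URL for a file, then issue the HEAD request.
url = hf_hub_url(repo_id="gpt2", filename="config.json")  # placeholder repo/file
metadata = get_hf_file_metadata(url)
print(metadata.commit_hash, metadata.etag, metadata.size)
```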
+ def _get_metadata_or_catch_error(
+     *,
+     repo_id: str,
+     filename: str,
+     repo_type: str,
+     revision: str,
+     endpoint: Optional[str],
+     proxies: Optional[Dict],
+     etag_timeout: Optional[float],
+     headers: Dict[str, str],  # mutated in place!
+     token: Union[bool, str, None],
+     local_files_only: bool,
+     relative_filename: Optional[str] = None,  # only used to store `.no_exist` in cache
+     storage_folder: Optional[str] = None,  # only used to store `.no_exist` in cache
+ ) -> Union[
+     # Either an exception is caught and returned
+     Tuple[None, None, None, None, None, Exception],
+     # Or the metadata is returned as
+     # `(url_to_download, etag, commit_hash, expected_size, xet_file_data, None)`
+     Tuple[str, str, str, int, Optional[XetFileData], None],
+ ]:
+     """Get metadata for a file on the Hub, safely handling network issues.
+
+     Returns either the etag, commit_hash and expected size of the file, or the error
+     raised while fetching the metadata.
+
+     NOTE: This function mutates `headers` in place! It removes the `authorization` header
+     if the file is an LFS blob and the domain of the url is different from the
+     domain of the location (typically an S3 bucket).
+     """
+     if local_files_only:
+         return (
+             None,
+             None,
+             None,
+             None,
+             None,
+             OfflineModeIsEnabled(
+                 f"Cannot access file since 'local_files_only=True' has been set. (repo_id: {repo_id}, repo_type: {repo_type}, revision: {revision}, filename: {filename})"
+             ),
+         )
+
+     url = hf_hub_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint)
+     url_to_download: str = url
+     etag: Optional[str] = None
+     commit_hash: Optional[str] = None
+     expected_size: Optional[int] = None
+     head_error_call: Optional[Exception] = None
+     xet_file_data: Optional[XetFileData] = None
+
+     # Try to get metadata from the server.
+     # Do not raise yet if the file is not found or not accessible.
+     if not local_files_only:
+         try:
+             try:
+                 metadata = get_hf_file_metadata(
+                     url=url, proxies=proxies, timeout=etag_timeout, headers=headers, token=token, endpoint=endpoint
+                 )
+             except EntryNotFoundError as http_error:
+                 if storage_folder is not None and relative_filename is not None:
+                     # Cache the non-existence of the file
+                     commit_hash = http_error.response.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT)
+                     if commit_hash is not None:
+                         no_exist_file_path = Path(storage_folder) / ".no_exist" / commit_hash / relative_filename
+                         try:
+                             no_exist_file_path.parent.mkdir(parents=True, exist_ok=True)
+                             no_exist_file_path.touch()
+                         except OSError as e:
+                             logger.error(
+                                 f"Could not cache non-existence of file. Will ignore error and continue. Error: {e}"
+                             )
+                         _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
+                 raise
+
+             # Commit hash must exist
+             commit_hash = metadata.commit_hash
+             if commit_hash is None:
+                 raise FileMetadataError(
+                     "Distant resource does not seem to be on huggingface.co. It is possible that a configuration issue"
+                     " prevents you from downloading resources from https://huggingface.co. Please check your firewall"
+                     " and proxy settings and make sure your SSL certificates are updated."
+                 )
+
+             # Etag must exist.
+             # If we don't have any of those, raise an error.
+             etag = metadata.etag
+             if etag is None:
+                 raise FileMetadataError(
+                     "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
+                 )
+
+             # Size must exist
+             expected_size = metadata.size
+             if expected_size is None:
+                 raise FileMetadataError("Distant resource does not have a Content-Length.")
+
+             xet_file_data = metadata.xet_file_data
+
+             # In case of a redirect, save an extra redirect on the request.get call,
+             # and ensure we download the exact atomic version even if it changed
+             # between the HEAD and the GET (unlikely, but hey).
+             #
+             # If url domain is different => we are downloading from a CDN => url is signed => don't send auth
+             # If url domain is the same => redirect due to repo rename AND downloading a regular file => keep auth
+             if xet_file_data is None and url != metadata.location:
+                 url_to_download = metadata.location
+                 if urlparse(url).netloc != urlparse(metadata.location).netloc:
+                     # Remove authorization header when downloading an LFS blob
+                     headers.pop("authorization", None)
+         except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
+             # Actually raise for those subclasses of ConnectionError
+             raise
+         except (
+             requests.exceptions.ConnectionError,
+             requests.exceptions.Timeout,
+             OfflineModeIsEnabled,
+         ) as error:
+             # Otherwise, our Internet connection is down.
+             # etag is None
+             head_error_call = error
+         except (RevisionNotFoundError, EntryNotFoundError):
+             # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted)
+             raise
+         except requests.HTTPError as error:
+             # Multiple reasons for an http error:
+             # - Repository is private and invalid/missing token sent
+             # - Repository is gated and invalid/missing token sent
+             # - Hub is down (error 500 or 504)
+             # => let's switch to 'local_files_only=True' to check if the files are already cached.
+             #    (if it's not the case, the error will be re-raised)
+             head_error_call = error
+         except FileMetadataError as error:
+             # Multiple reasons for a FileMetadataError:
+             # - Wrong network configuration (proxy, firewall, SSL certificates)
+             # - Inconsistency on the Hub
+             # => let's switch to 'local_files_only=True' to check if the files are already cached.
+             #    (if it's not the case, the error will be re-raised)
+             head_error_call = error
+
+     if not (local_files_only or etag is not None or head_error_call is not None):
+         raise RuntimeError("etag is empty due to uncovered problems")
+
+     return (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_error_call)  # type: ignore [return-value]
+
+
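The auth-header rule described in the comments above reduces to a host comparison. A standalone sketch of that rule, with illustrative URLs and a placeholder token (none of these names come from the module itself):

```python
from urllib.parse import urlparse

def should_keep_auth(request_url: str, resolved_location: str) -> bool:
    # Same host => redirect within the Hub (e.g. a repo rename) => keep the token.
    # Different host => signed CDN/S3 URL => the token must not be forwarded.
    return urlparse(request_url).netloc == urlparse(resolved_location).netloc

headers = {"authorization": "Bearer hf_xxx"}  # placeholder token
if not should_keep_auth(
    "https://huggingface.co/gpt2/resolve/main/model.bin",
    "https://cdn-lfs.huggingface.co/signed/abc123",
):
    headers.pop("authorization", None)  # hosts differ => strip auth
```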
+ def _raise_on_head_call_error(head_call_error: Exception, force_download: bool, local_files_only: bool) -> NoReturn:
+     """Raise an appropriate error when the HEAD call failed and we cannot locate a local file."""
+     # No head call => we cannot force download.
+     if force_download:
+         if local_files_only:
+             raise ValueError("Cannot pass 'force_download=True' and 'local_files_only=True' at the same time.")
+         elif isinstance(head_call_error, OfflineModeIsEnabled):
+             raise ValueError("Cannot pass 'force_download=True' when offline mode is enabled.") from head_call_error
+         else:
+             raise ValueError("Force download failed due to the above error.") from head_call_error
+
+     # No head call + couldn't find an appropriate file on disk => raise an error.
+     if local_files_only:
+         raise LocalEntryNotFoundError(
+             "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable"
+             " hf.co look-ups and downloads online, set 'local_files_only' to False."
+         )
+     elif isinstance(head_call_error, (RepositoryNotFoundError, GatedRepoError)) or (
+         isinstance(head_call_error, HfHubHTTPError) and head_call_error.response.status_code == 401
+     ):
+         # Repo not found or gated => let's raise the actual error
+         # Unauthorized => likely a token issue => let's raise the actual error
+         raise head_call_error
+     else:
+         # Otherwise: most likely a connection issue or Hub downtime => let's warn the user
+         raise LocalEntryNotFoundError(
+             "An error happened while trying to locate the file on the Hub and we cannot find the requested files"
+             " in the local cache. Please check your connection and try again or make sure your Internet connection"
+             " is on."
+         ) from head_call_error
+
+
+ def _download_to_tmp_and_move(
+     incomplete_path: Path,
+     destination_path: Path,
+     url_to_download: str,
+     proxies: Optional[Dict],
+     headers: Dict[str, str],
+     expected_size: Optional[int],
+     filename: str,
+     force_download: bool,
+     etag: Optional[str],
+     xet_file_data: Optional[XetFileData],
+ ) -> None:
+     """Download content from a URL to a destination path.
+
+     Internal logic:
+     - return early if file is already downloaded
+     - resume download if possible (from incomplete file)
+     - do not resume download if `force_download=True` or `HF_HUB_ENABLE_HF_TRANSFER=True`
+     - check disk space before downloading
+     - download content to a temporary file
+     - set correct permissions on temporary file
+     - move the temporary file to the destination path
+
+     Both `incomplete_path` and `destination_path` must be on the same volume to avoid a local copy.
+     """
+     if destination_path.exists() and not force_download:
+         # Do nothing if already exists (except if force_download=True)
+         return
+
+     if incomplete_path.exists() and (force_download or (constants.HF_HUB_ENABLE_HF_TRANSFER and not proxies)):
+         # By default, we will try to resume the download if possible.
+         # However, if the user has set `force_download=True` or if `hf_transfer` is enabled, then we should
+         # not resume the download => delete the incomplete file.
+         message = f"Removing incomplete file '{incomplete_path}'"
+         if force_download:
+             message += " (force_download=True)"
+         elif constants.HF_HUB_ENABLE_HF_TRANSFER and not proxies:
+             message += " (hf_transfer=True)"
+         logger.info(message)
+         incomplete_path.unlink(missing_ok=True)
+
+     with incomplete_path.open("ab") as f:
+         resume_size = f.tell()
+         message = f"Downloading '{filename}' to '{incomplete_path}'"
+         if resume_size > 0 and expected_size is not None:
+             message += f" (resume from {resume_size}/{expected_size})"
+         logger.info(message)
+
+         if expected_size is not None:  # might be None if HTTP header not set correctly
+             # Check disk space in both tmp and destination path
+             _check_disk_space(expected_size, incomplete_path.parent)
+             _check_disk_space(expected_size, destination_path.parent)
+
+         if xet_file_data is not None and is_xet_available():
+             logger.debug("Xet Storage is enabled for this repo. Downloading file from Xet Storage...")
+             xet_get(
+                 incomplete_path=incomplete_path,
+                 xet_file_data=xet_file_data,
+                 headers=headers,
+                 expected_size=expected_size,
+                 displayed_filename=filename,
+             )
+         else:
+             if xet_file_data is not None and not constants.HF_HUB_DISABLE_XET:
+                 logger.warning(
+                     "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. "
+                     "Falling back to regular HTTP download. "
+                     "For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`"
+                 )
+
+             http_get(
+                 url_to_download,
+                 f,
+                 proxies=proxies,
+                 resume_size=resume_size,
+                 headers=headers,
+                 expected_size=expected_size,
+             )
+
+     logger.info(f"Download complete. Moving file to {destination_path}")
+     _chmod_and_move(incomplete_path, destination_path)
+
+
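The resume behavior hinges on opening the incomplete file in append mode and using `f.tell()` as the byte offset. A minimal standalone sketch of the same pattern with plain `requests` (the URL is a placeholder, and a robust version would also verify that the server actually honors range requests via the response status and headers):

```python
import requests

def resume_download(url: str, incomplete_path: str) -> None:
    # Append-binary mode: f.tell() is the number of bytes already on disk,
    # which becomes the starting offset for the Range header.
    with open(incomplete_path, "ab") as f:
        resume_size = f.tell()
        headers = {"Range": f"bytes={resume_size}-"} if resume_size > 0 else {}
        with requests.get(url, headers=headers, stream=True, timeout=10) as r:
            r.raise_for_status()
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
```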
+ def _int_or_none(value: Optional[str]) -> Optional[int]:
+     try:
+         return int(value)  # type: ignore
+     except (TypeError, ValueError):
+         return None
+
+
+ def _chmod_and_move(src: Path, dst: Path) -> None:
+     """Set correct permission before moving a blob from tmp directory to cache dir.
+
+     Do not take into account the `umask` from the process as there is no convenient way
+     to get it that is thread-safe.
+
+     See:
+     - About umask: https://docs.python.org/3/library/os.html#os.umask
+     - Thread-safety: https://stackoverflow.com/a/70343066
+     - About solution: https://github.com/huggingface/huggingface_hub/pull/1220#issuecomment-1326211591
+     - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1141
+     - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1215
+     """
+     # Get umask by creating a temporary file in the cached repo folder.
+     tmp_file = dst.parent.parent / f"tmp_{uuid.uuid4()}"
+     try:
+         tmp_file.touch()
+         cache_dir_mode = Path(tmp_file).stat().st_mode
+         os.chmod(str(src), stat.S_IMODE(cache_dir_mode))
+     except OSError as e:
+         logger.warning(
+             f"Could not set the permissions on the file '{src}'. Error: {e}.\nContinuing without setting permissions."
+         )
+     finally:
+         try:
+             tmp_file.unlink()
+         except OSError:
+             # fails if `tmp_file.touch()` failed => do nothing
+             # See https://github.com/huggingface/huggingface_hub/issues/2359
+             pass
+
+     shutil.move(str(src), str(dst), copy_function=_copy_no_matter_what)
+
+
+ def _copy_no_matter_what(src: str, dst: str) -> None:
+     """Copy file from src to dst.
+
+     If `shutil.copy2` fails, fall back to `shutil.copyfile`.
+     """
+     try:
+         # Copy file with metadata and permission
+         # Can fail e.g. if dst is an S3 mount
+         shutil.copy2(src, dst)
+     except OSError:
+         # Copy only file content
+         shutil.copyfile(src, dst)
+
+
+ def _get_pointer_path(storage_folder: str, revision: str, relative_filename: str) -> str:
+     # Using `os.path.abspath` instead of `Path.resolve()` to avoid resolving symlinks
+     snapshot_path = os.path.join(storage_folder, "snapshots")
+     pointer_path = os.path.join(snapshot_path, revision, relative_filename)
+     if Path(os.path.abspath(snapshot_path)) not in Path(os.path.abspath(pointer_path)).parents:
+         raise ValueError(
+             "Invalid pointer path: cannot create pointer path in snapshot folder if"
+             f" `storage_folder='{storage_folder}'`, `revision='{revision}'` and"
+             f" `relative_filename='{relative_filename}'`."
+         )
+     return pointer_path
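The `parents` check above acts as a path-traversal guard: the pointer must resolve to a path strictly inside the snapshots folder. A quick illustration with made-up paths:

```python
# Normal case: the pointer stays inside the snapshots folder.
_get_pointer_path("/cache/models--gpt2", "abc123", "config.json")
# => "/cache/models--gpt2/snapshots/abc123/config.json"

# A filename escaping the snapshot folder resolves outside it => ValueError.
# "/cache/models--gpt2/snapshots/abc123/../../etc/passwd" normalizes to
# "/cache/models--gpt2/etc/passwd", whose parents do not include the
# snapshots folder, so the guard raises.
_get_pointer_path("/cache/models--gpt2", "abc123", "../../etc/passwd")
```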