Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes.
- venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/INSTALLER +1 -0
- venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/METADATA +78 -0
- venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/RECORD +14 -0
- venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/WHEEL +5 -0
- venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/top_level.txt +1 -0
- venv/lib/python3.13/site-packages/certifi/__init__.py +4 -0
- venv/lib/python3.13/site-packages/certifi/__main__.py +12 -0
- venv/lib/python3.13/site-packages/certifi/cacert.pem +0 -0
- venv/lib/python3.13/site-packages/certifi/core.py +83 -0
- venv/lib/python3.13/site-packages/certifi/py.typed +0 -0
- venv/lib/python3.13/site-packages/charset_normalizer/__init__.py +48 -0
- venv/lib/python3.13/site-packages/charset_normalizer/__main__.py +6 -0
- venv/lib/python3.13/site-packages/charset_normalizer/api.py +669 -0
- venv/lib/python3.13/site-packages/charset_normalizer/cd.py +395 -0
- venv/lib/python3.13/site-packages/charset_normalizer/constant.py +2015 -0
- venv/lib/python3.13/site-packages/charset_normalizer/legacy.py +80 -0
- venv/lib/python3.13/site-packages/charset_normalizer/md.cpython-313-x86_64-linux-gnu.so +0 -0
- venv/lib/python3.13/site-packages/charset_normalizer/md.py +635 -0
- venv/lib/python3.13/site-packages/charset_normalizer/models.py +360 -0
- venv/lib/python3.13/site-packages/charset_normalizer/py.typed +0 -0
- venv/lib/python3.13/site-packages/charset_normalizer/utils.py +414 -0
- venv/lib/python3.13/site-packages/charset_normalizer/version.py +8 -0
- venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/INSTALLER +1 -0
- venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/METADATA +42 -0
- venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/RECORD +24 -0
- venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/WHEEL +4 -0
- venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/INSTALLER +1 -0
- venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/METADATA +87 -0
- venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/RECORD +8 -0
- venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/WHEEL +4 -0
- venv/lib/python3.13/site-packages/huggingface_hub/__init__.py +1554 -0
- venv/lib/python3.13/site-packages/huggingface_hub/_commit_api.py +968 -0
- venv/lib/python3.13/site-packages/huggingface_hub/_commit_scheduler.py +350 -0
- venv/lib/python3.13/site-packages/huggingface_hub/_inference_endpoints.py +413 -0
- venv/lib/python3.13/site-packages/huggingface_hub/_jobs_api.py +301 -0
- venv/lib/python3.13/site-packages/huggingface_hub/_local_folder.py +447 -0
- venv/lib/python3.13/site-packages/huggingface_hub/_login.py +514 -0
- venv/lib/python3.13/site-packages/huggingface_hub/_oauth.py +460 -0
- venv/lib/python3.13/site-packages/huggingface_hub/_snapshot_download.py +343 -0
- venv/lib/python3.13/site-packages/huggingface_hub/_space_api.py +168 -0
- venv/lib/python3.13/site-packages/huggingface_hub/_tensorboard_logger.py +190 -0
- venv/lib/python3.13/site-packages/huggingface_hub/_upload_large_folder.py +755 -0
- venv/lib/python3.13/site-packages/huggingface_hub/_webhooks_payload.py +137 -0
- venv/lib/python3.13/site-packages/huggingface_hub/_webhooks_server.py +376 -0
- venv/lib/python3.13/site-packages/huggingface_hub/community.py +363 -0
- venv/lib/python3.13/site-packages/huggingface_hub/constants.py +294 -0
- venv/lib/python3.13/site-packages/huggingface_hub/dataclasses.py +484 -0
- venv/lib/python3.13/site-packages/huggingface_hub/errors.py +377 -0
- venv/lib/python3.13/site-packages/huggingface_hub/fastai_utils.py +415 -0
- venv/lib/python3.13/site-packages/huggingface_hub/file_download.py +1813 -0
venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
+pip
venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/METADATA
ADDED
@@ -0,0 +1,78 @@
+Metadata-Version: 2.4
+Name: certifi
+Version: 2025.11.12
+Summary: Python package for providing Mozilla's CA Bundle.
+Home-page: https://github.com/certifi/python-certifi
+Author: Kenneth Reitz
+Author-email: me@kennethreitz.com
+License: MPL-2.0
+Project-URL: Source, https://github.com/certifi/python-certifi
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
+Classifier: Natural Language :: English
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Requires-Python: >=3.7
+License-File: LICENSE
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: project-url
+Dynamic: requires-python
+Dynamic: summary
+
+Certifi: Python SSL Certificates
+================================
+
+Certifi provides Mozilla's carefully curated collection of Root Certificates for
+validating the trustworthiness of SSL certificates while verifying the identity
+of TLS hosts. It has been extracted from the `Requests`_ project.
+
+Installation
+------------
+
+``certifi`` is available on PyPI. Simply install it with ``pip``::
+
+    $ pip install certifi
+
+Usage
+-----
+
+To reference the installed certificate authority (CA) bundle, you can use the
+built-in function::
+
+    >>> import certifi
+
+    >>> certifi.where()
+    '/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
+
+Or from the command line::
+
+    $ python -m certifi
+    /usr/local/lib/python3.7/site-packages/certifi/cacert.pem
+
+Enjoy!
+
+.. _`Requests`: https://requests.readthedocs.io/en/master/
+
+Addition/Removal of Certificates
+--------------------------------
+
+Certifi does not support any addition/removal or other modification of the
+CA trust store content. This project is intended to provide a reliable and
+highly portable root of trust to python deployments. Look to upstream projects
+for methods to use alternate trust.
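
Aside (not part of the diff): the path returned by certifi.where(), as documented above, is typically handed to Python's standard ssl module when building a verifying client context. A minimal sketch:

    import ssl
    import certifi

    # Validate TLS peers against Mozilla's CA bundle shipped by certifi.
    context = ssl.create_default_context(cafile=certifi.where())
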
venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
+certifi-2025.11.12.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+certifi-2025.11.12.dist-info/METADATA,sha256=_JprGu_1lWSdHlruRBKcorXnrfvBDhvX_6KRr8HQbLc,2475
+certifi-2025.11.12.dist-info/RECORD,,
+certifi-2025.11.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+certifi-2025.11.12.dist-info/licenses/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
+certifi-2025.11.12.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
+certifi/__init__.py,sha256=1BRSxNMnZW7CZ2oJtYWLoJgfHfcB9i273exwiPwfjJM,94
+certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
+certifi/__pycache__/__init__.cpython-313.pyc,,
+certifi/__pycache__/__main__.cpython-313.pyc,,
+certifi/__pycache__/core.cpython-313.pyc,,
+certifi/cacert.pem,sha256=oa1dZD4hxDtb7XTH4IkdzbWPavUcis4eTwINZUqlKhY,283932
+certifi/core.py,sha256=XFXycndG5pf37ayeF8N32HUuDafsyhkVMbO4BAPWHa0,3394
+certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/WHEEL
ADDED
@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (80.9.0)
+Root-Is-Purelib: true
+Tag: py3-none-any
+
venv/lib/python3.13/site-packages/certifi-2025.11.12.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+certifi
venv/lib/python3.13/site-packages/certifi/__init__.py
ADDED
@@ -0,0 +1,4 @@
+from .core import contents, where
+
+__all__ = ["contents", "where"]
+__version__ = "2025.11.12"
venv/lib/python3.13/site-packages/certifi/__main__.py
ADDED
@@ -0,0 +1,12 @@
+import argparse
+
+from certifi import contents, where
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-c", "--contents", action="store_true")
+args = parser.parse_args()
+
+if args.contents:
+    print(contents())
+else:
+    print(where())
venv/lib/python3.13/site-packages/certifi/cacert.pem
ADDED
The diff for this file is too large to render.
venv/lib/python3.13/site-packages/certifi/core.py
ADDED
@@ -0,0 +1,83 @@
+"""
+certifi.py
+~~~~~~~~~~
+
+This module returns the installation location of cacert.pem or its contents.
+"""
+import sys
+import atexit
+
+def exit_cacert_ctx() -> None:
+    _CACERT_CTX.__exit__(None, None, None)  # type: ignore[union-attr]
+
+
+if sys.version_info >= (3, 11):
+
+    from importlib.resources import as_file, files
+
+    _CACERT_CTX = None
+    _CACERT_PATH = None
+
+    def where() -> str:
+        # This is slightly terrible, but we want to delay extracting the file
+        # in cases where we're inside of a zipimport situation until someone
+        # actually calls where(), but we don't want to re-extract the file
+        # on every call of where(), so we'll do it once then store it in a
+        # global variable.
+        global _CACERT_CTX
+        global _CACERT_PATH
+        if _CACERT_PATH is None:
+            # This is slightly janky, the importlib.resources API wants you to
+            # manage the cleanup of this file, so it doesn't actually return a
+            # path, it returns a context manager that will give you the path
+            # when you enter it and will do any cleanup when you leave it. In
+            # the common case of not needing a temporary file, it will just
+            # return the file system location and the __exit__() is a no-op.
+            #
+            # We also have to hold onto the actual context manager, because
+            # it will do the cleanup whenever it gets garbage collected, so
+            # we will also store that at the global level as well.
+            _CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
+            _CACERT_PATH = str(_CACERT_CTX.__enter__())
+            atexit.register(exit_cacert_ctx)
+
+        return _CACERT_PATH
+
+    def contents() -> str:
+        return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
+
+else:
+
+    from importlib.resources import path as get_path, read_text
+
+    _CACERT_CTX = None
+    _CACERT_PATH = None
+
+    def where() -> str:
+        # This is slightly terrible, but we want to delay extracting the
+        # file in cases where we're inside of a zipimport situation until
+        # someone actually calls where(), but we don't want to re-extract
+        # the file on every call of where(), so we'll do it once then store
+        # it in a global variable.
+        global _CACERT_CTX
+        global _CACERT_PATH
+        if _CACERT_PATH is None:
+            # This is slightly janky, the importlib.resources API wants you
+            # to manage the cleanup of this file, so it doesn't actually
+            # return a path, it returns a context manager that will give
+            # you the path when you enter it and will do any cleanup when
+            # you leave it. In the common case of not needing a temporary
+            # file, it will just return the file system location and the
+            # __exit__() is a no-op.
+            #
+            # We also have to hold onto the actual context manager, because
+            # it will do the cleanup whenever it gets garbage collected, so
+            # we will also store that at the global level as well.
+            _CACERT_CTX = get_path("certifi", "cacert.pem")
+            _CACERT_PATH = str(_CACERT_CTX.__enter__())
+            atexit.register(exit_cacert_ctx)
+
+        return _CACERT_PATH
+
+    def contents() -> str:
+        return read_text("certifi", "cacert.pem", encoding="ascii")
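
Aside (not part of the diff): in the core.py shown above, where() lazily extracts cacert.pem (which matters under zipimport) and caches the path for the life of the process, while contents() returns the PEM text directly. The text form pairs with ssl's cadata parameter when no on-disk path is wanted. A minimal sketch:

    import ssl
    import certifi

    # Build a context from the in-memory PEM data instead of a file path.
    context = ssl.create_default_context(cadata=certifi.contents())
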
venv/lib/python3.13/site-packages/certifi/py.typed
ADDED
File without changes
venv/lib/python3.13/site-packages/charset_normalizer/__init__.py
ADDED
@@ -0,0 +1,48 @@
+"""
+Charset-Normalizer
+~~~~~~~~~~~~~~
+The Real First Universal Charset Detector.
+A library that helps you read text from an unknown charset encoding.
+Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
+All IANA character set names for which the Python core library provides codecs are supported.
+
+Basic usage:
+   >>> from charset_normalizer import from_bytes
+   >>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
+   >>> best_guess = results.best()
+   >>> str(best_guess)
+   'Bсеки човек има право на образование. Oбразованието!'
+
+Others methods and usages are available - see the full documentation
+at <https://github.com/Ousret/charset_normalizer>.
+:copyright: (c) 2021 by Ahmed TAHRI
+:license: MIT, see LICENSE for more details.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from .api import from_bytes, from_fp, from_path, is_binary
+from .legacy import detect
+from .models import CharsetMatch, CharsetMatches
+from .utils import set_logging_handler
+from .version import VERSION, __version__
+
+__all__ = (
+    "from_fp",
+    "from_path",
+    "from_bytes",
+    "is_binary",
+    "detect",
+    "CharsetMatch",
+    "CharsetMatches",
+    "__version__",
+    "VERSION",
+    "set_logging_handler",
+)
+
+# Attach a NullHandler to the top level logger by default
+# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
+
+logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
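
Aside (not part of the diff): besides the from_bytes example in the docstring above, the package re-exports detect() from .legacy as a chardet-compatible shim. A minimal sketch, assuming the usual chardet-style result keys:

    from charset_normalizer import detect

    # detect() mirrors chardet's API: a dict with encoding, language, confidence.
    result = detect("Noël en décembre".encode("cp1252"))
    print(result["encoding"], result["confidence"])
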
venv/lib/python3.13/site-packages/charset_normalizer/__main__.py
ADDED
@@ -0,0 +1,6 @@
+from __future__ import annotations
+
+from .cli import cli_detect
+
+if __name__ == "__main__":
+    cli_detect()
venv/lib/python3.13/site-packages/charset_normalizer/api.py
ADDED
@@ -0,0 +1,669 @@
+from __future__ import annotations
+
+import logging
+from os import PathLike
+from typing import BinaryIO
+
+from .cd import (
+    coherence_ratio,
+    encoding_languages,
+    mb_encoding_languages,
+    merge_coherence_ratios,
+)
+from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
+from .md import mess_ratio
+from .models import CharsetMatch, CharsetMatches
+from .utils import (
+    any_specified_encoding,
+    cut_sequence_chunks,
+    iana_name,
+    identify_sig_or_bom,
+    is_cp_similar,
+    is_multi_byte_encoding,
+    should_strip_sig_or_bom,
+)
+
+logger = logging.getLogger("charset_normalizer")
+explain_handler = logging.StreamHandler()
+explain_handler.setFormatter(
+    logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
+)
+
+
+def from_bytes(
+    sequences: bytes | bytearray,
+    steps: int = 5,
+    chunk_size: int = 512,
+    threshold: float = 0.2,
+    cp_isolation: list[str] | None = None,
+    cp_exclusion: list[str] | None = None,
+    preemptive_behaviour: bool = True,
+    explain: bool = False,
+    language_threshold: float = 0.1,
+    enable_fallback: bool = True,
+) -> CharsetMatches:
+    """
+    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
+    If there is no results, it is a strong indicator that the source is binary/not text.
+    By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
+    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
+
+    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
+    but never take it for granted. Can improve the performance.
+
+    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
+    purpose.
+
+    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
+    By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
+    toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
+    Custom logging format and handler can be set manually.
+    """
+
+    if not isinstance(sequences, (bytearray, bytes)):
+        raise TypeError(
+            "Expected object of type bytes or bytearray, got: {}".format(
+                type(sequences)
+            )
+        )
+
+    if explain:
+        previous_logger_level: int = logger.level
+        logger.addHandler(explain_handler)
+        logger.setLevel(TRACE)
+
+    length: int = len(sequences)
+
+    if length == 0:
+        logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
+        if explain:  # Defensive: ensure exit path clean handler
+            logger.removeHandler(explain_handler)
+            logger.setLevel(previous_logger_level or logging.WARNING)
+        return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
+
+    if cp_isolation is not None:
+        logger.log(
+            TRACE,
+            "cp_isolation is set. use this flag for debugging purpose. "
+            "limited list of encoding allowed : %s.",
+            ", ".join(cp_isolation),
+        )
+        cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
+    else:
+        cp_isolation = []
+
+    if cp_exclusion is not None:
+        logger.log(
+            TRACE,
+            "cp_exclusion is set. use this flag for debugging purpose. "
+            "limited list of encoding excluded : %s.",
+            ", ".join(cp_exclusion),
+        )
+        cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
+    else:
+        cp_exclusion = []
+
+    if length <= (chunk_size * steps):
+        logger.log(
+            TRACE,
+            "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
+            steps,
+            chunk_size,
+            length,
+        )
+        steps = 1
+        chunk_size = length
+
+    if steps > 1 and length / steps < chunk_size:
+        chunk_size = int(length / steps)
+
+    is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
+    is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
+
+    if is_too_small_sequence:
+        logger.log(
+            TRACE,
+            "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
+                length
+            ),
+        )
+    elif is_too_large_sequence:
+        logger.log(
+            TRACE,
+            "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
+                length
+            ),
+        )
+
+    prioritized_encodings: list[str] = []
+
+    specified_encoding: str | None = (
+        any_specified_encoding(sequences) if preemptive_behaviour else None
+    )
+
+    if specified_encoding is not None:
+        prioritized_encodings.append(specified_encoding)
+        logger.log(
+            TRACE,
+            "Detected declarative mark in sequence. Priority +1 given for %s.",
+            specified_encoding,
+        )
+
+    tested: set[str] = set()
+    tested_but_hard_failure: list[str] = []
+    tested_but_soft_failure: list[str] = []
+
+    fallback_ascii: CharsetMatch | None = None
+    fallback_u8: CharsetMatch | None = None
+    fallback_specified: CharsetMatch | None = None
+
+    results: CharsetMatches = CharsetMatches()
+
+    early_stop_results: CharsetMatches = CharsetMatches()
+
+    sig_encoding, sig_payload = identify_sig_or_bom(sequences)
+
+    if sig_encoding is not None:
+        prioritized_encodings.append(sig_encoding)
+        logger.log(
+            TRACE,
+            "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
+            len(sig_payload),
+            sig_encoding,
+        )
+
+    prioritized_encodings.append("ascii")
+
+    if "utf_8" not in prioritized_encodings:
+        prioritized_encodings.append("utf_8")
+
+    for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
+        if cp_isolation and encoding_iana not in cp_isolation:
+            continue
+
+        if cp_exclusion and encoding_iana in cp_exclusion:
+            continue
+
+        if encoding_iana in tested:
+            continue
+
+        tested.add(encoding_iana)
+
+        decoded_payload: str | None = None
+        bom_or_sig_available: bool = sig_encoding == encoding_iana
+        strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
+            encoding_iana
+        )
+
+        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
+            logger.log(
+                TRACE,
+                "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
+                encoding_iana,
+            )
+            continue
+        if encoding_iana in {"utf_7"} and not bom_or_sig_available:
+            logger.log(
+                TRACE,
+                "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
+                encoding_iana,
+            )
+            continue
+
+        try:
+            is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
+        except (ModuleNotFoundError, ImportError):
+            logger.log(
+                TRACE,
+                "Encoding %s does not provide an IncrementalDecoder",
+                encoding_iana,
+            )
+            continue
+
+        try:
+            if is_too_large_sequence and is_multi_byte_decoder is False:
+                str(
+                    (
+                        sequences[: int(50e4)]
+                        if strip_sig_or_bom is False
+                        else sequences[len(sig_payload) : int(50e4)]
+                    ),
+                    encoding=encoding_iana,
+                )
+            else:
+                decoded_payload = str(
+                    (
+                        sequences
+                        if strip_sig_or_bom is False
+                        else sequences[len(sig_payload) :]
+                    ),
+                    encoding=encoding_iana,
+                )
+        except (UnicodeDecodeError, LookupError) as e:
+            if not isinstance(e, LookupError):
+                logger.log(
+                    TRACE,
+                    "Code page %s does not fit given bytes sequence at ALL. %s",
+                    encoding_iana,
+                    str(e),
+                )
+            tested_but_hard_failure.append(encoding_iana)
+            continue
+
+        similar_soft_failure_test: bool = False
+
+        for encoding_soft_failed in tested_but_soft_failure:
+            if is_cp_similar(encoding_iana, encoding_soft_failed):
+                similar_soft_failure_test = True
+                break
+
+        if similar_soft_failure_test:
+            logger.log(
+                TRACE,
+                "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
+                encoding_iana,
+                encoding_soft_failed,
+            )
+            continue
+
+        r_ = range(
+            0 if not bom_or_sig_available else len(sig_payload),
+            length,
+            int(length / steps),
+        )
+
+        multi_byte_bonus: bool = (
+            is_multi_byte_decoder
+            and decoded_payload is not None
+            and len(decoded_payload) < length
+        )
+
+        if multi_byte_bonus:
+            logger.log(
+                TRACE,
+                "Code page %s is a multi byte encoding table and it appear that at least one character "
+                "was encoded using n-bytes.",
+                encoding_iana,
+            )
+
+        max_chunk_gave_up: int = int(len(r_) / 4)
+
+        max_chunk_gave_up = max(max_chunk_gave_up, 2)
+        early_stop_count: int = 0
+        lazy_str_hard_failure = False
+
+        md_chunks: list[str] = []
+        md_ratios = []
+
+        try:
+            for chunk in cut_sequence_chunks(
+                sequences,
+                encoding_iana,
+                r_,
+                chunk_size,
+                bom_or_sig_available,
+                strip_sig_or_bom,
+                sig_payload,
+                is_multi_byte_decoder,
+                decoded_payload,
+            ):
+                md_chunks.append(chunk)
+
+                md_ratios.append(
+                    mess_ratio(
+                        chunk,
+                        threshold,
+                        explain is True and 1 <= len(cp_isolation) <= 2,
+                    )
+                )
+
+                if md_ratios[-1] >= threshold:
+                    early_stop_count += 1
+
+                if (early_stop_count >= max_chunk_gave_up) or (
+                    bom_or_sig_available and strip_sig_or_bom is False
+                ):
+                    break
+        except (
+            UnicodeDecodeError
+        ) as e:  # Lazy str loading may have missed something there
+            logger.log(
+                TRACE,
+                "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
+                encoding_iana,
+                str(e),
+            )
+            early_stop_count = max_chunk_gave_up
+            lazy_str_hard_failure = True
+
+        # We might want to check the sequence again with the whole content
+        # Only if initial MD tests passes
+        if (
+            not lazy_str_hard_failure
+            and is_too_large_sequence
+            and not is_multi_byte_decoder
+        ):
+            try:
+                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
+            except UnicodeDecodeError as e:
+                logger.log(
+                    TRACE,
+                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
+                    encoding_iana,
+                    str(e),
+                )
+                tested_but_hard_failure.append(encoding_iana)
+                continue
+
+        mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
+        if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
+            tested_but_soft_failure.append(encoding_iana)
+            logger.log(
+                TRACE,
+                "%s was excluded because of initial chaos probing. Gave up %i time(s). "
+                "Computed mean chaos is %f %%.",
+                encoding_iana,
+                early_stop_count,
+                round(mean_mess_ratio * 100, ndigits=3),
+            )
+            # Preparing those fallbacks in case we got nothing.
+            if (
+                enable_fallback
+                and encoding_iana
+                in ["ascii", "utf_8", specified_encoding, "utf_16", "utf_32"]
+                and not lazy_str_hard_failure
+            ):
+                fallback_entry = CharsetMatch(
+                    sequences,
+                    encoding_iana,
+                    threshold,
+                    bom_or_sig_available,
+                    [],
+                    decoded_payload,
+                    preemptive_declaration=specified_encoding,
+                )
+                if encoding_iana == specified_encoding:
+                    fallback_specified = fallback_entry
+                elif encoding_iana == "ascii":
+                    fallback_ascii = fallback_entry
+                else:
+                    fallback_u8 = fallback_entry
+            continue
+
+        logger.log(
+            TRACE,
+            "%s passed initial chaos probing. Mean measured chaos is %f %%",
+            encoding_iana,
+            round(mean_mess_ratio * 100, ndigits=3),
+        )
+
+        if not is_multi_byte_decoder:
+            target_languages: list[str] = encoding_languages(encoding_iana)
+        else:
+            target_languages = mb_encoding_languages(encoding_iana)
+
+        if target_languages:
+            logger.log(
+                TRACE,
+                "{} should target any language(s) of {}".format(
+                    encoding_iana, str(target_languages)
+                ),
+            )
+
+        cd_ratios = []
+
+        # We shall skip the CD when its about ASCII
+        # Most of the time its not relevant to run "language-detection" on it.
+        if encoding_iana != "ascii":
+            for chunk in md_chunks:
+                chunk_languages = coherence_ratio(
+                    chunk,
+                    language_threshold,
+                    ",".join(target_languages) if target_languages else None,
+                )
+
+                cd_ratios.append(chunk_languages)
+
+        cd_ratios_merged = merge_coherence_ratios(cd_ratios)
+
+        if cd_ratios_merged:
+            logger.log(
+                TRACE,
+                "We detected language {} using {}".format(
+                    cd_ratios_merged, encoding_iana
+                ),
+            )
+
+        current_match = CharsetMatch(
+            sequences,
+            encoding_iana,
+            mean_mess_ratio,
+            bom_or_sig_available,
+            cd_ratios_merged,
+            (
+                decoded_payload
+                if (
+                    is_too_large_sequence is False
+                    or encoding_iana in [specified_encoding, "ascii", "utf_8"]
+                )
+                else None
+            ),
+            preemptive_declaration=specified_encoding,
+        )
+
+        results.append(current_match)
+
+        if (
+            encoding_iana in [specified_encoding, "ascii", "utf_8"]
+            and mean_mess_ratio < 0.1
+        ):
+            # If md says nothing to worry about, then... stop immediately!
+            if mean_mess_ratio == 0.0:
+                logger.debug(
+                    "Encoding detection: %s is most likely the one.",
+                    current_match.encoding,
+                )
+                if explain:  # Defensive: ensure exit path clean handler
+                    logger.removeHandler(explain_handler)
+                    logger.setLevel(previous_logger_level)
+                return CharsetMatches([current_match])
+
+            early_stop_results.append(current_match)
+
+            if (
+                len(early_stop_results)
+                and (specified_encoding is None or specified_encoding in tested)
+                and "ascii" in tested
+                and "utf_8" in tested
+            ):
+                probable_result: CharsetMatch = early_stop_results.best()  # type: ignore[assignment]
+                logger.debug(
+                    "Encoding detection: %s is most likely the one.",
+                    probable_result.encoding,
+                )
+                if explain:  # Defensive: ensure exit path clean handler
+                    logger.removeHandler(explain_handler)
+                    logger.setLevel(previous_logger_level)
+
+                return CharsetMatches([probable_result])
+
+        if encoding_iana == sig_encoding:
+            logger.debug(
+                "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
+                "the beginning of the sequence.",
+                encoding_iana,
+            )
+            if explain:  # Defensive: ensure exit path clean handler
+                logger.removeHandler(explain_handler)
+                logger.setLevel(previous_logger_level)
+            return CharsetMatches([results[encoding_iana]])
+
+    if len(results) == 0:
+        if fallback_u8 or fallback_ascii or fallback_specified:
+            logger.log(
+                TRACE,
+                "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
+            )
+
+        if fallback_specified:
+            logger.debug(
+                "Encoding detection: %s will be used as a fallback match",
+                fallback_specified.encoding,
+            )
+            results.append(fallback_specified)
+        elif (
+            (fallback_u8 and fallback_ascii is None)
+            or (
+                fallback_u8
+                and fallback_ascii
+                and fallback_u8.fingerprint != fallback_ascii.fingerprint
+            )
+            or (fallback_u8 is not None)
+        ):
+            logger.debug("Encoding detection: utf_8 will be used as a fallback match")
+            results.append(fallback_u8)
+        elif fallback_ascii:
+            logger.debug("Encoding detection: ascii will be used as a fallback match")
+            results.append(fallback_ascii)
+
+    if results:
+        logger.debug(
+            "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
+            results.best().encoding,  # type: ignore
+            len(results) - 1,
+        )
+    else:
+        logger.debug("Encoding detection: Unable to determine any suitable charset.")
+
+    if explain:
+        logger.removeHandler(explain_handler)
+        logger.setLevel(previous_logger_level)
+
+    return results
+
+
+def from_fp(
+    fp: BinaryIO,
+    steps: int = 5,
+    chunk_size: int = 512,
+    threshold: float = 0.20,
+    cp_isolation: list[str] | None = None,
+    cp_exclusion: list[str] | None = None,
+    preemptive_behaviour: bool = True,
+    explain: bool = False,
+    language_threshold: float = 0.1,
+    enable_fallback: bool = True,
+) -> CharsetMatches:
+    """
+    Same thing than the function from_bytes but using a file pointer that is already ready.
+    Will not close the file pointer.
+    """
+    return from_bytes(
+        fp.read(),
+        steps,
+        chunk_size,
+        threshold,
+        cp_isolation,
+        cp_exclusion,
+        preemptive_behaviour,
+        explain,
+        language_threshold,
+        enable_fallback,
+    )
+
+
+def from_path(
+    path: str | bytes | PathLike,  # type: ignore[type-arg]
+    steps: int = 5,
+    chunk_size: int = 512,
+    threshold: float = 0.20,
+    cp_isolation: list[str] | None = None,
+    cp_exclusion: list[str] | None = None,
+    preemptive_behaviour: bool = True,
+    explain: bool = False,
+    language_threshold: float = 0.1,
+    enable_fallback: bool = True,
+) -> CharsetMatches:
+    """
+    Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
+    Can raise IOError.
+    """
+    with open(path, "rb") as fp:
+        return from_fp(
+            fp,
+            steps,
+            chunk_size,
+            threshold,
+            cp_isolation,
+            cp_exclusion,
+            preemptive_behaviour,
+            explain,
+            language_threshold,
+            enable_fallback,
+        )
+
+
+def is_binary(
+    fp_or_path_or_payload: PathLike | str | BinaryIO | bytes,  # type: ignore[type-arg]
+    steps: int = 5,
+    chunk_size: int = 512,
+    threshold: float = 0.20,
+    cp_isolation: list[str] | None = None,
+    cp_exclusion: list[str] | None = None,
+    preemptive_behaviour: bool = True,
+    explain: bool = False,
+    language_threshold: float = 0.1,
+    enable_fallback: bool = False,
+) -> bool:
+    """
+    Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
+    Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
+    are disabled to be stricter around ASCII-compatible but unlikely to be a string.
+    """
+    if isinstance(fp_or_path_or_payload, (str, PathLike)):
+        guesses = from_path(
+            fp_or_path_or_payload,
+            steps=steps,
+            chunk_size=chunk_size,
+            threshold=threshold,
+            cp_isolation=cp_isolation,
+            cp_exclusion=cp_exclusion,
+            preemptive_behaviour=preemptive_behaviour,
+            explain=explain,
+            language_threshold=language_threshold,
+            enable_fallback=enable_fallback,
+        )
+    elif isinstance(
+        fp_or_path_or_payload,
+        (
+            bytes,
+            bytearray,
+        ),
+    ):
+        guesses = from_bytes(
+            fp_or_path_or_payload,
+            steps=steps,
+            chunk_size=chunk_size,
+            threshold=threshold,
+            cp_isolation=cp_isolation,
+            cp_exclusion=cp_exclusion,
+            preemptive_behaviour=preemptive_behaviour,
+            explain=explain,
+            language_threshold=language_threshold,
+            enable_fallback=enable_fallback,
+        )
+    else:
+        guesses = from_fp(
+            fp_or_path_or_payload,
+            steps=steps,
+            chunk_size=chunk_size,
+            threshold=threshold,
+            cp_isolation=cp_isolation,
+            cp_exclusion=cp_exclusion,
+            preemptive_behaviour=preemptive_behaviour,
+            explain=explain,
+            language_threshold=language_threshold,
+            enable_fallback=enable_fallback,
+        )
+
+    return not guesses
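
Aside (not part of the diff): a minimal sketch exercising the public entry points defined above; "sample.txt" is a hypothetical path, not a file in this repository:

    from charset_normalizer import from_path, is_binary

    matches = from_path("sample.txt")  # may raise IOError, per the docstring
    best = matches.best()
    if best is not None:
        print(best.encoding)  # e.g. "utf_8"

    # is_binary() returns True when no plausible charset survives detection.
    print(is_binary("sample.txt"))
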
venv/lib/python3.13/site-packages/charset_normalizer/cd.py
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import importlib
|
| 4 |
+
from codecs import IncrementalDecoder
|
| 5 |
+
from collections import Counter
|
| 6 |
+
from functools import lru_cache
|
| 7 |
+
from typing import Counter as TypeCounter
|
| 8 |
+
|
| 9 |
+
from .constant import (
|
| 10 |
+
FREQUENCIES,
|
| 11 |
+
KO_NAMES,
|
| 12 |
+
LANGUAGE_SUPPORTED_COUNT,
|
| 13 |
+
TOO_SMALL_SEQUENCE,
|
| 14 |
+
ZH_NAMES,
|
| 15 |
+
)
|
| 16 |
+
from .md import is_suspiciously_successive_range
|
| 17 |
+
from .models import CoherenceMatches
|
| 18 |
+
from .utils import (
|
| 19 |
+
is_accentuated,
|
| 20 |
+
is_latin,
|
| 21 |
+
is_multi_byte_encoding,
|
| 22 |
+
is_unicode_range_secondary,
|
| 23 |
+
unicode_range,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def encoding_unicode_range(iana_name: str) -> list[str]:
|
| 28 |
+
"""
|
| 29 |
+
Return associated unicode ranges in a single byte code page.
|
| 30 |
+
"""
|
| 31 |
+
if is_multi_byte_encoding(iana_name):
|
| 32 |
+
raise OSError("Function not supported on multi-byte code page")
|
| 33 |
+
|
| 34 |
+
decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
|
| 35 |
+
|
| 36 |
+
p: IncrementalDecoder = decoder(errors="ignore")
|
| 37 |
+
seen_ranges: dict[str, int] = {}
|
| 38 |
+
character_count: int = 0
|
| 39 |
+
|
| 40 |
+
for i in range(0x40, 0xFF):
|
| 41 |
+
chunk: str = p.decode(bytes([i]))
|
| 42 |
+
|
| 43 |
+
if chunk:
|
| 44 |
+
character_range: str | None = unicode_range(chunk)
|
| 45 |
+
|
| 46 |
+
if character_range is None:
|
| 47 |
+
continue
|
| 48 |
+
|
| 49 |
+
if is_unicode_range_secondary(character_range) is False:
|
| 50 |
+
if character_range not in seen_ranges:
|
| 51 |
+
seen_ranges[character_range] = 0
|
| 52 |
+
seen_ranges[character_range] += 1
|
| 53 |
+
character_count += 1
|
| 54 |
+
|
| 55 |
+
return sorted(
|
| 56 |
+
[
|
| 57 |
+
character_range
|
| 58 |
+
for character_range in seen_ranges
|
| 59 |
+
if seen_ranges[character_range] / character_count >= 0.15
|
| 60 |
+
]
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def unicode_range_languages(primary_range: str) -> list[str]:
|
| 65 |
+
"""
|
| 66 |
+
Return inferred languages used with a unicode range.
|
| 67 |
+
"""
|
| 68 |
+
languages: list[str] = []
|
| 69 |
+
|
| 70 |
+
for language, characters in FREQUENCIES.items():
|
| 71 |
+
for character in characters:
|
| 72 |
+
if unicode_range(character) == primary_range:
|
| 73 |
+
languages.append(language)
|
| 74 |
+
break
|
| 75 |
+
|
| 76 |
+
return languages
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
@lru_cache()
|
| 80 |
+
def encoding_languages(iana_name: str) -> list[str]:
|
| 81 |
+
"""
|
| 82 |
+
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
|
| 83 |
+
This function does the correspondence.
|
| 84 |
+
"""
|
| 85 |
+
unicode_ranges: list[str] = encoding_unicode_range(iana_name)
|
| 86 |
+
primary_range: str | None = None
|
| 87 |
+
|
| 88 |
+
for specified_range in unicode_ranges:
|
| 89 |
+
if "Latin" not in specified_range:
|
| 90 |
+
primary_range = specified_range
|
| 91 |
+
break
|
| 92 |
+
|
| 93 |
+
if primary_range is None:
|
| 94 |
+
return ["Latin Based"]
|
| 95 |
+
|
| 96 |
+
return unicode_range_languages(primary_range)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
@lru_cache()
|
| 100 |
+
def mb_encoding_languages(iana_name: str) -> list[str]:
|
| 101 |
+
"""
|
| 102 |
+
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
|
| 103 |
+
This function does the correspondence.
|
| 104 |
+
"""
|
| 105 |
+
if (
|
| 106 |
+
iana_name.startswith("shift_")
|
| 107 |
+
or iana_name.startswith("iso2022_jp")
|
| 108 |
+
or iana_name.startswith("euc_j")
|
| 109 |
+
or iana_name == "cp932"
|
| 110 |
+
):
|
| 111 |
+
return ["Japanese"]
|
| 112 |
+
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
|
| 113 |
+
return ["Chinese"]
|
| 114 |
+
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
|
| 115 |
+
return ["Korean"]
|
| 116 |
+
|
| 117 |
+
return []
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
|
| 121 |
+
def get_target_features(language: str) -> tuple[bool, bool]:
|
| 122 |
+
"""
|
| 123 |
+
Determine main aspects from a supported language if it contains accents and if is pure Latin.
|
| 124 |
+
"""
|
| 125 |
+
target_have_accents: bool = False
|
| 126 |
+
target_pure_latin: bool = True
|
| 127 |
+
|
| 128 |
+
for character in FREQUENCIES[language]:
|
| 129 |
+
if not target_have_accents and is_accentuated(character):
|
| 130 |
+
target_have_accents = True
|
| 131 |
+
if target_pure_latin and is_latin(character) is False:
|
| 132 |
+
target_pure_latin = False
|
| 133 |
+
|
| 134 |
+
return target_have_accents, target_pure_latin
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def alphabet_languages(
|
| 138 |
+
characters: list[str], ignore_non_latin: bool = False
|
| 139 |
+
) -> list[str]:
|
| 140 |
+
"""
|
| 141 |
+
Return associated languages associated to given characters.
|
| 142 |
+
"""
|
| 143 |
+
languages: list[tuple[str, float]] = []
|
| 144 |
+
|
| 145 |
+
source_have_accents = any(is_accentuated(character) for character in characters)
|
| 146 |
+
|
| 147 |
+
for language, language_characters in FREQUENCIES.items():
|
| 148 |
+
target_have_accents, target_pure_latin = get_target_features(language)
|
| 149 |
+
|
| 150 |
+
if ignore_non_latin and target_pure_latin is False:
|
| 151 |
+
continue
|
| 152 |
+
|
| 153 |
+
if target_have_accents is False and source_have_accents:
|
| 154 |
+
continue
|
| 155 |
+
|
| 156 |
+
character_count: int = len(language_characters)
|
| 157 |
+
|
| 158 |
+
character_match_count: int = len(
|
| 159 |
+
[c for c in language_characters if c in characters]
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
ratio: float = character_match_count / character_count
|
| 163 |
+
|
| 164 |
+
if ratio >= 0.2:
|
| 165 |
+
languages.append((language, ratio))
|
| 166 |
+
|
| 167 |
+
languages = sorted(languages, key=lambda x: x[1], reverse=True)
|
| 168 |
+
|
| 169 |
+
return [compatible_language[0] for compatible_language in languages]
|
| 170 |
+
|
| 171 |
+
|
def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Determine whether an ordered character list (most frequent to rarest) matches a particular language.
    The result is a ratio between 0.0 (no correspondence at all) and 1.0 (near-perfect fit).
    Beware that this function is deliberately not strict about the match, to ease detection (a close match counts fully).
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available")

    character_approved_count: int = 0
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    large_alphabet: bool = target_language_characters_count > 26

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in FREQUENCIES_language_set:
            continue

        character_rank_in_language: int = FREQUENCIES[language].index(character)
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        characters_before_source: list[str] = FREQUENCIES[language][
            0:character_rank_in_language
        ]
        characters_after_source: list[str] = FREQUENCIES[language][
            character_rank_in_language:
        ]
        characters_before: list[str] = ordered_characters[0:character_rank]
        characters_after: list[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)

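For small alphabets, the code projects the observed rank onto the scale of the language's frequency list and tolerates a drift of at most 4 positions; large alphabets get a proportional tolerance of a third of the list. A worked sketch of the small-alphabet branch, with illustrative numbers only:

# Illustrative numbers only; not real detection output.
ordered_characters_count = 13          # ranks observed in the analysed layer
target_language_characters_count = 26  # size of the language frequency list

character_rank = 2                     # third most frequent in the layer
expected_projection_ratio = target_language_characters_count / ordered_characters_count
character_rank_projection = int(character_rank * expected_projection_ratio)  # 2 * 2.0 -> 4

character_rank_in_language = 6         # its rank in the language table
drift = abs(character_rank_projection - character_rank_in_language)
print(drift, drift <= 4)  # 2 True: not discarded; it goes on to the neighbourhood check
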
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Given a decoded text sequence, return a list of strings, one per Unicode range/alphabet.
    E.g. a text containing English/Latin plus a bit of Hebrew will return two items:
    one containing the Latin letters and the other the Hebrew ones.
    """
    layers: dict[str, str] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: str | None = unicode_range(character)

        if character_range is None:
            continue

        layer_target_range: str | None = None

        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = character.lower()
            continue

        layers[layer_target_range] += character.lower()

    return list(layers.values())

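A quick demonstration of the split, assuming the charset_normalizer package is importable:

from charset_normalizer.cd import alpha_unicode_split

# Latin and Hebrew letters land in separate layers; non-alphabetic characters are dropped.
print(alpha_unicode_split("Hello שלום"))  # expected: ['hello', 'שלום']
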
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge results previously produced by coherence_ratio.
    The return type is the same as coherence_ratio.
    """
    per_language_ratios: dict[str, list[float]] = {}
    for result in results:
        for sub_result in result:
            language, ratio = sub_result
            if language not in per_language_ratios:
                per_language_ratios[language] = [ratio]
                continue
            per_language_ratios[language].append(ratio)

    merge = [
        (
            language,
            round(
                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
                4,
            ),
        )
        for language in per_language_ratios
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)

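For instance, ratios gathered from two chunks of the same payload average out per language (a small sketch, assuming the package is importable):

from charset_normalizer.cd import merge_coherence_ratios

chunk_results = [
    [("English", 0.9), ("French", 0.6)],
    [("English", 0.7)],
]
print(merge_coherence_ratios(chunk_results))  # [('English', 0.8), ('French', 0.6)]
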
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function keeps only the best match and removes the em dash from its name.
    """
    index_results: dict[str, list[float]] = dict()

    for result in results:
        language, ratio = result
        no_em_name: str = language.replace("—", "")

        if no_em_name not in index_results:
            index_results[no_em_name] = []

        index_results[no_em_name].append(ratio)

    if any(len(index_results[e]) > 1 for e in index_results):
        filtered_results: CoherenceMatches = []

        for language in index_results:
            filtered_results.append((language, max(index_results[language])))

        return filtered_results

    return results

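The effect, in a small sketch:

from charset_normalizer.cd import filter_alt_coherence_matches

# "English—" is an alternative table for "English"; only the best score survives.
print(filter_alt_coherence_matches([("English", 0.62), ("English—", 0.71)]))
# [('English', 0.71)]
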
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect any language identifiable in the given sequence. The sequence is analysed in layers;
    a layer is the set of characters extracted for one alphabet/Unicode range.
    """

    results: list[tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: list[str] = [c for c, o in most_common]

        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
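End to end, the whole pipeline is reachable through this one function (a sketch, assuming the package is installed; the layer must exceed TOO_SMALL_SEQUENCE alphabetic characters to be scored):

from charset_normalizer.cd import coherence_ratio

text = "Это достаточно длинный текст, чтобы пройти порог минимального размера."
matches = coherence_ratio(text)
print(matches[0][0] if matches else "no match")  # most likely 'Russian'
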
venv/lib/python3.13/site-packages/charset_normalizer/constant.py
ADDED
@@ -0,0 +1,2015 @@
from __future__ import annotations

from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
from encodings.aliases import aliases
from re import IGNORECASE
from re import compile as re_compile

# For each eligible encoding, its BOM/SIG marker bytes (a single marker or a list of markers)
ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
    "utf_8": BOM_UTF8,
    "utf_7": [
        b"\x2b\x2f\x76\x38",
        b"\x2b\x2f\x76\x39",
        b"\x2b\x2f\x76\x2b",
        b"\x2b\x2f\x76\x2f",
        b"\x2b\x2f\x76\x38\x2d",
    ],
    "gb18030": b"\x84\x31\x95\x33",
    "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
    "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
}
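A sketch of how such a BOM/SIG table can be used to sniff an encoding; the package's own helper for this lives in charset_normalizer.utils (identify_sig_or_bom):

from codecs import BOM_UTF8
from charset_normalizer.constant import ENCODING_MARKS

def sniff_bom(payload: bytes) -> str | None:
    # Return the first encoding whose BOM/SIG prefixes the payload, if any.
    for encoding, marks in ENCODING_MARKS.items():
        for mark in [marks] if isinstance(marks, bytes) else marks:
            if payload.startswith(mark):
                return encoding
    return None

print(sniff_bom(BOM_UTF8 + "hello".encode("utf-8")))  # utf_8
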
TOO_SMALL_SEQUENCE: int = 32
TOO_BIG_SEQUENCE: int = int(10e6)

UTF8_MAXIMAL_ALLOCATION: int = 1_112_064

# Up-to-date Unicode ucd/15.0.0
UNICODE_RANGES_COMBINED: dict[str, range] = {
    "Control character": range(32),
    "Basic Latin": range(32, 128),
    "Latin-1 Supplement": range(128, 256),
    "Latin Extended-A": range(256, 384),
    "Latin Extended-B": range(384, 592),
    "IPA Extensions": range(592, 688),
    "Spacing Modifier Letters": range(688, 768),
    "Combining Diacritical Marks": range(768, 880),
    "Greek and Coptic": range(880, 1024),
    "Cyrillic": range(1024, 1280),
    "Cyrillic Supplement": range(1280, 1328),
    "Armenian": range(1328, 1424),
    "Hebrew": range(1424, 1536),
    "Arabic": range(1536, 1792),
    "Syriac": range(1792, 1872),
    "Arabic Supplement": range(1872, 1920),
    "Thaana": range(1920, 1984),
    "NKo": range(1984, 2048),
    "Samaritan": range(2048, 2112),
    "Mandaic": range(2112, 2144),
    "Syriac Supplement": range(2144, 2160),
    "Arabic Extended-B": range(2160, 2208),
    "Arabic Extended-A": range(2208, 2304),
    "Devanagari": range(2304, 2432),
    "Bengali": range(2432, 2560),
    "Gurmukhi": range(2560, 2688),
    "Gujarati": range(2688, 2816),
    "Oriya": range(2816, 2944),
    "Tamil": range(2944, 3072),
    "Telugu": range(3072, 3200),
    "Kannada": range(3200, 3328),
    "Malayalam": range(3328, 3456),
    "Sinhala": range(3456, 3584),
    "Thai": range(3584, 3712),
    "Lao": range(3712, 3840),
    "Tibetan": range(3840, 4096),
    "Myanmar": range(4096, 4256),
    "Georgian": range(4256, 4352),
    "Hangul Jamo": range(4352, 4608),
    "Ethiopic": range(4608, 4992),
    "Ethiopic Supplement": range(4992, 5024),
    "Cherokee": range(5024, 5120),
    "Unified Canadian Aboriginal Syllabics": range(5120, 5760),
    "Ogham": range(5760, 5792),
    "Runic": range(5792, 5888),
    "Tagalog": range(5888, 5920),
    "Hanunoo": range(5920, 5952),
    "Buhid": range(5952, 5984),
    "Tagbanwa": range(5984, 6016),
    "Khmer": range(6016, 6144),
    "Mongolian": range(6144, 6320),
    "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
    "Limbu": range(6400, 6480),
    "Tai Le": range(6480, 6528),
    "New Tai Lue": range(6528, 6624),
    "Khmer Symbols": range(6624, 6656),
    "Buginese": range(6656, 6688),
    "Tai Tham": range(6688, 6832),
    "Combining Diacritical Marks Extended": range(6832, 6912),
    "Balinese": range(6912, 7040),
    "Sundanese": range(7040, 7104),
    "Batak": range(7104, 7168),
    "Lepcha": range(7168, 7248),
    "Ol Chiki": range(7248, 7296),
    "Cyrillic Extended-C": range(7296, 7312),
    "Georgian Extended": range(7312, 7360),
    "Sundanese Supplement": range(7360, 7376),
    "Vedic Extensions": range(7376, 7424),
    "Phonetic Extensions": range(7424, 7552),
    "Phonetic Extensions Supplement": range(7552, 7616),
    "Combining Diacritical Marks Supplement": range(7616, 7680),
    "Latin Extended Additional": range(7680, 7936),
    "Greek Extended": range(7936, 8192),
    "General Punctuation": range(8192, 8304),
    "Superscripts and Subscripts": range(8304, 8352),
    "Currency Symbols": range(8352, 8400),
    "Combining Diacritical Marks for Symbols": range(8400, 8448),
    "Letterlike Symbols": range(8448, 8528),
    "Number Forms": range(8528, 8592),
    "Arrows": range(8592, 8704),
    "Mathematical Operators": range(8704, 8960),
    "Miscellaneous Technical": range(8960, 9216),
    "Control Pictures": range(9216, 9280),
    "Optical Character Recognition": range(9280, 9312),
    "Enclosed Alphanumerics": range(9312, 9472),
    "Box Drawing": range(9472, 9600),
    "Block Elements": range(9600, 9632),
    "Geometric Shapes": range(9632, 9728),
    "Miscellaneous Symbols": range(9728, 9984),
    "Dingbats": range(9984, 10176),
    "Miscellaneous Mathematical Symbols-A": range(10176, 10224),
    "Supplemental Arrows-A": range(10224, 10240),
    "Braille Patterns": range(10240, 10496),
    "Supplemental Arrows-B": range(10496, 10624),
    "Miscellaneous Mathematical Symbols-B": range(10624, 10752),
    "Supplemental Mathematical Operators": range(10752, 11008),
    "Miscellaneous Symbols and Arrows": range(11008, 11264),
    "Glagolitic": range(11264, 11360),
    "Latin Extended-C": range(11360, 11392),
    "Coptic": range(11392, 11520),
    "Georgian Supplement": range(11520, 11568),
    "Tifinagh": range(11568, 11648),
    "Ethiopic Extended": range(11648, 11744),
    "Cyrillic Extended-A": range(11744, 11776),
    "Supplemental Punctuation": range(11776, 11904),
    "CJK Radicals Supplement": range(11904, 12032),
    "Kangxi Radicals": range(12032, 12256),
    "Ideographic Description Characters": range(12272, 12288),
    "CJK Symbols and Punctuation": range(12288, 12352),
    "Hiragana": range(12352, 12448),
    "Katakana": range(12448, 12544),
    "Bopomofo": range(12544, 12592),
    "Hangul Compatibility Jamo": range(12592, 12688),
    "Kanbun": range(12688, 12704),
    "Bopomofo Extended": range(12704, 12736),
    "CJK Strokes": range(12736, 12784),
    "Katakana Phonetic Extensions": range(12784, 12800),
    "Enclosed CJK Letters and Months": range(12800, 13056),
    "CJK Compatibility": range(13056, 13312),
    "CJK Unified Ideographs Extension A": range(13312, 19904),
    "Yijing Hexagram Symbols": range(19904, 19968),
    "CJK Unified Ideographs": range(19968, 40960),
    "Yi Syllables": range(40960, 42128),
    "Yi Radicals": range(42128, 42192),
    "Lisu": range(42192, 42240),
    "Vai": range(42240, 42560),
    "Cyrillic Extended-B": range(42560, 42656),
    "Bamum": range(42656, 42752),
    "Modifier Tone Letters": range(42752, 42784),
    "Latin Extended-D": range(42784, 43008),
    "Syloti Nagri": range(43008, 43056),
    "Common Indic Number Forms": range(43056, 43072),
    "Phags-pa": range(43072, 43136),
    "Saurashtra": range(43136, 43232),
    "Devanagari Extended": range(43232, 43264),
    "Kayah Li": range(43264, 43312),
    "Rejang": range(43312, 43360),
    "Hangul Jamo Extended-A": range(43360, 43392),
    "Javanese": range(43392, 43488),
    "Myanmar Extended-B": range(43488, 43520),
    "Cham": range(43520, 43616),
    "Myanmar Extended-A": range(43616, 43648),
    "Tai Viet": range(43648, 43744),
    "Meetei Mayek Extensions": range(43744, 43776),
    "Ethiopic Extended-A": range(43776, 43824),
    "Latin Extended-E": range(43824, 43888),
    "Cherokee Supplement": range(43888, 43968),
    "Meetei Mayek": range(43968, 44032),
    "Hangul Syllables": range(44032, 55216),
    "Hangul Jamo Extended-B": range(55216, 55296),
    "High Surrogates": range(55296, 56192),
    "High Private Use Surrogates": range(56192, 56320),
    "Low Surrogates": range(56320, 57344),
    "Private Use Area": range(57344, 63744),
    "CJK Compatibility Ideographs": range(63744, 64256),
    "Alphabetic Presentation Forms": range(64256, 64336),
    "Arabic Presentation Forms-A": range(64336, 65024),
    "Variation Selectors": range(65024, 65040),
    "Vertical Forms": range(65040, 65056),
    "Combining Half Marks": range(65056, 65072),
    "CJK Compatibility Forms": range(65072, 65104),
    "Small Form Variants": range(65104, 65136),
    "Arabic Presentation Forms-B": range(65136, 65280),
    "Halfwidth and Fullwidth Forms": range(65280, 65520),
    "Specials": range(65520, 65536),
    "Linear B Syllabary": range(65536, 65664),
    "Linear B Ideograms": range(65664, 65792),
    "Aegean Numbers": range(65792, 65856),
    "Ancient Greek Numbers": range(65856, 65936),
    "Ancient Symbols": range(65936, 66000),
    "Phaistos Disc": range(66000, 66048),
    "Lycian": range(66176, 66208),
    "Carian": range(66208, 66272),
    "Coptic Epact Numbers": range(66272, 66304),
    "Old Italic": range(66304, 66352),
    "Gothic": range(66352, 66384),
    "Old Permic": range(66384, 66432),
    "Ugaritic": range(66432, 66464),
    "Old Persian": range(66464, 66528),
    "Deseret": range(66560, 66640),
    "Shavian": range(66640, 66688),
    "Osmanya": range(66688, 66736),
    "Osage": range(66736, 66816),
    "Elbasan": range(66816, 66864),
    "Caucasian Albanian": range(66864, 66928),
    "Vithkuqi": range(66928, 67008),
    "Linear A": range(67072, 67456),
    "Latin Extended-F": range(67456, 67520),
    "Cypriot Syllabary": range(67584, 67648),
    "Imperial Aramaic": range(67648, 67680),
    "Palmyrene": range(67680, 67712),
    "Nabataean": range(67712, 67760),
    "Hatran": range(67808, 67840),
    "Phoenician": range(67840, 67872),
    "Lydian": range(67872, 67904),
    "Meroitic Hieroglyphs": range(67968, 68000),
    "Meroitic Cursive": range(68000, 68096),
    "Kharoshthi": range(68096, 68192),
    "Old South Arabian": range(68192, 68224),
    "Old North Arabian": range(68224, 68256),
    "Manichaean": range(68288, 68352),
    "Avestan": range(68352, 68416),
    "Inscriptional Parthian": range(68416, 68448),
    "Inscriptional Pahlavi": range(68448, 68480),
    "Psalter Pahlavi": range(68480, 68528),
    "Old Turkic": range(68608, 68688),
    "Old Hungarian": range(68736, 68864),
    "Hanifi Rohingya": range(68864, 68928),
    "Rumi Numeral Symbols": range(69216, 69248),
    "Yezidi": range(69248, 69312),
    "Arabic Extended-C": range(69312, 69376),
    "Old Sogdian": range(69376, 69424),
    "Sogdian": range(69424, 69488),
    "Old Uyghur": range(69488, 69552),
    "Chorasmian": range(69552, 69600),
    "Elymaic": range(69600, 69632),
    "Brahmi": range(69632, 69760),
    "Kaithi": range(69760, 69840),
    "Sora Sompeng": range(69840, 69888),
    "Chakma": range(69888, 69968),
    "Mahajani": range(69968, 70016),
    "Sharada": range(70016, 70112),
    "Sinhala Archaic Numbers": range(70112, 70144),
    "Khojki": range(70144, 70224),
    "Multani": range(70272, 70320),
    "Khudawadi": range(70320, 70400),
    "Grantha": range(70400, 70528),
    "Newa": range(70656, 70784),
    "Tirhuta": range(70784, 70880),
    "Siddham": range(71040, 71168),
    "Modi": range(71168, 71264),
    "Mongolian Supplement": range(71264, 71296),
    "Takri": range(71296, 71376),
    "Ahom": range(71424, 71504),
    "Dogra": range(71680, 71760),
    "Warang Citi": range(71840, 71936),
    "Dives Akuru": range(71936, 72032),
    "Nandinagari": range(72096, 72192),
    "Zanabazar Square": range(72192, 72272),
    "Soyombo": range(72272, 72368),
    "Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
    "Pau Cin Hau": range(72384, 72448),
    "Devanagari Extended-A": range(72448, 72544),
    "Bhaiksuki": range(72704, 72816),
    "Marchen": range(72816, 72896),
    "Masaram Gondi": range(72960, 73056),
    "Gunjala Gondi": range(73056, 73136),
    "Makasar": range(73440, 73472),
    "Kawi": range(73472, 73568),
    "Lisu Supplement": range(73648, 73664),
    "Tamil Supplement": range(73664, 73728),
    "Cuneiform": range(73728, 74752),
    "Cuneiform Numbers and Punctuation": range(74752, 74880),
    "Early Dynastic Cuneiform": range(74880, 75088),
    "Cypro-Minoan": range(77712, 77824),
    "Egyptian Hieroglyphs": range(77824, 78896),
    "Egyptian Hieroglyph Format Controls": range(78896, 78944),
    "Anatolian Hieroglyphs": range(82944, 83584),
    "Bamum Supplement": range(92160, 92736),
    "Mro": range(92736, 92784),
    "Tangsa": range(92784, 92880),
    "Bassa Vah": range(92880, 92928),
    "Pahawh Hmong": range(92928, 93072),
    "Medefaidrin": range(93760, 93856),
    "Miao": range(93952, 94112),
    "Ideographic Symbols and Punctuation": range(94176, 94208),
    "Tangut": range(94208, 100352),
    "Tangut Components": range(100352, 101120),
    "Khitan Small Script": range(101120, 101632),
    "Tangut Supplement": range(101632, 101760),
    "Kana Extended-B": range(110576, 110592),
    "Kana Supplement": range(110592, 110848),
    "Kana Extended-A": range(110848, 110896),
    "Small Kana Extension": range(110896, 110960),
    "Nushu": range(110960, 111360),
    "Duployan": range(113664, 113824),
    "Shorthand Format Controls": range(113824, 113840),
    "Znamenny Musical Notation": range(118528, 118736),
    "Byzantine Musical Symbols": range(118784, 119040),
    "Musical Symbols": range(119040, 119296),
    "Ancient Greek Musical Notation": range(119296, 119376),
    "Kaktovik Numerals": range(119488, 119520),
    "Mayan Numerals": range(119520, 119552),
    "Tai Xuan Jing Symbols": range(119552, 119648),
    "Counting Rod Numerals": range(119648, 119680),
    "Mathematical Alphanumeric Symbols": range(119808, 120832),
    "Sutton SignWriting": range(120832, 121520),
    "Latin Extended-G": range(122624, 122880),
    "Glagolitic Supplement": range(122880, 122928),
    "Cyrillic Extended-D": range(122928, 123024),
    "Nyiakeng Puachue Hmong": range(123136, 123216),
    "Toto": range(123536, 123584),
    "Wancho": range(123584, 123648),
    "Nag Mundari": range(124112, 124160),
    "Ethiopic Extended-B": range(124896, 124928),
    "Mende Kikakui": range(124928, 125152),
    "Adlam": range(125184, 125280),
    "Indic Siyaq Numbers": range(126064, 126144),
    "Ottoman Siyaq Numbers": range(126208, 126288),
    "Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
    "Mahjong Tiles": range(126976, 127024),
    "Domino Tiles": range(127024, 127136),
    "Playing Cards": range(127136, 127232),
    "Enclosed Alphanumeric Supplement": range(127232, 127488),
    "Enclosed Ideographic Supplement": range(127488, 127744),
    "Miscellaneous Symbols and Pictographs": range(127744, 128512),
    "Emoticons range(Emoji)": range(128512, 128592),
    "Ornamental Dingbats": range(128592, 128640),
    "Transport and Map Symbols": range(128640, 128768),
    "Alchemical Symbols": range(128768, 128896),
    "Geometric Shapes Extended": range(128896, 129024),
    "Supplemental Arrows-C": range(129024, 129280),
    "Supplemental Symbols and Pictographs": range(129280, 129536),
    "Chess Symbols": range(129536, 129648),
    "Symbols and Pictographs Extended-A": range(129648, 129792),
    "Symbols for Legacy Computing": range(129792, 130048),
    "CJK Unified Ideographs Extension B": range(131072, 173792),
    "CJK Unified Ideographs Extension C": range(173824, 177984),
    "CJK Unified Ideographs Extension D": range(177984, 178208),
    "CJK Unified Ideographs Extension E": range(178208, 183984),
    "CJK Unified Ideographs Extension F": range(183984, 191472),
    "CJK Compatibility Ideographs Supplement": range(194560, 195104),
    "CJK Unified Ideographs Extension G": range(196608, 201552),
    "CJK Unified Ideographs Extension H": range(201552, 205744),
    "Tags": range(917504, 917632),
    "Variation Selectors Supplement": range(917760, 918000),
    "Supplementary Private Use Area-A": range(983040, 1048576),
    "Supplementary Private Use Area-B": range(1048576, 1114112),
}

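A linear lookup over this table maps a single character to its range name; charset_normalizer.utils.unicode_range does the same with caching. A minimal sketch:

from charset_normalizer.constant import UNICODE_RANGES_COMBINED

def lookup_range(character: str) -> str | None:
    # Find the Unicode block whose code-point range contains the character.
    code_point = ord(character)
    for range_name, code_range in UNICODE_RANGES_COMBINED.items():
        if code_point in code_range:
            return range_name
    return None

print(lookup_range("é"))   # Latin-1 Supplement
print(lookup_range("あ"))  # Hiragana
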
UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [
    "Supplement",
    "Extended",
    "Extensions",
    "Modifier",
    "Marks",
    "Punctuation",
    "Symbols",
    "Forms",
    "Operators",
    "Miscellaneous",
    "Drawing",
    "Block",
    "Shapes",
    "Supplemental",
    "Tags",
]

RE_POSSIBLE_ENCODING_INDICATION = re_compile(
    r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
    IGNORECASE,
)

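The pattern pulls a declared charset out of markup or headers, e.g.:

from charset_normalizer.constant import RE_POSSIBLE_ENCODING_INDICATION

sample = '<meta charset="ISO-8859-1">'
match = RE_POSSIBLE_ENCODING_INDICATION.search(sample)
print(match.group(1) if match else None)  # ISO-8859-1
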
IANA_NO_ALIASES = [
    "cp720",
    "cp737",
    "cp856",
    "cp874",
    "cp875",
    "cp1006",
    "koi8_r",
    "koi8_t",
    "koi8_u",
]

IANA_SUPPORTED: list[str] = sorted(
    filter(
        lambda x: x.endswith("_codec") is False
        and x not in {"rot_13", "tactis", "mbcs"},
        list(set(aliases.values())) + IANA_NO_ALIASES,
    )
)

IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)

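The resulting list is the candidate pool the detector iterates over; membership checks against it look like this:

from charset_normalizer.constant import IANA_SUPPORTED, IANA_SUPPORTED_COUNT

print(IANA_SUPPORTED_COUNT)        # number of codecs the detector may probe
print("cp1252" in IANA_SUPPORTED)  # True
print("rot_13" in IANA_SUPPORTED)  # False: explicitly excluded above
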
# Pre-computed code pages that are similar to one another, as measured by the cp_similarity function.
IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
    "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
    "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
    "cp1125": ["cp866"],
    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
    "cp1250": ["iso8859_2"],
    "cp1251": ["kz1048", "ptcp154"],
    "cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1253": ["iso8859_7"],
    "cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1257": ["iso8859_13"],
    "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
    "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
    "cp500": ["cp037", "cp1026", "cp1140", "cp273"],
    "cp850": ["cp437", "cp857", "cp858", "cp865"],
    "cp857": ["cp850", "cp858", "cp865"],
    "cp858": ["cp437", "cp850", "cp857", "cp865"],
    "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
    "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
    "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
    "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
    "cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
    "cp866": ["cp1125"],
    "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
    "iso8859_11": ["tis_620"],
    "iso8859_13": ["cp1257"],
    "iso8859_14": ["iso8859_10", "iso8859_15", "iso8859_16", "iso8859_3", "iso8859_9", "latin_1"],
    "iso8859_15": ["cp1252", "cp1254", "iso8859_10", "iso8859_14", "iso8859_16", "iso8859_3", "iso8859_9", "latin_1"],
    "iso8859_16": ["iso8859_14", "iso8859_15", "iso8859_2", "iso8859_3", "iso8859_9", "latin_1"],
    "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
    "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
    "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
    "iso8859_7": ["cp1253"],
    "iso8859_9": ["cp1252", "cp1254", "cp1258", "iso8859_10", "iso8859_14", "iso8859_15", "iso8859_16", "iso8859_3", "iso8859_4", "latin_1"],
    "kz1048": ["cp1251", "ptcp154"],
    "latin_1": ["cp1252", "cp1254", "cp1258", "iso8859_10", "iso8859_14", "iso8859_15", "iso8859_16", "iso8859_3", "iso8859_4", "iso8859_9"],
    "mac_iceland": ["mac_roman", "mac_turkish"],
    "mac_roman": ["mac_iceland", "mac_turkish"],
    "mac_turkish": ["mac_iceland", "mac_roman"],
    "ptcp154": ["cp1251", "kz1048"],
    "tis_620": ["iso8859_11"],
}

CHARDET_CORRESPONDENCE: dict[str, str] = {
    "iso2022_kr": "ISO-2022-KR",
    "iso2022_jp": "ISO-2022-JP",
    "euc_kr": "EUC-KR",
    "tis_620": "TIS-620",
    "utf_32": "UTF-32",
    "euc_jp": "EUC-JP",
    "koi8_r": "KOI8-R",
    "iso8859_1": "ISO-8859-1",
    "iso8859_2": "ISO-8859-2",
    "iso8859_5": "ISO-8859-5",
    "iso8859_6": "ISO-8859-6",
    "iso8859_7": "ISO-8859-7",
    "iso8859_8": "ISO-8859-8",
    "utf_16": "UTF-16",
    "cp855": "IBM855",
    "mac_cyrillic": "MacCyrillic",
    "gb2312": "GB2312",
    "gb18030": "GB18030",
    "cp932": "CP932",
    "cp866": "IBM866",
    "utf_8": "utf-8",
    "utf_8_sig": "UTF-8-SIG",
    "shift_jis": "SHIFT_JIS",
    "big5": "Big5",
    "cp1250": "windows-1250",
    "cp1251": "windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "windows-1253",
    "cp1255": "windows-1255",
    "cp1256": "windows-1256",
    "cp1254": "Windows-1254",
    "cp949": "CP949",
}

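This table translates internal codec names into the labels the legacy chardet API reports, e.g.:

from charset_normalizer.constant import CHARDET_CORRESPONDENCE

print(CHARDET_CORRESPONDENCE.get("cp1251"))     # windows-1251
print(CHARDET_CORRESPONDENCE.get("mac_greek"))  # None: no chardet equivalent
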
COMMON_SAFE_ASCII_CHARACTERS: set[str] = {
    "<", ">", "=", ":", "/", "&", ";", "{", "}",
    "[", "]", ",", "|", '"', "-", "(", ")",
}

# Sample character sets; replace with full lists if needed
COMMON_CHINESE_CHARACTERS = "的一是在不了有和人这中大为上个国我以要他时来用们生到作地于出就分对成会可主发年动同工也能下过子说产种面而方后多定行学法所民得经十三之进着等部度家电力里如水化高自二理起小物现实加量都两体制机当使点从业本去把性好应开它合还因由其些然前外天政四日那社义事平形相全表间样与关各重新线内数正心反你明看原又么利比或但质气第向道命此变条只没结解问意建月公无系军很情者最立代想已通并提直题党程展五果料象员革位入常文总次品式活设及管特件长求老头基资边流路级少图山统接知较将组见计别她手角期根论运农指几九区强放决西被干做必战先回则任取据处队南给色光门即保治北造百规热领七海口东导器压志世金增争济阶油思术极交受联什认六共权收证改清己美再采转更单风切打白教速花带安场身车例真务具万每目至达走积示议声报斗完类八离华名确才科张信马节话米整空元况今集温传土许步群广石记需段研界拉林律叫且究观越织装影算低持音众书布复容儿须际商非验连断深难近矿千周委素技备半办青省列习响约支般史感劳便团往酸历市克何除消构府太准精值号率族维划选标写存候毛亲快效斯院查江型眼王按格养易置派层片始却专状育厂京识适属圆包火住调满县局照参红细引听该铁价严龙飞"

COMMON_JAPANESE_CHARACTERS = "日一国年大十二本中長出三時行見月分後前生五間上東四今金九入学高円子外八六下来気小七山話女北午百書先名川千水半男西電校語土木聞食車何南万毎白天母火右読友左休父雨"

COMMON_KOREAN_CHARACTERS = "一二三四五六七八九十百千萬上下左右中人女子大小山川日月火水木金土父母天地國名年時文校學生"

# Combine all into a set
COMMON_CJK_CHARACTERS = set(
    "".join(
        [
            COMMON_CHINESE_CHARACTERS,
            COMMON_JAPANESE_CHARACTERS,
            COMMON_KOREAN_CHARACTERS,
        ]
    )
)

KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"}
ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"}

# Logging LEVEL below DEBUG
TRACE: int = 5

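Given the definitions above, membership in the combined set is a cheap way to flag a frequent CJK character:

from charset_normalizer.constant import COMMON_CJK_CHARACTERS

print("的" in COMMON_CJK_CHARACTERS)  # True: among the most frequent Chinese characters
print("a" in COMMON_CJK_CHARACTERS)   # False
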
# Language labels that contain the em dash "—" character are alternative
# frequency sequences of the base language
FREQUENCIES: dict[str, list[str]] = {
    "English": [
        "e", "a", "t", "i", "o", "n", "s", "r", "h", "l", "d", "c", "u",
        "m", "f", "p", "g", "w", "y", "b", "v", "k", "x", "j", "z", "q",
    ],
    "English—": [
        "e", "a", "t", "i", "o", "n", "s", "r", "h", "l", "d", "c", "m",
        "u", "f", "p", "g", "w", "b", "y", "v", "k", "j", "x", "z", "q",
    ],
    "German": [
        "e", "n", "i", "r", "s", "t", "a", "d", "h", "u", "l", "g", "o",
        "c", "m", "b", "f", "k", "w", "z", "p", "v", "ü", "ä", "ö", "j",
    ],
    "French": [
        "e", "a", "s", "n", "i", "t", "r", "l", "u", "o", "d", "c", "p",
        "m", "é", "v", "g", "f", "b", "h", "q", "à", "x", "è", "y", "j",
    ],
    "Dutch": [
        "e", "n", "a", "i", "r", "t", "o", "d", "s", "l", "g", "h", "v",
        "m", "u", "k", "c", "p", "b", "w", "j", "z", "f", "y", "x", "ë",
    ],
    "Italian": [
        "e", "i", "a", "o", "n", "l", "t", "r", "s", "c", "d", "u", "p",
        "m", "g", "v", "f", "b", "z", "h", "q", "è", "à", "k", "y", "ò",
    ],
    "Polish": [
        "a", "i", "o", "e", "n", "r", "z", "w", "s", "c", "t", "k", "y",
        "d", "p", "m", "u", "l", "j", "ł", "g", "b", "h", "ą", "ę", "ó",
    ],
    "Spanish": [
        "e", "a", "o", "n", "s", "r", "i", "l", "d", "t", "c", "u", "m",
        "p", "b", "g", "v", "f", "y", "ó", "h", "q", "í", "j", "z", "á",
    ],
    "Russian": [
        "о", "а", "е", "и", "н", "с", "т", "р", "в", "л", "к", "м", "д",
        "п", "у", "г", "я", "ы", "з", "б", "й", "ь", "ч", "х", "ж", "ц",
    ],
    # Jap-Kanji
    "Japanese": [
        "人", "一", "大", "亅", "丁", "丨", "竹", "笑", "口", "日", "今", "二", "彳",
        "行", "十", "土", "丶", "寸", "寺", "時", "乙", "丿", "乂", "气", "気", "冂",
        "巾", "亠", "市", "目", "儿", "見", "八", "小", "凵", "県", "月", "彐", "門",
        "間", "木", "東", "山", "出", "本", "中", "刀", "分", "耳", "又", "取", "最",
        "言", "田", "心", "思", "刂", "前", "京", "尹", "事", "生", "厶", "云", "会",
        "未", "来", "白", "冫", "楽", "灬", "馬", "尸", "尺", "駅", "明", "耂", "者",
        "了", "阝", "都", "高", "卜", "占", "厂", "广", "店", "子", "申", "奄", "亻",
        "俺", "上", "方", "冖", "学", "衣", "艮", "食", "自",
    ],
    # Jap-Katakana
    "Japanese—": [
        "ー", "ン", "ス", "・", "ル", "ト", "リ", "イ", "ア", "ラ", "ッ", "ク", "ド",
        "シ", "レ", "ジ", "タ", "フ", "ロ", "カ", "テ", "マ", "ィ", "グ", "バ", "ム",
        "プ", "オ", "コ", "デ", "ニ", "ウ", "メ", "サ", "ビ", "ナ", "ブ", "ャ", "エ",
        "ュ", "チ", "キ", "ズ", "ダ", "パ", "ミ", "ェ", "ョ", "ハ", "セ", "ベ", "ガ",
        "モ", "ツ", "ネ", "ボ", "ソ", "ノ", "ァ", "ヴ", "ワ", "ポ", "ペ", "ピ", "ケ",
        "ゴ", "ギ", "ザ", "ホ", "ゲ", "ォ", "ヤ", "ヒ", "ユ", "ヨ", "ヘ", "ゼ", "ヌ",
        "ゥ", "ゾ", "ヶ", "ヂ", "ヲ", "ヅ", "ヵ", "ヱ", "ヰ", "ヮ", "ヽ", "゠", "ヾ",
        "ヷ", "ヿ", "ヸ", "ヹ", "ヺ",
    ],
    # Jap-Hiragana
    "Japanese——": [
        "の", "に", "る", "た", "と", "は", "し", "い", "を", "で", "て", "が", "な",
        "れ", "か", "ら", "さ", "っ", "り", "す", "あ", "も", "こ", "ま", "う", "く",
        "よ", "き", "ん", "め", "お", "け", "そ", "つ", "だ", "や", "え", "ど", "わ",
        "ち", "み", "せ", "じ", "ば", "へ", "び", "ず", "ろ", "ほ", "げ", "む", "べ",
        "ひ", "ょ", "ゆ", "ぶ", "ご", "ゃ", "ね", "ふ", "ぐ", "ぎ", "ぼ", "ゅ", "づ",
        "ざ", "ぞ", "ぬ", "ぜ", "ぱ", "ぽ", "ぷ", "ぴ", "ぃ", "ぁ", "ぇ", "ぺ", "ゞ",
        "ぢ", "ぉ", "ぅ", "ゐ", "ゝ", "ゑ", "゛", "゜", "ゎ", "ゔ", "゚", "ゟ", "゙",
        "ゕ", "ゖ",
    ],
    "Portuguese": [
        "a", "e", "o", "s", "i", "r", "d", "n", "t", "m", "u", "c", "l",
        "p", "g", "v", "b", "f", "h", "ã", "q", "é", "ç", "á", "z", "í",
    ],
    "Swedish": [
        "e", "a", "n", "r", "t", "s", "i", "l", "d", "o", "m", "k", "g",
        "v", "h", "f", "u", "p", "ä", "c", "b", "ö", "å", "y", "j", "x",
    ],
    "Chinese": [
        "的", "一", "是", "不", "了", "在", "人", "有", "我", "他", "这", "个", "们",
        "中", "来", "上", "大", "为", "和", "国", "地", "到", "以", "说", "时", "要",
        "就", "出", "会", "可", "也", "你", "对", "生", "能", "而", "子", "那", "得",
        "于", "着", "下", "自", "之", "年", "过", "发", "后", "作", "里", "用", "道",
        "行", "所", "然", "家", "种", "事", "成", "方", "多", "经", "么", "去", "法",
        "学", "如", "都", "同", "现", "当", "没", "动", "面", "起", "看", "定", "天",
        "分", "还", "进", "好", "小", "部", "其", "些", "主", "样", "理", "心", "她",
        "本", "前", "开", "但", "因", "只", "从", "想", "实",
    ],
    "Ukrainian": [
        "о", "а", "н", "і", "и", "р", "в", "т", "е", "с", "к", "л", "у",
        "д", "м", "п", "з", "я", "ь", "б", "г", "й", "ч", "х", "ц", "ї",
    ],
    "Norwegian": [
        "e", "r", "n", "t", "a", "s", "i", "o", "l", "d", "g", "k", "m",
        "v", "f", "p", "u", "b", "h", "å", "y", "j", "ø", "c", "æ", "w",
    ],
    "Finnish": [
        "a", "i", "n", "t", "e", "s", "l", "o", "u", "k", "ä", "m", "r",
        "v", "j", "h", "p", "y", "d", "ö", "g", "c", "b", "f", "w", "z",
    ],
    "Vietnamese": [
        "n", "h", "t", "i", "c", "g", "a", "o", "u", "m", "l", "r", "à",
        "đ", "s", "e", "v", "p", "b", "y", "ư", "d", "á", "k", "ộ", "ế",
    ],
    "Czech": [
        "o", "e", "a", "n", "t", "s", "i", "l", "v", "r", "k", "d", "u",
        "m", "p", "í", "c", "h", "z", "á", "y", "j", "b", "ě", "é", "ř",
    ],
    "Hungarian": [
        "e", "a", "t", "l", "s", "n", "k", "r", "i", "o", "z", "á", "é",
        "g", "m", "b", "y", "v", "d", "h", "u", "p", "j", "ö", "f", "c",
    ],
    "Korean": [
        "이", "다", "에", "의", "는", "로", "하", "을", "가",
+
"고",
|
| 1467 |
+
"지",
|
| 1468 |
+
"서",
|
| 1469 |
+
"한",
|
| 1470 |
+
"은",
|
| 1471 |
+
"기",
|
| 1472 |
+
"으",
|
| 1473 |
+
"년",
|
| 1474 |
+
"대",
|
| 1475 |
+
"사",
|
| 1476 |
+
"시",
|
| 1477 |
+
"를",
|
| 1478 |
+
"리",
|
| 1479 |
+
"도",
|
| 1480 |
+
"인",
|
| 1481 |
+
"스",
|
| 1482 |
+
"일",
|
| 1483 |
+
],
|
| 1484 |
+
"Indonesian": [
|
| 1485 |
+
"a",
|
| 1486 |
+
"n",
|
| 1487 |
+
"e",
|
| 1488 |
+
"i",
|
| 1489 |
+
"r",
|
| 1490 |
+
"t",
|
| 1491 |
+
"u",
|
| 1492 |
+
"s",
|
| 1493 |
+
"d",
|
| 1494 |
+
"k",
|
| 1495 |
+
"m",
|
| 1496 |
+
"l",
|
| 1497 |
+
"g",
|
| 1498 |
+
"p",
|
| 1499 |
+
"b",
|
| 1500 |
+
"o",
|
| 1501 |
+
"h",
|
| 1502 |
+
"y",
|
| 1503 |
+
"j",
|
| 1504 |
+
"c",
|
| 1505 |
+
"w",
|
| 1506 |
+
"f",
|
| 1507 |
+
"v",
|
| 1508 |
+
"z",
|
| 1509 |
+
"x",
|
| 1510 |
+
"q",
|
| 1511 |
+
],
|
| 1512 |
+
"Turkish": [
|
| 1513 |
+
"a",
|
| 1514 |
+
"e",
|
| 1515 |
+
"i",
|
| 1516 |
+
"n",
|
| 1517 |
+
"r",
|
| 1518 |
+
"l",
|
| 1519 |
+
"ı",
|
| 1520 |
+
"k",
|
| 1521 |
+
"d",
|
| 1522 |
+
"t",
|
| 1523 |
+
"s",
|
| 1524 |
+
"m",
|
| 1525 |
+
"y",
|
| 1526 |
+
"u",
|
| 1527 |
+
"o",
|
| 1528 |
+
"b",
|
| 1529 |
+
"ü",
|
| 1530 |
+
"ş",
|
| 1531 |
+
"v",
|
| 1532 |
+
"g",
|
| 1533 |
+
"z",
|
| 1534 |
+
"h",
|
| 1535 |
+
"c",
|
| 1536 |
+
"p",
|
| 1537 |
+
"ç",
|
| 1538 |
+
"ğ",
|
| 1539 |
+
],
|
| 1540 |
+
"Romanian": [
|
| 1541 |
+
"e",
|
| 1542 |
+
"i",
|
| 1543 |
+
"a",
|
| 1544 |
+
"r",
|
| 1545 |
+
"n",
|
| 1546 |
+
"t",
|
| 1547 |
+
"u",
|
| 1548 |
+
"l",
|
| 1549 |
+
"o",
|
| 1550 |
+
"c",
|
| 1551 |
+
"s",
|
| 1552 |
+
"d",
|
| 1553 |
+
"p",
|
| 1554 |
+
"m",
|
| 1555 |
+
"ă",
|
| 1556 |
+
"f",
|
| 1557 |
+
"v",
|
| 1558 |
+
"î",
|
| 1559 |
+
"g",
|
| 1560 |
+
"b",
|
| 1561 |
+
"ș",
|
| 1562 |
+
"ț",
|
| 1563 |
+
"z",
|
| 1564 |
+
"h",
|
| 1565 |
+
"â",
|
| 1566 |
+
"j",
|
| 1567 |
+
],
|
| 1568 |
+
"Farsi": [
|
| 1569 |
+
"ا",
|
| 1570 |
+
"ی",
|
| 1571 |
+
"ر",
|
| 1572 |
+
"د",
|
| 1573 |
+
"ن",
|
| 1574 |
+
"ه",
|
| 1575 |
+
"و",
|
| 1576 |
+
"م",
|
| 1577 |
+
"ت",
|
| 1578 |
+
"ب",
|
| 1579 |
+
"س",
|
| 1580 |
+
"ل",
|
| 1581 |
+
"ک",
|
| 1582 |
+
"ش",
|
| 1583 |
+
"ز",
|
| 1584 |
+
"ف",
|
| 1585 |
+
"گ",
|
| 1586 |
+
"ع",
|
| 1587 |
+
"خ",
|
| 1588 |
+
"ق",
|
| 1589 |
+
"ج",
|
| 1590 |
+
"آ",
|
| 1591 |
+
"پ",
|
| 1592 |
+
"ح",
|
| 1593 |
+
"ط",
|
| 1594 |
+
"ص",
|
| 1595 |
+
],
|
| 1596 |
+
"Arabic": [
|
| 1597 |
+
"ا",
|
| 1598 |
+
"ل",
|
| 1599 |
+
"ي",
|
| 1600 |
+
"م",
|
| 1601 |
+
"و",
|
| 1602 |
+
"ن",
|
| 1603 |
+
"ر",
|
| 1604 |
+
"ت",
|
| 1605 |
+
"ب",
|
| 1606 |
+
"ة",
|
| 1607 |
+
"ع",
|
| 1608 |
+
"د",
|
| 1609 |
+
"س",
|
| 1610 |
+
"ف",
|
| 1611 |
+
"ه",
|
| 1612 |
+
"ك",
|
| 1613 |
+
"ق",
|
| 1614 |
+
"أ",
|
| 1615 |
+
"ح",
|
| 1616 |
+
"ج",
|
| 1617 |
+
"ش",
|
| 1618 |
+
"ط",
|
| 1619 |
+
"ص",
|
| 1620 |
+
"ى",
|
| 1621 |
+
"خ",
|
| 1622 |
+
"إ",
|
| 1623 |
+
],
|
| 1624 |
+
"Danish": [
|
| 1625 |
+
"e",
|
| 1626 |
+
"r",
|
| 1627 |
+
"n",
|
| 1628 |
+
"t",
|
| 1629 |
+
"a",
|
| 1630 |
+
"i",
|
| 1631 |
+
"s",
|
| 1632 |
+
"d",
|
| 1633 |
+
"l",
|
| 1634 |
+
"o",
|
| 1635 |
+
"g",
|
| 1636 |
+
"m",
|
| 1637 |
+
"k",
|
| 1638 |
+
"f",
|
| 1639 |
+
"v",
|
| 1640 |
+
"u",
|
| 1641 |
+
"b",
|
| 1642 |
+
"h",
|
| 1643 |
+
"p",
|
| 1644 |
+
"å",
|
| 1645 |
+
"y",
|
| 1646 |
+
"ø",
|
| 1647 |
+
"æ",
|
| 1648 |
+
"c",
|
| 1649 |
+
"j",
|
| 1650 |
+
"w",
|
| 1651 |
+
],
|
| 1652 |
+
"Serbian": [
|
| 1653 |
+
"а",
|
| 1654 |
+
"и",
|
| 1655 |
+
"о",
|
| 1656 |
+
"е",
|
| 1657 |
+
"н",
|
| 1658 |
+
"р",
|
| 1659 |
+
"с",
|
| 1660 |
+
"у",
|
| 1661 |
+
"т",
|
| 1662 |
+
"к",
|
| 1663 |
+
"ј",
|
| 1664 |
+
"в",
|
| 1665 |
+
"д",
|
| 1666 |
+
"м",
|
| 1667 |
+
"п",
|
| 1668 |
+
"л",
|
| 1669 |
+
"г",
|
| 1670 |
+
"з",
|
| 1671 |
+
"б",
|
| 1672 |
+
"a",
|
| 1673 |
+
"i",
|
| 1674 |
+
"e",
|
| 1675 |
+
"o",
|
| 1676 |
+
"n",
|
| 1677 |
+
"ц",
|
| 1678 |
+
"ш",
|
| 1679 |
+
],
|
| 1680 |
+
"Lithuanian": [
|
| 1681 |
+
"i",
|
| 1682 |
+
"a",
|
| 1683 |
+
"s",
|
| 1684 |
+
"o",
|
| 1685 |
+
"r",
|
| 1686 |
+
"e",
|
| 1687 |
+
"t",
|
| 1688 |
+
"n",
|
| 1689 |
+
"u",
|
| 1690 |
+
"k",
|
| 1691 |
+
"m",
|
| 1692 |
+
"l",
|
| 1693 |
+
"p",
|
| 1694 |
+
"v",
|
| 1695 |
+
"d",
|
| 1696 |
+
"j",
|
| 1697 |
+
"g",
|
| 1698 |
+
"ė",
|
| 1699 |
+
"b",
|
| 1700 |
+
"y",
|
| 1701 |
+
"ų",
|
| 1702 |
+
"š",
|
| 1703 |
+
"ž",
|
| 1704 |
+
"c",
|
| 1705 |
+
"ą",
|
| 1706 |
+
"į",
|
| 1707 |
+
],
|
| 1708 |
+
"Slovene": [
|
| 1709 |
+
"e",
|
| 1710 |
+
"a",
|
| 1711 |
+
"i",
|
| 1712 |
+
"o",
|
| 1713 |
+
"n",
|
| 1714 |
+
"r",
|
| 1715 |
+
"s",
|
| 1716 |
+
"l",
|
| 1717 |
+
"t",
|
| 1718 |
+
"j",
|
| 1719 |
+
"v",
|
| 1720 |
+
"k",
|
| 1721 |
+
"d",
|
| 1722 |
+
"p",
|
| 1723 |
+
"m",
|
| 1724 |
+
"u",
|
| 1725 |
+
"z",
|
| 1726 |
+
"b",
|
| 1727 |
+
"g",
|
| 1728 |
+
"h",
|
| 1729 |
+
"č",
|
| 1730 |
+
"c",
|
| 1731 |
+
"š",
|
| 1732 |
+
"ž",
|
| 1733 |
+
"f",
|
| 1734 |
+
"y",
|
| 1735 |
+
],
|
| 1736 |
+
"Slovak": [
|
| 1737 |
+
"o",
|
| 1738 |
+
"a",
|
| 1739 |
+
"e",
|
| 1740 |
+
"n",
|
| 1741 |
+
"i",
|
| 1742 |
+
"r",
|
| 1743 |
+
"v",
|
| 1744 |
+
"t",
|
| 1745 |
+
"s",
|
| 1746 |
+
"l",
|
| 1747 |
+
"k",
|
| 1748 |
+
"d",
|
| 1749 |
+
"m",
|
| 1750 |
+
"p",
|
| 1751 |
+
"u",
|
| 1752 |
+
"c",
|
| 1753 |
+
"h",
|
| 1754 |
+
"j",
|
| 1755 |
+
"b",
|
| 1756 |
+
"z",
|
| 1757 |
+
"á",
|
| 1758 |
+
"y",
|
| 1759 |
+
"ý",
|
| 1760 |
+
"í",
|
| 1761 |
+
"č",
|
| 1762 |
+
"é",
|
| 1763 |
+
],
|
| 1764 |
+
"Hebrew": [
|
| 1765 |
+
"י",
|
| 1766 |
+
"ו",
|
| 1767 |
+
"ה",
|
| 1768 |
+
"ל",
|
| 1769 |
+
"ר",
|
| 1770 |
+
"ב",
|
| 1771 |
+
"ת",
|
| 1772 |
+
"מ",
|
| 1773 |
+
"א",
|
| 1774 |
+
"ש",
|
| 1775 |
+
"נ",
|
| 1776 |
+
"ע",
|
| 1777 |
+
"ם",
|
| 1778 |
+
"ד",
|
| 1779 |
+
"ק",
|
| 1780 |
+
"ח",
|
| 1781 |
+
"פ",
|
| 1782 |
+
"ס",
|
| 1783 |
+
"כ",
|
| 1784 |
+
"ג",
|
| 1785 |
+
"ט",
|
| 1786 |
+
"צ",
|
| 1787 |
+
"ן",
|
| 1788 |
+
"ז",
|
| 1789 |
+
"ך",
|
| 1790 |
+
],
|
| 1791 |
+
"Bulgarian": [
|
| 1792 |
+
"а",
|
| 1793 |
+
"и",
|
| 1794 |
+
"о",
|
| 1795 |
+
"е",
|
| 1796 |
+
"н",
|
| 1797 |
+
"т",
|
| 1798 |
+
"р",
|
| 1799 |
+
"с",
|
| 1800 |
+
"в",
|
| 1801 |
+
"л",
|
| 1802 |
+
"к",
|
| 1803 |
+
"д",
|
| 1804 |
+
"п",
|
| 1805 |
+
"м",
|
| 1806 |
+
"з",
|
| 1807 |
+
"г",
|
| 1808 |
+
"я",
|
| 1809 |
+
"ъ",
|
| 1810 |
+
"у",
|
| 1811 |
+
"б",
|
| 1812 |
+
"ч",
|
| 1813 |
+
"ц",
|
| 1814 |
+
"й",
|
| 1815 |
+
"ж",
|
| 1816 |
+
"щ",
|
| 1817 |
+
"х",
|
| 1818 |
+
],
|
| 1819 |
+
"Croatian": [
|
| 1820 |
+
"a",
|
| 1821 |
+
"i",
|
| 1822 |
+
"o",
|
| 1823 |
+
"e",
|
| 1824 |
+
"n",
|
| 1825 |
+
"r",
|
| 1826 |
+
"j",
|
| 1827 |
+
"s",
|
| 1828 |
+
"t",
|
| 1829 |
+
"u",
|
| 1830 |
+
"k",
|
| 1831 |
+
"l",
|
| 1832 |
+
"v",
|
| 1833 |
+
"d",
|
| 1834 |
+
"m",
|
| 1835 |
+
"p",
|
| 1836 |
+
"g",
|
| 1837 |
+
"z",
|
| 1838 |
+
"b",
|
| 1839 |
+
"c",
|
| 1840 |
+
"č",
|
| 1841 |
+
"h",
|
| 1842 |
+
"š",
|
| 1843 |
+
"ž",
|
| 1844 |
+
"ć",
|
| 1845 |
+
"f",
|
| 1846 |
+
],
|
| 1847 |
+
"Hindi": [
|
| 1848 |
+
"क",
|
| 1849 |
+
"र",
|
| 1850 |
+
"स",
|
| 1851 |
+
"न",
|
| 1852 |
+
"त",
|
| 1853 |
+
"म",
|
| 1854 |
+
"ह",
|
| 1855 |
+
"प",
|
| 1856 |
+
"य",
|
| 1857 |
+
"ल",
|
| 1858 |
+
"व",
|
| 1859 |
+
"ज",
|
| 1860 |
+
"द",
|
| 1861 |
+
"ग",
|
| 1862 |
+
"ब",
|
| 1863 |
+
"श",
|
| 1864 |
+
"ट",
|
| 1865 |
+
"अ",
|
| 1866 |
+
"ए",
|
| 1867 |
+
"थ",
|
| 1868 |
+
"भ",
|
| 1869 |
+
"ड",
|
| 1870 |
+
"च",
|
| 1871 |
+
"ध",
|
| 1872 |
+
"ष",
|
| 1873 |
+
"इ",
|
| 1874 |
+
],
|
| 1875 |
+
"Estonian": [
|
| 1876 |
+
"a",
|
| 1877 |
+
"i",
|
| 1878 |
+
"e",
|
| 1879 |
+
"s",
|
| 1880 |
+
"t",
|
| 1881 |
+
"l",
|
| 1882 |
+
"u",
|
| 1883 |
+
"n",
|
| 1884 |
+
"o",
|
| 1885 |
+
"k",
|
| 1886 |
+
"r",
|
| 1887 |
+
"d",
|
| 1888 |
+
"m",
|
| 1889 |
+
"v",
|
| 1890 |
+
"g",
|
| 1891 |
+
"p",
|
| 1892 |
+
"j",
|
| 1893 |
+
"h",
|
| 1894 |
+
"ä",
|
| 1895 |
+
"b",
|
| 1896 |
+
"õ",
|
| 1897 |
+
"ü",
|
| 1898 |
+
"f",
|
| 1899 |
+
"c",
|
| 1900 |
+
"ö",
|
| 1901 |
+
"y",
|
| 1902 |
+
],
|
| 1903 |
+
"Thai": [
|
| 1904 |
+
"า",
|
| 1905 |
+
"น",
|
| 1906 |
+
"ร",
|
| 1907 |
+
"อ",
|
| 1908 |
+
"ก",
|
| 1909 |
+
"เ",
|
| 1910 |
+
"ง",
|
| 1911 |
+
"ม",
|
| 1912 |
+
"ย",
|
| 1913 |
+
"ล",
|
| 1914 |
+
"ว",
|
| 1915 |
+
"ด",
|
| 1916 |
+
"ท",
|
| 1917 |
+
"ส",
|
| 1918 |
+
"ต",
|
| 1919 |
+
"ะ",
|
| 1920 |
+
"ป",
|
| 1921 |
+
"บ",
|
| 1922 |
+
"ค",
|
| 1923 |
+
"ห",
|
| 1924 |
+
"แ",
|
| 1925 |
+
"จ",
|
| 1926 |
+
"พ",
|
| 1927 |
+
"ช",
|
| 1928 |
+
"ข",
|
| 1929 |
+
"ใ",
|
| 1930 |
+
],
|
| 1931 |
+
"Greek": [
|
| 1932 |
+
"α",
|
| 1933 |
+
"τ",
|
| 1934 |
+
"ο",
|
| 1935 |
+
"ι",
|
| 1936 |
+
"ε",
|
| 1937 |
+
"ν",
|
| 1938 |
+
"ρ",
|
| 1939 |
+
"σ",
|
| 1940 |
+
"κ",
|
| 1941 |
+
"η",
|
| 1942 |
+
"π",
|
| 1943 |
+
"ς",
|
| 1944 |
+
"υ",
|
| 1945 |
+
"μ",
|
| 1946 |
+
"λ",
|
| 1947 |
+
"ί",
|
| 1948 |
+
"ό",
|
| 1949 |
+
"ά",
|
| 1950 |
+
"γ",
|
| 1951 |
+
"έ",
|
| 1952 |
+
"δ",
|
| 1953 |
+
"ή",
|
| 1954 |
+
"ω",
|
| 1955 |
+
"χ",
|
| 1956 |
+
"θ",
|
| 1957 |
+
"ύ",
|
| 1958 |
+
],
|
| 1959 |
+
"Tamil": [
|
| 1960 |
+
"க",
|
| 1961 |
+
"த",
|
| 1962 |
+
"ப",
|
| 1963 |
+
"ட",
|
| 1964 |
+
"ர",
|
| 1965 |
+
"ம",
|
| 1966 |
+
"ல",
|
| 1967 |
+
"ன",
|
| 1968 |
+
"வ",
|
| 1969 |
+
"ற",
|
| 1970 |
+
"ய",
|
| 1971 |
+
"ள",
|
| 1972 |
+
"ச",
|
| 1973 |
+
"ந",
|
| 1974 |
+
"இ",
|
| 1975 |
+
"ண",
|
| 1976 |
+
"அ",
|
| 1977 |
+
"ஆ",
|
| 1978 |
+
"ழ",
|
| 1979 |
+
"ங",
|
| 1980 |
+
"எ",
|
| 1981 |
+
"உ",
|
| 1982 |
+
"ஒ",
|
| 1983 |
+
"ஸ",
|
| 1984 |
+
],
|
| 1985 |
+
"Kazakh": [
|
| 1986 |
+
"а",
|
| 1987 |
+
"ы",
|
| 1988 |
+
"е",
|
| 1989 |
+
"н",
|
| 1990 |
+
"т",
|
| 1991 |
+
"р",
|
| 1992 |
+
"л",
|
| 1993 |
+
"і",
|
| 1994 |
+
"д",
|
| 1995 |
+
"с",
|
| 1996 |
+
"м",
|
| 1997 |
+
"қ",
|
| 1998 |
+
"к",
|
| 1999 |
+
"о",
|
| 2000 |
+
"б",
|
| 2001 |
+
"и",
|
| 2002 |
+
"у",
|
| 2003 |
+
"ғ",
|
| 2004 |
+
"ж",
|
| 2005 |
+
"ң",
|
| 2006 |
+
"з",
|
| 2007 |
+
"ш",
|
| 2008 |
+
"й",
|
| 2009 |
+
"п",
|
| 2010 |
+
"г",
|
| 2011 |
+
"ө",
|
| 2012 |
+
],
|
| 2013 |
+
}
|
| 2014 |
+
|
| 2015 |
+
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
|
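FREQUENCIES maps each supported language to its most frequent characters, in descending order, and cd.py consults this table when scoring how plausible a decoded text is for a given language. A minimal sketch of that idea, assuming the package above is importable; naive_language_affinity is a hypothetical helper written for illustration, not part of the charset_normalizer API, and only a rough stand-in for the real coherence computation:

from collections import Counter

from charset_normalizer.constant import FREQUENCIES, LANGUAGE_SUPPORTED_COUNT


def naive_language_affinity(text: str, language: str) -> float:
    # Share of the text's most common letters that also appear in the
    # language's frequency table (hypothetical helper, illustration only).
    popular = [ch for ch, _ in Counter(text.lower()).most_common() if ch.isalpha()][:26]
    if not popular:
        return 0.0
    table = set(FREQUENCIES[language])
    return sum(1 for ch in popular if ch in table) / len(popular)


print(LANGUAGE_SUPPORTED_COUNT)  # number of languages present in the table
print(naive_language_affinity("O rato roeu a roupa do rei de Roma.", "Portuguese"))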
venv/lib/python3.13/site-packages/charset_normalizer/legacy.py
ADDED
@@ -0,0 +1,80 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any
from warnings import warn

from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE

# TODO: remove this check when dropping Python 3.7 support
if TYPE_CHECKING:
    from typing_extensions import TypedDict

    class ResultDict(TypedDict):
        encoding: str | None
        language: str
        confidence: float | None


def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding names will match Chardet's own naming whenever possible (not for encoding names it does not support).
    This function is deprecated and should only be used to migrate your project easily; consult the documentation for
    further information. Not planned for removal.

    :param byte_str:              The byte sequence to examine.
    :param should_rename_legacy:  Should we rename legacy encodings
                                  to their more modern equivalents?
    """
    if len(kwargs):
        warn(
            f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    r = from_bytes(byte_str).best()

    encoding = r.encoding if r is not None else None
    language = r.language if r is not None and r.language != "Unknown" else ""
    confidence = 1.0 - r.chaos if r is not None else None

    # automatically lower confidence
    # on small bytes samples.
    # https://github.com/jawah/charset_normalizer/issues/391
    if (
        confidence is not None
        and confidence >= 0.9
        and encoding
        not in {
            "utf_8",
            "ascii",
        }
        and r.bom is False  # type: ignore[union-attr]
        and len(byte_str) < TOO_SMALL_SEQUENCE
    ):
        confidence -= 0.2

    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
    # but chardet does return 'utf-8-sig' and it is a valid codec name.
    if r is not None and encoding == "utf_8" and r.bom:
        encoding += "_sig"

    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
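The legacy detect() above mirrors chardet's API: it returns a dict with encoding, language, and confidence keys, where confidence is derived as 1.0 minus the mess ratio of the best match. A quick usage sketch, assuming the package is installed; the exact values printed depend on the input and library version:

from charset_normalizer.legacy import detect

payload = "Bonjour, ceci est un essai accentué : français.".encode("utf-8")
result = detect(payload)

# A chardet-style mapping, e.g. {'encoding': 'utf-8', 'language': 'French', 'confidence': 0.99}
print(result["encoding"], result["language"], result["confidence"])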
venv/lib/python3.13/site-packages/charset_normalizer/md.cpython-313-x86_64-linux-gnu.so
ADDED
Binary file (15.9 kB)
venv/lib/python3.13/site-packages/charset_normalizer/md.py
ADDED
@@ -0,0 +1,635 @@
from __future__ import annotations

from functools import lru_cache
from logging import getLogger

from .constant import (
    COMMON_SAFE_ASCII_CHARACTERS,
    TRACE,
    UNICODE_SECONDARY_RANGE_KEYWORD,
)
from .utils import (
    is_accentuated,
    is_arabic,
    is_arabic_isolated_form,
    is_case_variable,
    is_cjk,
    is_emoticon,
    is_hangul,
    is_hiragana,
    is_katakana,
    is_latin,
    is_punctuation,
    is_separator,
    is_symbol,
    is_thai,
    is_unprintable,
    remove_accent,
    unicode_range,
    is_cjk_uncommon,
)


class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if the given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Reset the plugin to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        """
        raise NotImplementedError  # pragma: nocover


class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: str | None = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # Abstract
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0


class TooManyAccentuatedPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        ratio_of_accentuation: float = self._accentuated_count / self._character_count
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0


class UnprintablePlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        self._unprintable_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count


class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if (
            self._last_latin_character is not None
            and is_accentuated(character)
            and is_accentuated(self._last_latin_character)
        ):
            if character.isupper() and self._last_latin_character.isupper():
                self._successive_count += 1
            # Worse if it's the same char duplicated with different accent.
            if remove_accent(character) == remove_accent(self._last_latin_character):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # Abstract
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count


class SuspiciousRange(MessDetectorPlugin):
    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        if self._last_printable_seen is None:
            self._last_printable_seen = character
            return

        unicode_range_a: str | None = unicode_range(self._last_printable_seen)
        unicode_range_b: str | None = unicode_range(character)

        if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        if self._character_count <= 13:
            return 0.0

        ratio_of_suspicious_range_usage: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

        return ratio_of_suspicious_range_usage


class SuperWeirdWordPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer: str = ""
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            if (
                is_cjk(character)
                or is_hangul(character)
                or is_katakana(character)
                or is_hiragana(character)
                or is_thai(character)
            ):
                self._buffer_glyph_count += 1
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length >= 0.5:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                elif (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                elif self._buffer_glyph_count == 1:
                    self._is_current_word_bad = True
                    self._foreign_long_count += 1
            if buffer_length >= 24 and self._foreign_long_watch:
                camel_case_dst = [
                    i
                    for c, i in zip(self._buffer, range(0, buffer_length))
                    if c.isupper()
                ]
                probable_camel_cased: bool = False

                if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                    probable_camel_cased = True

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
            self._buffer_glyph_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # Abstract
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count


class CjkUncommonPlugin(MessDetectorPlugin):
    """
    Detect messy CJK text that probably means nothing.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._uncommon_count: int = 0

    def eligible(self, character: str) -> bool:
        return is_cjk(character)

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_cjk_uncommon(character):
            self._uncommon_count += 1
            return

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._uncommon_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        uncommon_form_usage: float = self._uncommon_count / self._character_count

        # We can be pretty sure it's garbage when uncommon characters are widely
        # used. Otherwise it could just be traditional Chinese, for example.
        return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0


class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: str | None = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count


class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._isolated_form_count = 0

    def eligible(self, character: str) -> bool:
        return is_arabic(character)

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_arabic_isolated_form(character):
            self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        isolated_form_usage: float = self._isolated_form_count / self._character_count

        return isolated_form_usage


@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
    """
    Determine if two Unicode ranges seen next to each other can be considered suspicious.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    keywords_range_a, keywords_range_b = (
        unicode_range_a.split(" "),
        unicode_range_b.split(" "),
    )

    for el in keywords_range_a:
        if el in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if el in keywords_range_b:
            return False

    # Japanese Exception
    range_a_jp_chars, range_b_jp_chars = (
        unicode_range_a
        in (
            "Hiragana",
            "Katakana",
        ),
        unicode_range_b in ("Hiragana", "Katakana"),
    )
    if (range_a_jp_chars or range_b_jp_chars) and (
        "CJK" in unicode_range_a or "CJK" in unicode_range_b
    ):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
        unicode_range_a in ["Katakana", "Hiragana"]
        and unicode_range_b in ["Katakana", "Hiragana"]
    ):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    return True


@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold stops the computation early.
    """

    detectors: list[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    for character, index in zip(decoded_sequence + "\n", range(length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)
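Note that mess_ratio() collects every subclass of MessDetectorPlugin via __subclasses__() at call time, so the detector set is open-ended: any subclass defined before the call participates in the score. A short sketch of both points; ReplacementCharPlugin is a hypothetical add-on written here for illustration, not a plugin shipped by the library:

from charset_normalizer.md import MessDetectorPlugin, mess_ratio


class ReplacementCharPlugin(MessDetectorPlugin):
    """Hypothetical detector: penalize U+FFFD replacement characters."""

    def __init__(self) -> None:
        self._character_count: int = 0
        self._replacement_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        self._character_count += 1
        if character == "\ufffd":
            self._replacement_count += 1

    def reset(self) -> None:
        self._character_count = 0
        self._replacement_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0
        return self._replacement_count / self._character_count


# Ordinary text stays near 0.0; decode damage pushes the ratio up.
print(mess_ratio("A plain, unremarkable sentence."))
print(mess_ratio("Br\ufffdken d\ufffdc\ufffdding \ufffd\ufffd\ufffd"))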
venv/lib/python3.13/site-packages/charset_normalizer/models.py
ADDED
@@ -0,0 +1,360 @@
from __future__ import annotations

from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
from typing import Any, Iterator, List, Tuple

from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range


class CharsetMatch:
    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        self._unicode_ranges: list[str] | None = None

        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        self._string: str | None = decoded_payload

        self._preemptive_declaration: str | None = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage!
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        return 1.0 - (len(str(self)) / len(self.raw))

    def __str__(self) -> str:
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encodings are known by many names; this can help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: list[str] = []
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. The returned list may be empty even if the 'language' property returns something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # It's either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> list[str]:
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
        # filter and sort
        self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encodings that output the exact SAME str result and therefore could be the originating
        encoding.
        This list includes the encoding available in the 'encoding' property.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Any errors will be simply ignored by the encoder NOT replaced.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                        m.groups()[0],
                        iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type]
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            self._output_payload = decoded_string.encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()


class CharsetMatches:
    """
    Container with every CharsetMatch item, ordered by default from the most probable to the least.
    Acts like a list(iterable) but does not implement all related methods.
    """

    def __init__(self, results: list[CharsetMatch] | None = None):
        self._results: list[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        yield from self._results

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise KeyError upon invalid index or encoding not present in results.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            item = iana_name(item, False)
            for result in self._results:
                if item in result.could_be_from_charset:
                    return result
        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) < TOO_BIG_SEQUENCE:
            for match in self._results:
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> CharsetMatch | None:
        """
        Simply return the first match. Strict equivalent to matches[0].
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> CharsetMatch | None:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()


CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]


class CliDetectionResult:
    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        self.path: str = path
        self.unicode_path: str | None = unicode_path
        self.encoding: str | None = encoding
        self.encoding_aliases: list[str] = encoding_aliases
        self.alternative_encodings: list[str] = alternative_encodings
        self.language: str = language
        self.alphabets: list[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:  # type: ignore
        return {
            "path": self.path,
            "encoding": self.encoding,
            "encoding_aliases": self.encoding_aliases,
            "alternative_encodings": self.alternative_encodings,
            "language": self.language,
+
"alphabets": self.alphabets,
|
| 352 |
+
"has_sig_or_bom": self.has_sig_or_bom,
|
| 353 |
+
"chaos": self.chaos,
|
| 354 |
+
"coherence": self.coherence,
|
| 355 |
+
"unicode_path": self.unicode_path,
|
| 356 |
+
"is_preferred": self.is_preferred,
|
| 357 |
+
}
|
| 358 |
+
|
| 359 |
+
def to_json(self) -> str:
|
| 360 |
+
return dumps(self.__dict__, ensure_ascii=True, indent=4)
|
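The `CharsetMatch` / `CharsetMatches` objects added above are normally driven through the package's public entry point. A minimal usage sketch, assuming a Cyrillic sample payload (`from_bytes` is the real entry point; the detected encoding shown is illustrative):

```python
# Minimal sketch: drive the CharsetMatch / CharsetMatches objects defined
# above via the package's public entry point. Sample text is illustrative.
from charset_normalizer import from_bytes

payload = "Всеки човек има право на образование.".encode("cp1251")

matches = from_bytes(payload)          # CharsetMatches, sorted best-first
best = matches.best()                  # CharsetMatch or None
if best is not None:
    print(best.encoding)               # detected encoding, e.g. "cp1251"
    print(best.could_be_from_charset)  # encodings producing the same str
    print(best.fingerprint)            # SHA256 of the re-encoded payload
    utf8_bytes = best.output()         # payload re-encoded, UTF-8 by default
```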
venv/lib/python3.13/site-packages/charset_normalizer/py.typed
ADDED
File without changes
venv/lib/python3.13/site-packages/charset_normalizer/utils.py
ADDED
@@ -0,0 +1,414 @@
+from __future__ import annotations
+
+import importlib
+import logging
+import unicodedata
+from codecs import IncrementalDecoder
+from encodings.aliases import aliases
+from functools import lru_cache
+from re import findall
+from typing import Generator
+
+from _multibytecodec import (  # type: ignore[import-not-found,import]
+    MultibyteIncrementalDecoder,
+)
+
+from .constant import (
+    ENCODING_MARKS,
+    IANA_SUPPORTED_SIMILAR,
+    RE_POSSIBLE_ENCODING_INDICATION,
+    UNICODE_RANGES_COMBINED,
+    UNICODE_SECONDARY_RANGE_KEYWORD,
+    UTF8_MAXIMAL_ALLOCATION,
+    COMMON_CJK_CHARACTERS,
+)
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_accentuated(character: str) -> bool:
+    try:
+        description: str = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+    return (
+        "WITH GRAVE" in description
+        or "WITH ACUTE" in description
+        or "WITH CEDILLA" in description
+        or "WITH DIAERESIS" in description
+        or "WITH CIRCUMFLEX" in description
+        or "WITH TILDE" in description
+        or "WITH MACRON" in description
+        or "WITH RING ABOVE" in description
+    )
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def remove_accent(character: str) -> str:
+    decomposed: str = unicodedata.decomposition(character)
+    if not decomposed:
+        return character
+
+    codes: list[str] = decomposed.split(" ")
+
+    return chr(int(codes[0], 16))
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def unicode_range(character: str) -> str | None:
+    """
+    Retrieve the Unicode range official name from a single character.
+    """
+    character_ord: int = ord(character)
+
+    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
+        if character_ord in ord_range:
+            return range_name
+
+    return None
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_latin(character: str) -> bool:
+    try:
+        description: str = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+    return "LATIN" in description
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_punctuation(character: str) -> bool:
+    character_category: str = unicodedata.category(character)
+
+    if "P" in character_category:
+        return True
+
+    character_range: str | None = unicode_range(character)
+
+    if character_range is None:
+        return False
+
+    return "Punctuation" in character_range
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_symbol(character: str) -> bool:
+    character_category: str = unicodedata.category(character)
+
+    if "S" in character_category or "N" in character_category:
+        return True
+
+    character_range: str | None = unicode_range(character)
+
+    if character_range is None:
+        return False
+
+    return "Forms" in character_range and character_category != "Lo"
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_emoticon(character: str) -> bool:
+    character_range: str | None = unicode_range(character)
+
+    if character_range is None:
+        return False
+
+    return "Emoticons" in character_range or "Pictographs" in character_range
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_separator(character: str) -> bool:
+    if character.isspace() or character in {"|", "+", "<", ">"}:
+        return True
+
+    character_category: str = unicodedata.category(character)
+
+    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_case_variable(character: str) -> bool:
+    return character.islower() != character.isupper()
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_cjk(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+
+    return "CJK" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_hiragana(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+
+    return "HIRAGANA" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_katakana(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+
+    return "KATAKANA" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_hangul(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+
+    return "HANGUL" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_thai(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+
+    return "THAI" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_arabic(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+
+    return "ARABIC" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_arabic_isolated_form(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+
+    return "ARABIC" in character_name and "ISOLATED FORM" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_cjk_uncommon(character: str) -> bool:
+    return character not in COMMON_CJK_CHARACTERS
+
+
+@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
+def is_unicode_range_secondary(range_name: str) -> bool:
+    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_unprintable(character: str) -> bool:
+    return (
+        character.isspace() is False  # includes \n \t \r \v
+        and character.isprintable() is False
+        and character != "\x1a"  # Why? It's the ASCII substitute character.
+        and character != "\ufeff"  # bug discovered in Python,
+        # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
+    )
+
+
+def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
+    """
+    Extract, using an ASCII-only decoder, any specified encoding in the first n bytes.
+    """
+    if not isinstance(sequence, bytes):
+        raise TypeError
+
+    seq_len: int = len(sequence)
+
+    results: list[str] = findall(
+        RE_POSSIBLE_ENCODING_INDICATION,
+        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
+    )
+
+    if len(results) == 0:
+        return None
+
+    for specified_encoding in results:
+        specified_encoding = specified_encoding.lower().replace("-", "_")
+
+        encoding_alias: str
+        encoding_iana: str
+
+        for encoding_alias, encoding_iana in aliases.items():
+            if encoding_alias == specified_encoding:
+                return encoding_iana
+            if encoding_iana == specified_encoding:
+                return encoding_iana
+
+    return None
+
+
+@lru_cache(maxsize=128)
+def is_multi_byte_encoding(name: str) -> bool:
+    """
+    Verify whether a specific encoding is a multi-byte one, based on its IANA name.
+    """
+    return name in {
+        "utf_8",
+        "utf_8_sig",
+        "utf_16",
+        "utf_16_be",
+        "utf_16_le",
+        "utf_32",
+        "utf_32_le",
+        "utf_32_be",
+        "utf_7",
+    } or issubclass(
+        importlib.import_module(f"encodings.{name}").IncrementalDecoder,
+        MultibyteIncrementalDecoder,
+    )
+
+
+def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
+    """
+    Identify and extract SIG/BOM in given sequence.
+    """
+
+    for iana_encoding in ENCODING_MARKS:
+        marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
+
+        if isinstance(marks, bytes):
+            marks = [marks]
+
+        for mark in marks:
+            if sequence.startswith(mark):
+                return iana_encoding, mark
+
+    return None, b""
+
+
+def should_strip_sig_or_bom(iana_encoding: str) -> bool:
+    return iana_encoding not in {"utf_16", "utf_32"}
+
+
+def iana_name(cp_name: str, strict: bool = True) -> str:
+    """Returns the Python normalized encoding name (not the IANA official name)."""
+    cp_name = cp_name.lower().replace("-", "_")
+
+    encoding_alias: str
+    encoding_iana: str
+
+    for encoding_alias, encoding_iana in aliases.items():
+        if cp_name in [encoding_alias, encoding_iana]:
+            return encoding_iana
+
+    if strict:
+        raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
+
+    return cp_name
+
+
+def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
+    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
+        return 0.0
+
+    decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
+    decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
+
+    id_a: IncrementalDecoder = decoder_a(errors="ignore")
+    id_b: IncrementalDecoder = decoder_b(errors="ignore")
+
+    character_match_count: int = 0
+
+    for i in range(255):
+        to_be_decoded: bytes = bytes([i])
+        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
+            character_match_count += 1
+
+    return character_match_count / 254
+
+
+def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
+    """
+    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR dict was generated using
+    the function cp_similarity.
+    """
+    return (
+        iana_name_a in IANA_SUPPORTED_SIMILAR
+        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
+    )
+
+
+def set_logging_handler(
+    name: str = "charset_normalizer",
+    level: int = logging.INFO,
+    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
+) -> None:
+    logger = logging.getLogger(name)
+    logger.setLevel(level)
+
+    handler = logging.StreamHandler()
+    handler.setFormatter(logging.Formatter(format_string))
+    logger.addHandler(handler)
+
+
+def cut_sequence_chunks(
+    sequences: bytes,
+    encoding_iana: str,
+    offsets: range,
+    chunk_size: int,
+    bom_or_sig_available: bool,
+    strip_sig_or_bom: bool,
+    sig_payload: bytes,
+    is_multi_byte_decoder: bool,
+    decoded_payload: str | None = None,
+) -> Generator[str, None, None]:
+    if decoded_payload and is_multi_byte_decoder is False:
+        for i in offsets:
+            chunk = decoded_payload[i : i + chunk_size]
+            if not chunk:
+                break
+            yield chunk
+    else:
+        for i in offsets:
+            chunk_end = i + chunk_size
+            if chunk_end > len(sequences) + 8:
+                continue
+
+            cut_sequence = sequences[i : i + chunk_size]
+
+            if bom_or_sig_available and strip_sig_or_bom is False:
+                cut_sequence = sig_payload + cut_sequence
+
+            chunk = cut_sequence.decode(
+                encoding_iana,
+                errors="ignore" if is_multi_byte_decoder else "strict",
+            )
+
+            # multi-byte bad cutting detector and adjustment
+            # not the cleanest way to perform that fix but clever enough for now.
+            if is_multi_byte_decoder and i > 0:
+                chunk_partial_size_chk: int = min(chunk_size, 16)
+
+                if (
+                    decoded_payload
+                    and chunk[:chunk_partial_size_chk] not in decoded_payload
+                ):
+                    for j in range(i, i - 4, -1):
+                        cut_sequence = sequences[j:chunk_end]
+
+                        if bom_or_sig_available and strip_sig_or_bom is False:
+                            cut_sequence = sig_payload + cut_sequence
+
+                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")
+
+                        if chunk[:chunk_partial_size_chk] in decoded_payload:
+                            break
+
+            yield chunk
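A short sketch exercising a few of the helpers added above; the helper names are taken from the file, while the sample inputs and printed values are illustrative expectations:

```python
# Sketch of the standalone helpers above; inputs/outputs are illustrative.
from charset_normalizer.utils import (
    iana_name,
    identify_sig_or_bom,
    is_multi_byte_encoding,
    unicode_range,
)

print(iana_name("ISO-8859-1", strict=False))  # Python-normalized: "latin_1"
print(is_multi_byte_encoding("utf_16"))       # True
print(unicode_range("é"))                     # e.g. "Latin-1 Supplement"

encoding, bom = identify_sig_or_bom(b"\xef\xbb\xbfhello")
print(encoding, bom)                          # expected: utf_8 b'\xef\xbb\xbf'
```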
venv/lib/python3.13/site-packages/charset_normalizer/version.py
ADDED
@@ -0,0 +1,8 @@
+"""
+Expose version
+"""
+
+from __future__ import annotations
+
+__version__ = "3.4.4"
+VERSION = __version__.split(".")
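For reference, `VERSION` is simply the dotted version string split into its components; a trivial sketch of the resulting value:

```python
# Illustrative only: VERSION holds the dotted version as string components.
__version__ = "3.4.4"
VERSION = __version__.split(".")
assert VERSION == ["3", "4", "4"]
```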
venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
+pip
venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/METADATA
ADDED
@@ -0,0 +1,42 @@
+Metadata-Version: 2.4
+Name: filelock
+Version: 3.20.0
+Summary: A platform independent file lock.
+Project-URL: Documentation, https://py-filelock.readthedocs.io
+Project-URL: Homepage, https://github.com/tox-dev/py-filelock
+Project-URL: Source, https://github.com/tox-dev/py-filelock
+Project-URL: Tracker, https://github.com/tox-dev/py-filelock/issues
+Maintainer-email: Bernát Gábor <gaborjbernat@gmail.com>
+License-Expression: Unlicense
+License-File: LICENSE
+Keywords: application,cache,directory,log,user
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: The Unlicense (Unlicense)
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Classifier: Topic :: Internet
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: System
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+
+# filelock
+
+[](https://pypi.org/project/filelock/)
+[](https://pypi.org/project/filelock/)
+[](https://py-filelock.readthedocs.io/en/latest/?badge=latest)
+[](https://github.com/psf/black)
+[](https://pepy.tech/project/filelock)
+[](https://github.com/tox-dev/py-filelock/actions/workflows/check.yaml)
+
+For more information check out the [official documentation](https://py-filelock.readthedocs.io/en/latest/index.html).
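A minimal sketch of the lock usage this package documents; the file names and the timeout value are illustrative:

```python
# Sketch of the documented filelock API: a cross-process lock guarding a
# shared file. The paths and timeout are illustrative.
from filelock import FileLock, Timeout

lock = FileLock("shared.txt.lock", timeout=5)  # wait up to 5s, then Timeout
try:
    with lock:  # acquired on enter, released on exit
        with open("shared.txt", "a") as fh:
            fh.write("exclusive write\n")
except Timeout:
    print("another process is holding the lock")
```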
venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/RECORD
ADDED
@@ -0,0 +1,24 @@
+filelock-3.20.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+filelock-3.20.0.dist-info/METADATA,sha256=gIghqdcbGNywxw52pN02_a9OxFqzhjA8v-9GsDWtNog,2110
+filelock-3.20.0.dist-info/RECORD,,
+filelock-3.20.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+filelock-3.20.0.dist-info/licenses/LICENSE,sha256=iNm062BXnBkew5HKBMFhMFctfu3EqG2qWL8oxuFMm80,1210
+filelock/__init__.py,sha256=_t_-OAGXo_qyPa9lNQ1YnzVYEvSW3I0onPqzpomsVVg,1769
+filelock/__pycache__/__init__.cpython-313.pyc,,
+filelock/__pycache__/_api.cpython-313.pyc,,
+filelock/__pycache__/_error.cpython-313.pyc,,
+filelock/__pycache__/_soft.cpython-313.pyc,,
+filelock/__pycache__/_unix.cpython-313.pyc,,
+filelock/__pycache__/_util.cpython-313.pyc,,
+filelock/__pycache__/_windows.cpython-313.pyc,,
+filelock/__pycache__/asyncio.cpython-313.pyc,,
+filelock/__pycache__/version.cpython-313.pyc,,
+filelock/_api.py,sha256=2aATBeJ3-jtMj5OSm7EE539iNaTBsf13KXtcBMoi8oM,14545
+filelock/_error.py,sha256=-5jMcjTu60YAvAO1UbqDD1GIEjVkwr8xCFwDBtMeYDg,787
+filelock/_soft.py,sha256=haqtc_TB_KJbYv2a8iuEAclKuM4fMG1vTcp28sK919c,1711
+filelock/_unix.py,sha256=eGOs4gDgZ-5fGnJUz-OkJDeZkAMzgvYcD8hVD6XH7e4,2351
+filelock/_util.py,sha256=QHBoNFIYfbAThhotH3Q8E2acFc84wpG49-T-uu017ZE,1715
+filelock/_windows.py,sha256=8k4XIBl_zZVfGC2gz0kEr8DZBvpNa8wdU9qeM1YrBb8,2179
+filelock/asyncio.py,sha256=dSLe6XZSECFOgsVpcQUSh5Y5zAHxHGPu_tfpPX9I45k,12514
+filelock/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+filelock/version.py,sha256=AW5MeEjK4TaQWWJrGb_AlBw8PlmFoIcn7GodG_AVSOM,706
venv/lib/python3.13/site-packages/filelock-3.20.0.dist-info/WHEEL
ADDED
@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.27.0
+Root-Is-Purelib: true
+Tag: py3-none-any
venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
+pip
venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/METADATA
ADDED
@@ -0,0 +1,87 @@
+Metadata-Version: 2.4
+Name: hf-xet
+Version: 1.2.0
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Rust
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Classifier: Programming Language :: Python :: Free Threading
+Classifier: Programming Language :: Python :: Free Threading :: 2 - Beta
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Dist: pytest ; extra == 'tests'
+Provides-Extra: tests
+License-File: LICENSE
+Summary: Fast transfer of large files with the Hugging Face Hub.
+Maintainer-email: Rajat Arya <rajat@rajatarya.com>, Jared Sulzdorf <j.sulzdorf@gmail.com>, Di Xiao <di@huggingface.co>, Assaf Vayner <assaf@huggingface.co>, Hoyt Koepke <hoytak@gmail.com>
+License-Expression: Apache-2.0
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
+Project-URL: Homepage, https://github.com/huggingface/xet-core
+Project-URL: Documentation, https://huggingface.co/docs/hub/en/storage-backends#using-xet-storage
+Project-URL: Issues, https://github.com/huggingface/xet-core/issues
+Project-URL: Repository, https://github.com/huggingface/xet-core.git
+
+<!---
+Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<p align="center">
+  <a href="https://github.com/huggingface/xet-core/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/github/license/huggingface/xet-core.svg?color=blue"></a>
+  <a href="https://github.com/huggingface/xet-core/releases"><img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/xet-core.svg"></a>
+  <a href="https://github.com/huggingface/xet-core/blob/main/CODE_OF_CONDUCT.md"><img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg"></a>
+</p>
+
+<h3 align="center">
+  <p>🤗 hf-xet - xet client tech, used in <a target="_blank" href="https://github.com/huggingface/huggingface_hub/">huggingface_hub</a></p>
+</h3>
+
+## Welcome
+
+`hf-xet` enables `huggingface_hub` to utilize xet storage for uploading and downloading to HF Hub. Xet storage provides chunk-based deduplication, efficient storage/retrieval with local disk caching, and backwards compatibility with Git LFS. This library is not meant to be used directly, and is instead intended to be used from [huggingface_hub](https://pypi.org/project/huggingface-hub).
+
+## Key features
+
+♻ **chunk-based deduplication implementation**: avoid transferring and storing chunks that are shared across binary files (models, datasets, etc).
+
+🤗 **Python bindings**: bindings for the [huggingface_hub](https://github.com/huggingface/huggingface_hub/) package.
+
+↔ **network communications**: concurrent communication to HF Hub Xet backend services (CAS).
+
+🔖 **local disk caching**: chunk-based cache that sits alongside the existing [huggingface_hub disk cache](https://huggingface.co/docs/huggingface_hub/guides/manage-cache).
+
+## Installation
+
+Install the `hf_xet` package with [pip](https://pypi.org/project/hf-xet/):
+
+```bash
+pip install hf_xet
+```
+
+## Quick Start
+
+`hf_xet` is not intended to be run independently, as it is expected to be used from `huggingface_hub`; to get started with `huggingface_hub`, check out the documentation [here](https://hf.co/docs/huggingface_hub).
+
+## Contributions (feature requests, bugs, etc.) are encouraged & appreciated 💙💚💛💜🧡❤️
+
+Please join us in making hf-xet better. We value everyone's contributions. Code is not the only way to help. Answering questions, helping each other, improving documentation, filing issues all help immensely. If you are interested in contributing (please do!), check out the [contribution guide](https://github.com/huggingface/xet-core/blob/main/CONTRIBUTING.md) for this repository.
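As the README above stresses, `hf-xet` is driven through `huggingface_hub` rather than called directly. A hedged sketch of that indirect use (the repo id is illustrative; `hf_hub_download` is the real entry point):

```python
# hf-xet is picked up automatically by huggingface_hub when installed;
# the repo id below is illustrative.
from huggingface_hub import hf_hub_download

path = hf_hub_download(repo_id="gpt2", filename="config.json")
print(path)  # served via the Xet backend when the repo is Xet-enabled
```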
venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+hf_xet-1.2.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+hf_xet-1.2.0.dist-info/METADATA,sha256=U-3J7DnI-UycsH-OPV_q2_s3jhtJSkQYifQ03yS9ie8,4910
+hf_xet-1.2.0.dist-info/RECORD,,
+hf_xet-1.2.0.dist-info/WHEEL,sha256=W1f4mZCUZH4n5LoWwHgwGsB1zJCLLADdZ7x6Gd7Z8X8,127
+hf_xet-1.2.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+hf_xet/__init__.py,sha256=E8UDdyQ8glZ_nve9hHEf22bPang8-RKx4VuApXYeQUo,107
+hf_xet/__pycache__/__init__.cpython-313.pyc,,
+hf_xet/hf_xet.abi3.so,sha256=vddURwHuQEUiJXuQlm1NZ47kOW5ck53KFdD32s2mDmY,8310504
venv/lib/python3.13/site-packages/hf_xet-1.2.0.dist-info/WHEEL
ADDED
@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: maturin (1.9.6)
+Root-Is-Purelib: false
+Tag: cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64
venv/lib/python3.13/site-packages/huggingface_hub/__init__.py
ADDED
@@ -0,0 +1,1554 @@
| 1 |
+
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# ***********
|
| 16 |
+
# `huggingface_hub` init has 2 modes:
|
| 17 |
+
# - Normal usage:
|
| 18 |
+
# If imported to use it, all modules and functions are lazy-loaded. This means
|
| 19 |
+
# they exist at top level in module but are imported only the first time they are
|
| 20 |
+
# used. This way, `from huggingface_hub import something` will import `something`
|
| 21 |
+
# quickly without the hassle of importing all the features from `huggingface_hub`.
|
| 22 |
+
# - Static check:
|
| 23 |
+
# If statically analyzed, all modules and functions are loaded normally. This way
|
| 24 |
+
# static typing check works properly as well as autocomplete in text editors and
|
| 25 |
+
# IDEs.
|
| 26 |
+
#
|
| 27 |
+
# The static model imports are done inside the `if TYPE_CHECKING:` statement at
|
| 28 |
+
# the bottom of this file. Since module/functions imports are duplicated, it is
|
| 29 |
+
# mandatory to make sure to add them twice when adding one. This is checked in the
|
| 30 |
+
# `make quality` command.
|
| 31 |
+
#
|
| 32 |
+
# To update the static imports, please run the following command and commit the changes.
|
| 33 |
+
# ```
|
| 34 |
+
# # Use script
|
| 35 |
+
# python utils/check_static_imports.py --update-file
|
| 36 |
+
#
|
| 37 |
+
# # Or run style on codebase
|
| 38 |
+
# make style
|
| 39 |
+
# ```
|
| 40 |
+
#
|
| 41 |
+
# ***********
|
| 42 |
+
# Lazy loader vendored from https://github.com/scientific-python/lazy_loader
|
| 43 |
+
import importlib
|
| 44 |
+
import os
|
| 45 |
+
import sys
|
| 46 |
+
from typing import TYPE_CHECKING
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
__version__ = "0.36.0"
|
| 50 |
+
|
| 51 |
+
# Alphabetical order of definitions is ensured in tests
|
| 52 |
+
# WARNING: any comment added in this dictionary definition will be lost when
|
| 53 |
+
# re-generating the file !
|
| 54 |
+
_SUBMOD_ATTRS = {
|
| 55 |
+
"_commit_scheduler": [
|
| 56 |
+
"CommitScheduler",
|
| 57 |
+
],
|
| 58 |
+
"_inference_endpoints": [
|
| 59 |
+
"InferenceEndpoint",
|
| 60 |
+
"InferenceEndpointError",
|
| 61 |
+
"InferenceEndpointStatus",
|
| 62 |
+
"InferenceEndpointTimeoutError",
|
| 63 |
+
"InferenceEndpointType",
|
| 64 |
+
],
|
| 65 |
+
"_jobs_api": [
|
| 66 |
+
"JobInfo",
|
| 67 |
+
"JobOwner",
|
| 68 |
+
"JobStage",
|
| 69 |
+
"JobStatus",
|
| 70 |
+
],
|
| 71 |
+
"_login": [
|
| 72 |
+
"auth_list",
|
| 73 |
+
"auth_switch",
|
| 74 |
+
"interpreter_login",
|
| 75 |
+
"login",
|
| 76 |
+
"logout",
|
| 77 |
+
"notebook_login",
|
| 78 |
+
],
|
| 79 |
+
"_oauth": [
|
| 80 |
+
"OAuthInfo",
|
| 81 |
+
"OAuthOrgInfo",
|
| 82 |
+
"OAuthUserInfo",
|
| 83 |
+
"attach_huggingface_oauth",
|
| 84 |
+
"parse_huggingface_oauth",
|
| 85 |
+
],
|
| 86 |
+
"_snapshot_download": [
|
| 87 |
+
"snapshot_download",
|
| 88 |
+
],
|
| 89 |
+
"_space_api": [
|
| 90 |
+
"SpaceHardware",
|
| 91 |
+
"SpaceRuntime",
|
| 92 |
+
"SpaceStage",
|
| 93 |
+
"SpaceStorage",
|
| 94 |
+
"SpaceVariable",
|
| 95 |
+
],
|
| 96 |
+
"_tensorboard_logger": [
|
| 97 |
+
"HFSummaryWriter",
|
| 98 |
+
],
|
| 99 |
+
"_webhooks_payload": [
|
| 100 |
+
"WebhookPayload",
|
| 101 |
+
"WebhookPayloadComment",
|
| 102 |
+
"WebhookPayloadDiscussion",
|
| 103 |
+
"WebhookPayloadDiscussionChanges",
|
| 104 |
+
"WebhookPayloadEvent",
|
| 105 |
+
"WebhookPayloadMovedTo",
|
| 106 |
+
"WebhookPayloadRepo",
|
| 107 |
+
"WebhookPayloadUrl",
|
| 108 |
+
"WebhookPayloadWebhook",
|
| 109 |
+
],
|
| 110 |
+
"_webhooks_server": [
|
| 111 |
+
"WebhooksServer",
|
| 112 |
+
"webhook_endpoint",
|
| 113 |
+
],
|
| 114 |
+
"community": [
|
| 115 |
+
"Discussion",
|
| 116 |
+
"DiscussionComment",
|
| 117 |
+
"DiscussionCommit",
|
| 118 |
+
"DiscussionEvent",
|
| 119 |
+
"DiscussionStatusChange",
|
| 120 |
+
"DiscussionTitleChange",
|
| 121 |
+
"DiscussionWithDetails",
|
| 122 |
+
],
|
| 123 |
+
"constants": [
|
| 124 |
+
"CONFIG_NAME",
|
| 125 |
+
"FLAX_WEIGHTS_NAME",
|
| 126 |
+
"HUGGINGFACE_CO_URL_HOME",
|
| 127 |
+
"HUGGINGFACE_CO_URL_TEMPLATE",
|
| 128 |
+
"PYTORCH_WEIGHTS_NAME",
|
| 129 |
+
"REPO_TYPE_DATASET",
|
| 130 |
+
"REPO_TYPE_MODEL",
|
| 131 |
+
"REPO_TYPE_SPACE",
|
| 132 |
+
"TF2_WEIGHTS_NAME",
|
| 133 |
+
"TF_WEIGHTS_NAME",
|
| 134 |
+
],
|
| 135 |
+
"fastai_utils": [
|
| 136 |
+
"_save_pretrained_fastai",
|
| 137 |
+
"from_pretrained_fastai",
|
| 138 |
+
"push_to_hub_fastai",
|
| 139 |
+
],
|
| 140 |
+
"file_download": [
|
| 141 |
+
"HfFileMetadata",
|
| 142 |
+
"_CACHED_NO_EXIST",
|
| 143 |
+
"get_hf_file_metadata",
|
| 144 |
+
"hf_hub_download",
|
| 145 |
+
"hf_hub_url",
|
| 146 |
+
"try_to_load_from_cache",
|
| 147 |
+
],
|
| 148 |
+
"hf_api": [
|
| 149 |
+
"Collection",
|
| 150 |
+
"CollectionItem",
|
| 151 |
+
"CommitInfo",
|
| 152 |
+
"CommitOperation",
|
| 153 |
+
"CommitOperationAdd",
|
| 154 |
+
"CommitOperationCopy",
|
| 155 |
+
"CommitOperationDelete",
|
| 156 |
+
"DatasetInfo",
|
| 157 |
+
"GitCommitInfo",
|
| 158 |
+
"GitRefInfo",
|
| 159 |
+
"GitRefs",
|
| 160 |
+
"HfApi",
|
| 161 |
+
"ModelInfo",
|
| 162 |
+
"Organization",
|
| 163 |
+
"RepoUrl",
|
| 164 |
+
"SpaceInfo",
|
| 165 |
+
"User",
|
| 166 |
+
"UserLikes",
|
| 167 |
+
"WebhookInfo",
|
| 168 |
+
"WebhookWatchedItem",
|
| 169 |
+
"accept_access_request",
|
| 170 |
+
"add_collection_item",
|
| 171 |
+
"add_space_secret",
|
| 172 |
+
"add_space_variable",
|
| 173 |
+
"auth_check",
|
| 174 |
+
"cancel_access_request",
|
| 175 |
+
"cancel_job",
|
| 176 |
+
"change_discussion_status",
|
| 177 |
+
"comment_discussion",
|
| 178 |
+
"create_branch",
|
| 179 |
+
"create_collection",
|
| 180 |
+
"create_commit",
|
| 181 |
+
"create_discussion",
|
| 182 |
+
"create_inference_endpoint",
|
| 183 |
+
"create_inference_endpoint_from_catalog",
|
| 184 |
+
"create_pull_request",
|
| 185 |
+
"create_repo",
|
| 186 |
+
"create_scheduled_job",
|
| 187 |
+
"create_scheduled_uv_job",
|
| 188 |
+
"create_tag",
|
| 189 |
+
"create_webhook",
|
| 190 |
+
"dataset_info",
|
| 191 |
+
"delete_branch",
|
| 192 |
+
"delete_collection",
|
| 193 |
+
"delete_collection_item",
|
| 194 |
+
"delete_file",
|
| 195 |
+
"delete_folder",
|
| 196 |
+
"delete_inference_endpoint",
|
| 197 |
+
"delete_repo",
|
| 198 |
+
"delete_scheduled_job",
|
| 199 |
+
"delete_space_secret",
|
| 200 |
+
"delete_space_storage",
|
| 201 |
+
"delete_space_variable",
|
| 202 |
+
"delete_tag",
|
| 203 |
+
"delete_webhook",
|
| 204 |
+
"disable_webhook",
|
| 205 |
+
"duplicate_space",
|
| 206 |
+
"edit_discussion_comment",
|
| 207 |
+
"enable_webhook",
|
| 208 |
+
"fetch_job_logs",
|
| 209 |
+
"file_exists",
|
| 210 |
+
"get_collection",
|
| 211 |
+
"get_dataset_tags",
|
| 212 |
+
"get_discussion_details",
|
| 213 |
+
"get_full_repo_name",
|
| 214 |
+
"get_inference_endpoint",
|
| 215 |
+
"get_model_tags",
|
| 216 |
+
"get_organization_overview",
|
| 217 |
+
"get_paths_info",
|
| 218 |
+
"get_repo_discussions",
|
| 219 |
+
"get_safetensors_metadata",
|
| 220 |
+
"get_space_runtime",
|
| 221 |
+
"get_space_variables",
|
| 222 |
+
"get_token_permission",
|
| 223 |
+
"get_user_overview",
|
| 224 |
+
"get_webhook",
|
| 225 |
+
"grant_access",
|
| 226 |
+
"inspect_job",
|
| 227 |
+
"inspect_scheduled_job",
|
| 228 |
+
"list_accepted_access_requests",
|
| 229 |
+
"list_collections",
|
| 230 |
+
"list_datasets",
|
| 231 |
+
"list_inference_catalog",
|
| 232 |
+
"list_inference_endpoints",
|
| 233 |
+
"list_jobs",
|
| 234 |
+
"list_lfs_files",
|
| 235 |
+
"list_liked_repos",
|
| 236 |
+
"list_models",
|
| 237 |
+
"list_organization_members",
|
| 238 |
+
"list_papers",
|
| 239 |
+
"list_pending_access_requests",
|
| 240 |
+
"list_rejected_access_requests",
|
| 241 |
+
"list_repo_commits",
|
| 242 |
+
"list_repo_files",
|
| 243 |
+
"list_repo_likers",
|
| 244 |
+
"list_repo_refs",
|
| 245 |
+
"list_repo_tree",
|
| 246 |
+
"list_spaces",
|
| 247 |
+
"list_user_followers",
|
| 248 |
+
"list_user_following",
|
| 249 |
+
"list_webhooks",
|
| 250 |
+
"merge_pull_request",
|
| 251 |
+
"model_info",
|
| 252 |
+
"move_repo",
|
| 253 |
+
"paper_info",
|
| 254 |
+
"parse_safetensors_file_metadata",
|
| 255 |
+
"pause_inference_endpoint",
|
| 256 |
+
"pause_space",
|
| 257 |
+
"permanently_delete_lfs_files",
|
| 258 |
+
"preupload_lfs_files",
|
| 259 |
+
"reject_access_request",
|
| 260 |
+
"rename_discussion",
|
| 261 |
+
"repo_exists",
|
| 262 |
+
"repo_info",
|
| 263 |
+
"repo_type_and_id_from_hf_id",
|
| 264 |
+
"request_space_hardware",
|
| 265 |
+
"request_space_storage",
|
| 266 |
+
"restart_space",
|
| 267 |
+
"resume_inference_endpoint",
|
| 268 |
+
"resume_scheduled_job",
|
| 269 |
+
"revision_exists",
|
| 270 |
+
"run_as_future",
|
| 271 |
+
"run_job",
|
| 272 |
+
"run_uv_job",
|
| 273 |
+
"scale_to_zero_inference_endpoint",
|
| 274 |
+
"set_space_sleep_time",
|
| 275 |
+
"space_info",
|
| 276 |
+
"super_squash_history",
|
| 277 |
+
"suspend_scheduled_job",
|
| 278 |
+
"unlike",
|
| 279 |
+
"update_collection_item",
|
| 280 |
+
"update_collection_metadata",
|
| 281 |
+
"update_inference_endpoint",
|
| 282 |
+
"update_repo_settings",
|
| 283 |
+
"update_repo_visibility",
|
| 284 |
+
"update_webhook",
|
| 285 |
+
"upload_file",
|
| 286 |
+
"upload_folder",
|
| 287 |
+
"upload_large_folder",
|
| 288 |
+
"whoami",
|
| 289 |
+
],
|
| 290 |
+
"hf_file_system": [
|
| 291 |
+
"HfFileSystem",
|
| 292 |
+
"HfFileSystemFile",
|
| 293 |
+
"HfFileSystemResolvedPath",
|
| 294 |
+
"HfFileSystemStreamFile",
|
| 295 |
+
],
|
| 296 |
+
"hub_mixin": [
|
| 297 |
+
"ModelHubMixin",
|
| 298 |
+
"PyTorchModelHubMixin",
|
| 299 |
+
],
|
| 300 |
+
"inference._client": [
|
| 301 |
+
"InferenceClient",
|
| 302 |
+
"InferenceTimeoutError",
|
| 303 |
+
],
|
| 304 |
+
"inference._generated._async_client": [
|
| 305 |
+
"AsyncInferenceClient",
|
| 306 |
+
],
|
| 307 |
+
"inference._generated.types": [
|
| 308 |
+
"AudioClassificationInput",
|
| 309 |
+
"AudioClassificationOutputElement",
|
| 310 |
+
"AudioClassificationOutputTransform",
|
| 311 |
+
"AudioClassificationParameters",
|
| 312 |
+
"AudioToAudioInput",
|
| 313 |
+
"AudioToAudioOutputElement",
|
| 314 |
+
"AutomaticSpeechRecognitionEarlyStoppingEnum",
|
| 315 |
+
"AutomaticSpeechRecognitionGenerationParameters",
|
| 316 |
+
"AutomaticSpeechRecognitionInput",
|
| 317 |
+
"AutomaticSpeechRecognitionOutput",
|
| 318 |
+
"AutomaticSpeechRecognitionOutputChunk",
|
| 319 |
+
"AutomaticSpeechRecognitionParameters",
|
| 320 |
+
"ChatCompletionInput",
|
| 321 |
+
"ChatCompletionInputFunctionDefinition",
|
| 322 |
+
"ChatCompletionInputFunctionName",
|
| 323 |
+
"ChatCompletionInputGrammarType",
|
| 324 |
+
"ChatCompletionInputJSONSchema",
|
| 325 |
+
"ChatCompletionInputMessage",
|
| 326 |
+
"ChatCompletionInputMessageChunk",
|
| 327 |
+
"ChatCompletionInputMessageChunkType",
|
| 328 |
+
"ChatCompletionInputResponseFormatJSONObject",
|
| 329 |
+
"ChatCompletionInputResponseFormatJSONSchema",
|
| 330 |
+
"ChatCompletionInputResponseFormatText",
|
| 331 |
+
"ChatCompletionInputStreamOptions",
|
| 332 |
+
"ChatCompletionInputTool",
|
| 333 |
+
"ChatCompletionInputToolCall",
|
| 334 |
+
"ChatCompletionInputToolChoiceClass",
|
| 335 |
+
"ChatCompletionInputToolChoiceEnum",
|
| 336 |
+
"ChatCompletionInputURL",
|
| 337 |
+
"ChatCompletionOutput",
|
| 338 |
+
"ChatCompletionOutputComplete",
|
| 339 |
+
"ChatCompletionOutputFunctionDefinition",
|
| 340 |
+
"ChatCompletionOutputLogprob",
|
| 341 |
+
"ChatCompletionOutputLogprobs",
|
| 342 |
+
"ChatCompletionOutputMessage",
|
| 343 |
+
"ChatCompletionOutputToolCall",
|
| 344 |
+
"ChatCompletionOutputTopLogprob",
|
| 345 |
+
"ChatCompletionOutputUsage",
|
| 346 |
+
"ChatCompletionStreamOutput",
|
| 347 |
+
"ChatCompletionStreamOutputChoice",
|
| 348 |
+
"ChatCompletionStreamOutputDelta",
|
| 349 |
+
"ChatCompletionStreamOutputDeltaToolCall",
|
| 350 |
+
"ChatCompletionStreamOutputFunction",
|
| 351 |
+
"ChatCompletionStreamOutputLogprob",
|
| 352 |
+
"ChatCompletionStreamOutputLogprobs",
|
| 353 |
+
"ChatCompletionStreamOutputTopLogprob",
|
| 354 |
+
"ChatCompletionStreamOutputUsage",
|
| 355 |
+
"DepthEstimationInput",
|
| 356 |
+
"DepthEstimationOutput",
|
| 357 |
+
"DocumentQuestionAnsweringInput",
|
| 358 |
+
"DocumentQuestionAnsweringInputData",
|
| 359 |
+
"DocumentQuestionAnsweringOutputElement",
|
| 360 |
+
"DocumentQuestionAnsweringParameters",
|
| 361 |
+
"FeatureExtractionInput",
|
| 362 |
+
"FeatureExtractionInputTruncationDirection",
|
| 363 |
+
"FillMaskInput",
|
| 364 |
+
"FillMaskOutputElement",
|
| 365 |
+
"FillMaskParameters",
|
| 366 |
+
"ImageClassificationInput",
|
| 367 |
+
"ImageClassificationOutputElement",
|
| 368 |
+
"ImageClassificationOutputTransform",
|
| 369 |
+
"ImageClassificationParameters",
|
| 370 |
+
"ImageSegmentationInput",
|
| 371 |
+
"ImageSegmentationOutputElement",
|
| 372 |
+
"ImageSegmentationParameters",
|
| 373 |
+
"ImageSegmentationSubtask",
|
| 374 |
+
"ImageToImageInput",
|
| 375 |
+
"ImageToImageOutput",
|
| 376 |
+
"ImageToImageParameters",
|
| 377 |
+
"ImageToImageTargetSize",
|
| 378 |
+
"ImageToTextEarlyStoppingEnum",
|
| 379 |
+
"ImageToTextGenerationParameters",
|
| 380 |
+
"ImageToTextInput",
|
| 381 |
+
"ImageToTextOutput",
|
| 382 |
+
"ImageToTextParameters",
|
| 383 |
+
"ImageToVideoInput",
|
| 384 |
+
"ImageToVideoOutput",
|
| 385 |
+
"ImageToVideoParameters",
|
| 386 |
+
"ImageToVideoTargetSize",
|
| 387 |
+
"ObjectDetectionBoundingBox",
|
| 388 |
+
"ObjectDetectionInput",
|
| 389 |
+
"ObjectDetectionOutputElement",
|
| 390 |
+
"ObjectDetectionParameters",
|
| 391 |
+
"Padding",
|
| 392 |
+
"QuestionAnsweringInput",
|
| 393 |
+
"QuestionAnsweringInputData",
|
| 394 |
+
"QuestionAnsweringOutputElement",
|
| 395 |
+
"QuestionAnsweringParameters",
|
| 396 |
+
"SentenceSimilarityInput",
|
| 397 |
+
"SentenceSimilarityInputData",
|
| 398 |
+
"SummarizationInput",
|
| 399 |
+
"SummarizationOutput",
|
| 400 |
+
"SummarizationParameters",
|
| 401 |
+
"SummarizationTruncationStrategy",
|
| 402 |
+
"TableQuestionAnsweringInput",
|
| 403 |
+
"TableQuestionAnsweringInputData",
|
| 404 |
+
"TableQuestionAnsweringOutputElement",
|
| 405 |
+
"TableQuestionAnsweringParameters",
|
| 406 |
+
"Text2TextGenerationInput",
|
| 407 |
+
"Text2TextGenerationOutput",
|
| 408 |
+
"Text2TextGenerationParameters",
|
| 409 |
+
"Text2TextGenerationTruncationStrategy",
|
| 410 |
+
"TextClassificationInput",
|
| 411 |
+
"TextClassificationOutputElement",
|
| 412 |
+
"TextClassificationOutputTransform",
|
| 413 |
+
"TextClassificationParameters",
|
| 414 |
+
"TextGenerationInput",
|
| 415 |
+
"TextGenerationInputGenerateParameters",
|
| 416 |
+
"TextGenerationInputGrammarType",
|
| 417 |
+
"TextGenerationOutput",
|
| 418 |
+
"TextGenerationOutputBestOfSequence",
|
| 419 |
+
"TextGenerationOutputDetails",
|
| 420 |
+
"TextGenerationOutputFinishReason",
|
| 421 |
+
"TextGenerationOutputPrefillToken",
|
| 422 |
+
"TextGenerationOutputToken",
|
| 423 |
+
"TextGenerationStreamOutput",
|
| 424 |
+
"TextGenerationStreamOutputStreamDetails",
|
| 425 |
+
"TextGenerationStreamOutputToken",
|
| 426 |
+
"TextToAudioEarlyStoppingEnum",
|
| 427 |
+
"TextToAudioGenerationParameters",
|
| 428 |
+
"TextToAudioInput",
|
| 429 |
+
"TextToAudioOutput",
|
| 430 |
+
"TextToAudioParameters",
|
| 431 |
+
"TextToImageInput",
|
| 432 |
+
"TextToImageOutput",
|
| 433 |
+
"TextToImageParameters",
|
| 434 |
+
"TextToSpeechEarlyStoppingEnum",
|
| 435 |
+
"TextToSpeechGenerationParameters",
|
| 436 |
+
"TextToSpeechInput",
|
| 437 |
+
"TextToSpeechOutput",
|
| 438 |
+
"TextToSpeechParameters",
|
| 439 |
+
"TextToVideoInput",
|
| 440 |
+
"TextToVideoOutput",
|
| 441 |
+
"TextToVideoParameters",
|
| 442 |
+
"TokenClassificationAggregationStrategy",
|
| 443 |
+
"TokenClassificationInput",
|
| 444 |
+
"TokenClassificationOutputElement",
|
| 445 |
+
"TokenClassificationParameters",
|
| 446 |
+
"TranslationInput",
|
| 447 |
+
"TranslationOutput",
|
| 448 |
+
"TranslationParameters",
|
| 449 |
+
"TranslationTruncationStrategy",
|
| 450 |
+
"TypeEnum",
|
| 451 |
+
"VideoClassificationInput",
|
| 452 |
+
"VideoClassificationOutputElement",
|
| 453 |
+
"VideoClassificationOutputTransform",
|
| 454 |
+
"VideoClassificationParameters",
|
| 455 |
+
"VisualQuestionAnsweringInput",
|
| 456 |
+
"VisualQuestionAnsweringInputData",
|
| 457 |
+
"VisualQuestionAnsweringOutputElement",
|
| 458 |
+
"VisualQuestionAnsweringParameters",
|
| 459 |
+
"ZeroShotClassificationInput",
|
| 460 |
+
"ZeroShotClassificationOutputElement",
|
| 461 |
+
"ZeroShotClassificationParameters",
|
| 462 |
+
"ZeroShotImageClassificationInput",
|
| 463 |
+
"ZeroShotImageClassificationOutputElement",
|
| 464 |
+
"ZeroShotImageClassificationParameters",
|
| 465 |
+
"ZeroShotObjectDetectionBoundingBox",
|
| 466 |
+
"ZeroShotObjectDetectionInput",
|
| 467 |
+
"ZeroShotObjectDetectionOutputElement",
|
| 468 |
+
"ZeroShotObjectDetectionParameters",
|
| 469 |
+
],
|
| 470 |
+
"inference._mcp.agent": [
|
| 471 |
+
"Agent",
|
| 472 |
+
],
|
| 473 |
+
"inference._mcp.mcp_client": [
|
| 474 |
+
"MCPClient",
|
| 475 |
+
],
|
| 476 |
+
"inference_api": [
|
| 477 |
+
"InferenceApi",
|
| 478 |
+
],
|
| 479 |
+
"keras_mixin": [
|
| 480 |
+
"KerasModelHubMixin",
|
| 481 |
+
"from_pretrained_keras",
|
| 482 |
+
"push_to_hub_keras",
|
| 483 |
+
"save_pretrained_keras",
|
| 484 |
+
],
|
| 485 |
+
"repocard": [
|
| 486 |
+
"DatasetCard",
|
| 487 |
+
"ModelCard",
|
| 488 |
+
"RepoCard",
|
| 489 |
+
"SpaceCard",
|
| 490 |
+
"metadata_eval_result",
|
| 491 |
+
"metadata_load",
|
| 492 |
+
"metadata_save",
|
| 493 |
+
"metadata_update",
|
| 494 |
+
],
|
| 495 |
+
"repocard_data": [
|
| 496 |
+
"CardData",
|
| 497 |
+
"DatasetCardData",
|
| 498 |
+
"EvalResult",
|
| 499 |
+
"ModelCardData",
|
| 500 |
+
"SpaceCardData",
|
| 501 |
+
],
|
| 502 |
+
"repository": [
|
| 503 |
+
"Repository",
|
| 504 |
+
],
|
| 505 |
+
"serialization": [
|
| 506 |
+
"StateDictSplit",
|
| 507 |
+
"get_tf_storage_size",
|
| 508 |
+
"get_torch_storage_id",
|
| 509 |
+
"get_torch_storage_size",
|
| 510 |
+
"load_state_dict_from_file",
|
| 511 |
+
"load_torch_model",
|
| 512 |
+
"save_torch_model",
|
| 513 |
+
"save_torch_state_dict",
|
| 514 |
+
"split_state_dict_into_shards_factory",
|
| 515 |
+
"split_tf_state_dict_into_shards",
|
| 516 |
+
"split_torch_state_dict_into_shards",
|
| 517 |
+
],
|
| 518 |
+
"serialization._dduf": [
|
| 519 |
+
"DDUFEntry",
|
| 520 |
+
"export_entries_as_dduf",
|
| 521 |
+
"export_folder_as_dduf",
|
| 522 |
+
"read_dduf_file",
|
| 523 |
+
],
|
| 524 |
+
"utils": [
|
| 525 |
+
"CacheNotFound",
|
| 526 |
+
"CachedFileInfo",
|
| 527 |
+
"CachedRepoInfo",
|
| 528 |
+
"CachedRevisionInfo",
|
| 529 |
+
"CorruptedCacheException",
|
| 530 |
+
"DeleteCacheStrategy",
|
| 531 |
+
"HFCacheInfo",
|
| 532 |
+
"HfFolder",
|
| 533 |
+
"cached_assets_path",
|
| 534 |
+
"configure_http_backend",
|
| 535 |
+
"dump_environment_info",
|
| 536 |
+
"get_session",
|
| 537 |
+
"get_token",
|
| 538 |
+
"logging",
|
| 539 |
+
"scan_cache_dir",
|
| 540 |
+
],
|
| 541 |
+
}
|
| 542 |
+
|
| 543 |
+
# WARNING: __all__ is generated automatically. Any manual edit will be lost when re-generating this file!
#
# To update the static imports, please run the following command and commit the changes.
# ```
# # Use script
# python utils/check_all_variable.py --update
#
# # Or run style on codebase
# make style
# ```

__all__ = [
    "Agent",
    "AsyncInferenceClient",
    "AudioClassificationInput",
    "AudioClassificationOutputElement",
    "AudioClassificationOutputTransform",
    "AudioClassificationParameters",
    "AudioToAudioInput",
    "AudioToAudioOutputElement",
    "AutomaticSpeechRecognitionEarlyStoppingEnum",
    "AutomaticSpeechRecognitionGenerationParameters",
    "AutomaticSpeechRecognitionInput",
    "AutomaticSpeechRecognitionOutput",
    "AutomaticSpeechRecognitionOutputChunk",
    "AutomaticSpeechRecognitionParameters",
    "CONFIG_NAME",
    "CacheNotFound",
    "CachedFileInfo",
    "CachedRepoInfo",
    "CachedRevisionInfo",
    "CardData",
    "ChatCompletionInput",
    "ChatCompletionInputFunctionDefinition",
    "ChatCompletionInputFunctionName",
    "ChatCompletionInputGrammarType",
    "ChatCompletionInputJSONSchema",
    "ChatCompletionInputMessage",
    "ChatCompletionInputMessageChunk",
    "ChatCompletionInputMessageChunkType",
    "ChatCompletionInputResponseFormatJSONObject",
    "ChatCompletionInputResponseFormatJSONSchema",
    "ChatCompletionInputResponseFormatText",
    "ChatCompletionInputStreamOptions",
    "ChatCompletionInputTool",
    "ChatCompletionInputToolCall",
    "ChatCompletionInputToolChoiceClass",
    "ChatCompletionInputToolChoiceEnum",
    "ChatCompletionInputURL",
    "ChatCompletionOutput",
    "ChatCompletionOutputComplete",
    "ChatCompletionOutputFunctionDefinition",
    "ChatCompletionOutputLogprob",
    "ChatCompletionOutputLogprobs",
    "ChatCompletionOutputMessage",
    "ChatCompletionOutputToolCall",
    "ChatCompletionOutputTopLogprob",
    "ChatCompletionOutputUsage",
    "ChatCompletionStreamOutput",
    "ChatCompletionStreamOutputChoice",
    "ChatCompletionStreamOutputDelta",
    "ChatCompletionStreamOutputDeltaToolCall",
    "ChatCompletionStreamOutputFunction",
    "ChatCompletionStreamOutputLogprob",
    "ChatCompletionStreamOutputLogprobs",
    "ChatCompletionStreamOutputTopLogprob",
    "ChatCompletionStreamOutputUsage",
    "Collection",
    "CollectionItem",
    "CommitInfo",
    "CommitOperation",
    "CommitOperationAdd",
    "CommitOperationCopy",
    "CommitOperationDelete",
    "CommitScheduler",
    "CorruptedCacheException",
    "DDUFEntry",
    "DatasetCard",
    "DatasetCardData",
    "DatasetInfo",
    "DeleteCacheStrategy",
    "DepthEstimationInput",
    "DepthEstimationOutput",
    "Discussion",
    "DiscussionComment",
    "DiscussionCommit",
    "DiscussionEvent",
    "DiscussionStatusChange",
    "DiscussionTitleChange",
    "DiscussionWithDetails",
    "DocumentQuestionAnsweringInput",
    "DocumentQuestionAnsweringInputData",
    "DocumentQuestionAnsweringOutputElement",
    "DocumentQuestionAnsweringParameters",
    "EvalResult",
    "FLAX_WEIGHTS_NAME",
    "FeatureExtractionInput",
    "FeatureExtractionInputTruncationDirection",
    "FillMaskInput",
    "FillMaskOutputElement",
    "FillMaskParameters",
    "GitCommitInfo",
    "GitRefInfo",
    "GitRefs",
    "HFCacheInfo",
    "HFSummaryWriter",
    "HUGGINGFACE_CO_URL_HOME",
    "HUGGINGFACE_CO_URL_TEMPLATE",
    "HfApi",
    "HfFileMetadata",
    "HfFileSystem",
    "HfFileSystemFile",
    "HfFileSystemResolvedPath",
    "HfFileSystemStreamFile",
    "HfFolder",
    "ImageClassificationInput",
    "ImageClassificationOutputElement",
    "ImageClassificationOutputTransform",
    "ImageClassificationParameters",
    "ImageSegmentationInput",
    "ImageSegmentationOutputElement",
    "ImageSegmentationParameters",
    "ImageSegmentationSubtask",
    "ImageToImageInput",
    "ImageToImageOutput",
    "ImageToImageParameters",
    "ImageToImageTargetSize",
    "ImageToTextEarlyStoppingEnum",
    "ImageToTextGenerationParameters",
    "ImageToTextInput",
    "ImageToTextOutput",
    "ImageToTextParameters",
    "ImageToVideoInput",
    "ImageToVideoOutput",
    "ImageToVideoParameters",
    "ImageToVideoTargetSize",
    "InferenceApi",
    "InferenceClient",
    "InferenceEndpoint",
    "InferenceEndpointError",
    "InferenceEndpointStatus",
    "InferenceEndpointTimeoutError",
    "InferenceEndpointType",
    "InferenceTimeoutError",
    "JobInfo",
    "JobOwner",
    "JobStage",
    "JobStatus",
    "KerasModelHubMixin",
    "MCPClient",
    "ModelCard",
    "ModelCardData",
    "ModelHubMixin",
    "ModelInfo",
    "OAuthInfo",
    "OAuthOrgInfo",
    "OAuthUserInfo",
    "ObjectDetectionBoundingBox",
    "ObjectDetectionInput",
    "ObjectDetectionOutputElement",
    "ObjectDetectionParameters",
    "Organization",
    "PYTORCH_WEIGHTS_NAME",
    "Padding",
    "PyTorchModelHubMixin",
    "QuestionAnsweringInput",
    "QuestionAnsweringInputData",
    "QuestionAnsweringOutputElement",
    "QuestionAnsweringParameters",
    "REPO_TYPE_DATASET",
    "REPO_TYPE_MODEL",
    "REPO_TYPE_SPACE",
    "RepoCard",
    "RepoUrl",
    "Repository",
    "SentenceSimilarityInput",
    "SentenceSimilarityInputData",
    "SpaceCard",
    "SpaceCardData",
    "SpaceHardware",
    "SpaceInfo",
    "SpaceRuntime",
    "SpaceStage",
    "SpaceStorage",
    "SpaceVariable",
    "StateDictSplit",
    "SummarizationInput",
    "SummarizationOutput",
    "SummarizationParameters",
    "SummarizationTruncationStrategy",
    "TF2_WEIGHTS_NAME",
    "TF_WEIGHTS_NAME",
    "TableQuestionAnsweringInput",
    "TableQuestionAnsweringInputData",
    "TableQuestionAnsweringOutputElement",
    "TableQuestionAnsweringParameters",
    "Text2TextGenerationInput",
    "Text2TextGenerationOutput",
    "Text2TextGenerationParameters",
    "Text2TextGenerationTruncationStrategy",
    "TextClassificationInput",
    "TextClassificationOutputElement",
    "TextClassificationOutputTransform",
    "TextClassificationParameters",
    "TextGenerationInput",
    "TextGenerationInputGenerateParameters",
    "TextGenerationInputGrammarType",
    "TextGenerationOutput",
    "TextGenerationOutputBestOfSequence",
    "TextGenerationOutputDetails",
    "TextGenerationOutputFinishReason",
    "TextGenerationOutputPrefillToken",
    "TextGenerationOutputToken",
    "TextGenerationStreamOutput",
    "TextGenerationStreamOutputStreamDetails",
    "TextGenerationStreamOutputToken",
    "TextToAudioEarlyStoppingEnum",
    "TextToAudioGenerationParameters",
    "TextToAudioInput",
    "TextToAudioOutput",
    "TextToAudioParameters",
    "TextToImageInput",
    "TextToImageOutput",
    "TextToImageParameters",
    "TextToSpeechEarlyStoppingEnum",
    "TextToSpeechGenerationParameters",
    "TextToSpeechInput",
    "TextToSpeechOutput",
    "TextToSpeechParameters",
    "TextToVideoInput",
    "TextToVideoOutput",
    "TextToVideoParameters",
    "TokenClassificationAggregationStrategy",
    "TokenClassificationInput",
    "TokenClassificationOutputElement",
    "TokenClassificationParameters",
    "TranslationInput",
    "TranslationOutput",
    "TranslationParameters",
    "TranslationTruncationStrategy",
    "TypeEnum",
    "User",
    "UserLikes",
    "VideoClassificationInput",
    "VideoClassificationOutputElement",
    "VideoClassificationOutputTransform",
    "VideoClassificationParameters",
    "VisualQuestionAnsweringInput",
    "VisualQuestionAnsweringInputData",
    "VisualQuestionAnsweringOutputElement",
    "VisualQuestionAnsweringParameters",
    "WebhookInfo",
    "WebhookPayload",
    "WebhookPayloadComment",
    "WebhookPayloadDiscussion",
    "WebhookPayloadDiscussionChanges",
    "WebhookPayloadEvent",
    "WebhookPayloadMovedTo",
    "WebhookPayloadRepo",
    "WebhookPayloadUrl",
    "WebhookPayloadWebhook",
    "WebhookWatchedItem",
    "WebhooksServer",
    "ZeroShotClassificationInput",
    "ZeroShotClassificationOutputElement",
    "ZeroShotClassificationParameters",
    "ZeroShotImageClassificationInput",
    "ZeroShotImageClassificationOutputElement",
    "ZeroShotImageClassificationParameters",
    "ZeroShotObjectDetectionBoundingBox",
    "ZeroShotObjectDetectionInput",
    "ZeroShotObjectDetectionOutputElement",
    "ZeroShotObjectDetectionParameters",
    "_CACHED_NO_EXIST",
    "_save_pretrained_fastai",
    "accept_access_request",
    "add_collection_item",
    "add_space_secret",
    "add_space_variable",
    "attach_huggingface_oauth",
    "auth_check",
    "auth_list",
    "auth_switch",
    "cached_assets_path",
    "cancel_access_request",
    "cancel_job",
    "change_discussion_status",
    "comment_discussion",
    "configure_http_backend",
    "create_branch",
    "create_collection",
    "create_commit",
    "create_discussion",
    "create_inference_endpoint",
    "create_inference_endpoint_from_catalog",
    "create_pull_request",
    "create_repo",
    "create_scheduled_job",
    "create_scheduled_uv_job",
    "create_tag",
    "create_webhook",
    "dataset_info",
    "delete_branch",
    "delete_collection",
    "delete_collection_item",
    "delete_file",
    "delete_folder",
    "delete_inference_endpoint",
    "delete_repo",
    "delete_scheduled_job",
    "delete_space_secret",
    "delete_space_storage",
    "delete_space_variable",
    "delete_tag",
    "delete_webhook",
    "disable_webhook",
    "dump_environment_info",
    "duplicate_space",
    "edit_discussion_comment",
    "enable_webhook",
    "export_entries_as_dduf",
    "export_folder_as_dduf",
    "fetch_job_logs",
    "file_exists",
    "from_pretrained_fastai",
    "from_pretrained_keras",
    "get_collection",
    "get_dataset_tags",
    "get_discussion_details",
    "get_full_repo_name",
    "get_hf_file_metadata",
    "get_inference_endpoint",
    "get_model_tags",
    "get_organization_overview",
    "get_paths_info",
    "get_repo_discussions",
    "get_safetensors_metadata",
    "get_session",
    "get_space_runtime",
    "get_space_variables",
    "get_tf_storage_size",
    "get_token",
    "get_token_permission",
    "get_torch_storage_id",
    "get_torch_storage_size",
    "get_user_overview",
    "get_webhook",
    "grant_access",
    "hf_hub_download",
    "hf_hub_url",
    "inspect_job",
    "inspect_scheduled_job",
    "interpreter_login",
    "list_accepted_access_requests",
    "list_collections",
    "list_datasets",
    "list_inference_catalog",
    "list_inference_endpoints",
    "list_jobs",
    "list_lfs_files",
    "list_liked_repos",
    "list_models",
    "list_organization_members",
    "list_papers",
    "list_pending_access_requests",
    "list_rejected_access_requests",
    "list_repo_commits",
    "list_repo_files",
    "list_repo_likers",
    "list_repo_refs",
    "list_repo_tree",
    "list_spaces",
    "list_user_followers",
    "list_user_following",
    "list_webhooks",
    "load_state_dict_from_file",
    "load_torch_model",
    "logging",
    "login",
    "logout",
    "merge_pull_request",
    "metadata_eval_result",
    "metadata_load",
    "metadata_save",
    "metadata_update",
    "model_info",
    "move_repo",
    "notebook_login",
    "paper_info",
    "parse_huggingface_oauth",
    "parse_safetensors_file_metadata",
    "pause_inference_endpoint",
    "pause_space",
    "permanently_delete_lfs_files",
    "preupload_lfs_files",
    "push_to_hub_fastai",
    "push_to_hub_keras",
    "read_dduf_file",
    "reject_access_request",
    "rename_discussion",
    "repo_exists",
    "repo_info",
    "repo_type_and_id_from_hf_id",
    "request_space_hardware",
    "request_space_storage",
    "restart_space",
    "resume_inference_endpoint",
    "resume_scheduled_job",
    "revision_exists",
    "run_as_future",
    "run_job",
    "run_uv_job",
    "save_pretrained_keras",
    "save_torch_model",
    "save_torch_state_dict",
    "scale_to_zero_inference_endpoint",
    "scan_cache_dir",
    "set_space_sleep_time",
    "snapshot_download",
    "space_info",
    "split_state_dict_into_shards_factory",
    "split_tf_state_dict_into_shards",
    "split_torch_state_dict_into_shards",
    "super_squash_history",
    "suspend_scheduled_job",
    "try_to_load_from_cache",
    "unlike",
    "update_collection_item",
    "update_collection_metadata",
    "update_inference_endpoint",
    "update_repo_settings",
    "update_repo_visibility",
    "update_webhook",
    "upload_file",
    "upload_folder",
    "upload_large_folder",
    "webhook_endpoint",
    "whoami",
]


def _attach(package_name, submodules=None, submod_attrs=None):
    """Attach lazily loaded submodules, functions, or other attributes.

    Typically, modules import submodules and attributes as follows:

    ```py
    import mysubmodule
    import anothersubmodule

    from .foo import someattr
    ```

    The idea is to replace a package's `__getattr__`, `__dir__`, such that all imports
    work exactly the way they would with normal imports, except that the import occurs
    upon first use.

    The typical way to call this function, replacing the above imports, is:

    ```python
    __getattr__, __dir__ = _attach(
        __name__,
        ['mysubmodule', 'anothersubmodule'],
        {'foo': ['someattr']}
    )
    ```
    This functionality requires Python 3.7 or higher.

    Args:
        package_name (`str`):
            Typically use `__name__`.
        submodules (`set`):
            List of submodules to attach.
        submod_attrs (`dict`):
            Dictionary of submodule -> list of attributes / functions.
            These attributes are imported as they are used.

    Returns:
        __getattr__, __dir__

    """
    if submod_attrs is None:
        submod_attrs = {}

    if submodules is None:
        submodules = set()
    else:
        submodules = set(submodules)

    attr_to_modules = {attr: mod for mod, attrs in submod_attrs.items() for attr in attrs}

    def __getattr__(name):
        if name in submodules:
            try:
                return importlib.import_module(f"{package_name}.{name}")
            except Exception as e:
                print(f"Error importing {package_name}.{name}: {e}")
                raise
        elif name in attr_to_modules:
            submod_path = f"{package_name}.{attr_to_modules[name]}"
            try:
                submod = importlib.import_module(submod_path)
            except Exception as e:
                print(f"Error importing {submod_path}: {e}")
                raise
            attr = getattr(submod, name)

            # If the attribute lives in a file (module) with the same
            # name as the attribute, ensure that the attribute and *not*
            # the module is accessible on the package.
            if name == attr_to_modules[name]:
                pkg = sys.modules[package_name]
                pkg.__dict__[name] = attr

            return attr
        else:
            raise AttributeError(f"No {package_name} attribute {name}")

    def __dir__():
        return __all__

    return __getattr__, __dir__


__getattr__, __dir__ = _attach(__name__, submodules=[], submod_attrs=_SUBMOD_ATTRS)

if os.environ.get("EAGER_IMPORT", ""):
    for attr in __all__:
        __getattr__(attr)

# WARNING: any content below this statement is generated automatically. Any manual edit
# will be lost when re-generating this file!
#
# To update the static imports, please run the following command and commit the changes.
# ```
# # Use script
# python utils/check_static_imports.py --update
#
# # Or run style on codebase
# make style
# ```
if TYPE_CHECKING:  # pragma: no cover
    from ._commit_scheduler import CommitScheduler  # noqa: F401
    from ._inference_endpoints import (
        InferenceEndpoint,  # noqa: F401
        InferenceEndpointError,  # noqa: F401
        InferenceEndpointStatus,  # noqa: F401
        InferenceEndpointTimeoutError,  # noqa: F401
        InferenceEndpointType,  # noqa: F401
    )
    from ._jobs_api import (
        JobInfo,  # noqa: F401
        JobOwner,  # noqa: F401
        JobStage,  # noqa: F401
        JobStatus,  # noqa: F401
    )
    from ._login import (
        auth_list,  # noqa: F401
        auth_switch,  # noqa: F401
        interpreter_login,  # noqa: F401
        login,  # noqa: F401
        logout,  # noqa: F401
        notebook_login,  # noqa: F401
    )
    from ._oauth import (
        OAuthInfo,  # noqa: F401
        OAuthOrgInfo,  # noqa: F401
        OAuthUserInfo,  # noqa: F401
        attach_huggingface_oauth,  # noqa: F401
        parse_huggingface_oauth,  # noqa: F401
    )
    from ._snapshot_download import snapshot_download  # noqa: F401
    from ._space_api import (
        SpaceHardware,  # noqa: F401
        SpaceRuntime,  # noqa: F401
        SpaceStage,  # noqa: F401
        SpaceStorage,  # noqa: F401
        SpaceVariable,  # noqa: F401
    )
    from ._tensorboard_logger import HFSummaryWriter  # noqa: F401
    from ._webhooks_payload import (
        WebhookPayload,  # noqa: F401
        WebhookPayloadComment,  # noqa: F401
        WebhookPayloadDiscussion,  # noqa: F401
        WebhookPayloadDiscussionChanges,  # noqa: F401
        WebhookPayloadEvent,  # noqa: F401
        WebhookPayloadMovedTo,  # noqa: F401
        WebhookPayloadRepo,  # noqa: F401
        WebhookPayloadUrl,  # noqa: F401
        WebhookPayloadWebhook,  # noqa: F401
    )
    from ._webhooks_server import (
        WebhooksServer,  # noqa: F401
        webhook_endpoint,  # noqa: F401
    )
    from .community import (
        Discussion,  # noqa: F401
        DiscussionComment,  # noqa: F401
        DiscussionCommit,  # noqa: F401
        DiscussionEvent,  # noqa: F401
        DiscussionStatusChange,  # noqa: F401
        DiscussionTitleChange,  # noqa: F401
        DiscussionWithDetails,  # noqa: F401
    )
    from .constants import (
        CONFIG_NAME,  # noqa: F401
        FLAX_WEIGHTS_NAME,  # noqa: F401
        HUGGINGFACE_CO_URL_HOME,  # noqa: F401
        HUGGINGFACE_CO_URL_TEMPLATE,  # noqa: F401
        PYTORCH_WEIGHTS_NAME,  # noqa: F401
        REPO_TYPE_DATASET,  # noqa: F401
        REPO_TYPE_MODEL,  # noqa: F401
        REPO_TYPE_SPACE,  # noqa: F401
        TF2_WEIGHTS_NAME,  # noqa: F401
        TF_WEIGHTS_NAME,  # noqa: F401
    )
    from .fastai_utils import (
        _save_pretrained_fastai,  # noqa: F401
        from_pretrained_fastai,  # noqa: F401
        push_to_hub_fastai,  # noqa: F401
    )
    from .file_download import (
        _CACHED_NO_EXIST,  # noqa: F401
        HfFileMetadata,  # noqa: F401
        get_hf_file_metadata,  # noqa: F401
        hf_hub_download,  # noqa: F401
        hf_hub_url,  # noqa: F401
        try_to_load_from_cache,  # noqa: F401
    )
    from .hf_api import (
        Collection,  # noqa: F401
        CollectionItem,  # noqa: F401
        CommitInfo,  # noqa: F401
        CommitOperation,  # noqa: F401
        CommitOperationAdd,  # noqa: F401
        CommitOperationCopy,  # noqa: F401
        CommitOperationDelete,  # noqa: F401
        DatasetInfo,  # noqa: F401
        GitCommitInfo,  # noqa: F401
        GitRefInfo,  # noqa: F401
        GitRefs,  # noqa: F401
        HfApi,  # noqa: F401
        ModelInfo,  # noqa: F401
        Organization,  # noqa: F401
        RepoUrl,  # noqa: F401
        SpaceInfo,  # noqa: F401
        User,  # noqa: F401
        UserLikes,  # noqa: F401
        WebhookInfo,  # noqa: F401
        WebhookWatchedItem,  # noqa: F401
        accept_access_request,  # noqa: F401
        add_collection_item,  # noqa: F401
        add_space_secret,  # noqa: F401
        add_space_variable,  # noqa: F401
        auth_check,  # noqa: F401
        cancel_access_request,  # noqa: F401
        cancel_job,  # noqa: F401
        change_discussion_status,  # noqa: F401
        comment_discussion,  # noqa: F401
        create_branch,  # noqa: F401
        create_collection,  # noqa: F401
        create_commit,  # noqa: F401
        create_discussion,  # noqa: F401
        create_inference_endpoint,  # noqa: F401
        create_inference_endpoint_from_catalog,  # noqa: F401
        create_pull_request,  # noqa: F401
        create_repo,  # noqa: F401
        create_scheduled_job,  # noqa: F401
        create_scheduled_uv_job,  # noqa: F401
        create_tag,  # noqa: F401
        create_webhook,  # noqa: F401
        dataset_info,  # noqa: F401
        delete_branch,  # noqa: F401
        delete_collection,  # noqa: F401
        delete_collection_item,  # noqa: F401
        delete_file,  # noqa: F401
        delete_folder,  # noqa: F401
        delete_inference_endpoint,  # noqa: F401
        delete_repo,  # noqa: F401
        delete_scheduled_job,  # noqa: F401
        delete_space_secret,  # noqa: F401
        delete_space_storage,  # noqa: F401
        delete_space_variable,  # noqa: F401
        delete_tag,  # noqa: F401
        delete_webhook,  # noqa: F401
        disable_webhook,  # noqa: F401
        duplicate_space,  # noqa: F401
        edit_discussion_comment,  # noqa: F401
        enable_webhook,  # noqa: F401
        fetch_job_logs,  # noqa: F401
        file_exists,  # noqa: F401
        get_collection,  # noqa: F401
        get_dataset_tags,  # noqa: F401
        get_discussion_details,  # noqa: F401
        get_full_repo_name,  # noqa: F401
        get_inference_endpoint,  # noqa: F401
        get_model_tags,  # noqa: F401
        get_organization_overview,  # noqa: F401
        get_paths_info,  # noqa: F401
        get_repo_discussions,  # noqa: F401
        get_safetensors_metadata,  # noqa: F401
        get_space_runtime,  # noqa: F401
        get_space_variables,  # noqa: F401
        get_token_permission,  # noqa: F401
        get_user_overview,  # noqa: F401
        get_webhook,  # noqa: F401
        grant_access,  # noqa: F401
        inspect_job,  # noqa: F401
        inspect_scheduled_job,  # noqa: F401
        list_accepted_access_requests,  # noqa: F401
        list_collections,  # noqa: F401
        list_datasets,  # noqa: F401
        list_inference_catalog,  # noqa: F401
        list_inference_endpoints,  # noqa: F401
        list_jobs,  # noqa: F401
        list_lfs_files,  # noqa: F401
        list_liked_repos,  # noqa: F401
        list_models,  # noqa: F401
        list_organization_members,  # noqa: F401
        list_papers,  # noqa: F401
        list_pending_access_requests,  # noqa: F401
        list_rejected_access_requests,  # noqa: F401
        list_repo_commits,  # noqa: F401
        list_repo_files,  # noqa: F401
        list_repo_likers,  # noqa: F401
        list_repo_refs,  # noqa: F401
        list_repo_tree,  # noqa: F401
        list_spaces,  # noqa: F401
        list_user_followers,  # noqa: F401
        list_user_following,  # noqa: F401
        list_webhooks,  # noqa: F401
        merge_pull_request,  # noqa: F401
        model_info,  # noqa: F401
        move_repo,  # noqa: F401
        paper_info,  # noqa: F401
        parse_safetensors_file_metadata,  # noqa: F401
        pause_inference_endpoint,  # noqa: F401
        pause_space,  # noqa: F401
        permanently_delete_lfs_files,  # noqa: F401
        preupload_lfs_files,  # noqa: F401
        reject_access_request,  # noqa: F401
        rename_discussion,  # noqa: F401
        repo_exists,  # noqa: F401
        repo_info,  # noqa: F401
        repo_type_and_id_from_hf_id,  # noqa: F401
        request_space_hardware,  # noqa: F401
        request_space_storage,  # noqa: F401
        restart_space,  # noqa: F401
        resume_inference_endpoint,  # noqa: F401
        resume_scheduled_job,  # noqa: F401
        revision_exists,  # noqa: F401
        run_as_future,  # noqa: F401
        run_job,  # noqa: F401
        run_uv_job,  # noqa: F401
        scale_to_zero_inference_endpoint,  # noqa: F401
        set_space_sleep_time,  # noqa: F401
        space_info,  # noqa: F401
        super_squash_history,  # noqa: F401
        suspend_scheduled_job,  # noqa: F401
        unlike,  # noqa: F401
        update_collection_item,  # noqa: F401
        update_collection_metadata,  # noqa: F401
        update_inference_endpoint,  # noqa: F401
        update_repo_settings,  # noqa: F401
        update_repo_visibility,  # noqa: F401
        update_webhook,  # noqa: F401
        upload_file,  # noqa: F401
        upload_folder,  # noqa: F401
        upload_large_folder,  # noqa: F401
        whoami,  # noqa: F401
    )
    from .hf_file_system import (
        HfFileSystem,  # noqa: F401
        HfFileSystemFile,  # noqa: F401
        HfFileSystemResolvedPath,  # noqa: F401
        HfFileSystemStreamFile,  # noqa: F401
    )
    from .hub_mixin import (
        ModelHubMixin,  # noqa: F401
        PyTorchModelHubMixin,  # noqa: F401
    )
    from .inference._client import (
        InferenceClient,  # noqa: F401
        InferenceTimeoutError,  # noqa: F401
    )
    from .inference._generated._async_client import AsyncInferenceClient  # noqa: F401
    from .inference._generated.types import (
        AudioClassificationInput,  # noqa: F401
        AudioClassificationOutputElement,  # noqa: F401
        AudioClassificationOutputTransform,  # noqa: F401
        AudioClassificationParameters,  # noqa: F401
        AudioToAudioInput,  # noqa: F401
        AudioToAudioOutputElement,  # noqa: F401
        AutomaticSpeechRecognitionEarlyStoppingEnum,  # noqa: F401
        AutomaticSpeechRecognitionGenerationParameters,  # noqa: F401
        AutomaticSpeechRecognitionInput,  # noqa: F401
        AutomaticSpeechRecognitionOutput,  # noqa: F401
        AutomaticSpeechRecognitionOutputChunk,  # noqa: F401
        AutomaticSpeechRecognitionParameters,  # noqa: F401
        ChatCompletionInput,  # noqa: F401
        ChatCompletionInputFunctionDefinition,  # noqa: F401
        ChatCompletionInputFunctionName,  # noqa: F401
        ChatCompletionInputGrammarType,  # noqa: F401
        ChatCompletionInputJSONSchema,  # noqa: F401
        ChatCompletionInputMessage,  # noqa: F401
        ChatCompletionInputMessageChunk,  # noqa: F401
        ChatCompletionInputMessageChunkType,  # noqa: F401
        ChatCompletionInputResponseFormatJSONObject,  # noqa: F401
        ChatCompletionInputResponseFormatJSONSchema,  # noqa: F401
        ChatCompletionInputResponseFormatText,  # noqa: F401
        ChatCompletionInputStreamOptions,  # noqa: F401
        ChatCompletionInputTool,  # noqa: F401
        ChatCompletionInputToolCall,  # noqa: F401
        ChatCompletionInputToolChoiceClass,  # noqa: F401
        ChatCompletionInputToolChoiceEnum,  # noqa: F401
        ChatCompletionInputURL,  # noqa: F401
        ChatCompletionOutput,  # noqa: F401
        ChatCompletionOutputComplete,  # noqa: F401
        ChatCompletionOutputFunctionDefinition,  # noqa: F401
        ChatCompletionOutputLogprob,  # noqa: F401
        ChatCompletionOutputLogprobs,  # noqa: F401
        ChatCompletionOutputMessage,  # noqa: F401
        ChatCompletionOutputToolCall,  # noqa: F401
        ChatCompletionOutputTopLogprob,  # noqa: F401
        ChatCompletionOutputUsage,  # noqa: F401
        ChatCompletionStreamOutput,  # noqa: F401
        ChatCompletionStreamOutputChoice,  # noqa: F401
        ChatCompletionStreamOutputDelta,  # noqa: F401
        ChatCompletionStreamOutputDeltaToolCall,  # noqa: F401
        ChatCompletionStreamOutputFunction,  # noqa: F401
        ChatCompletionStreamOutputLogprob,  # noqa: F401
        ChatCompletionStreamOutputLogprobs,  # noqa: F401
        ChatCompletionStreamOutputTopLogprob,  # noqa: F401
        ChatCompletionStreamOutputUsage,  # noqa: F401
        DepthEstimationInput,  # noqa: F401
        DepthEstimationOutput,  # noqa: F401
        DocumentQuestionAnsweringInput,  # noqa: F401
        DocumentQuestionAnsweringInputData,  # noqa: F401
        DocumentQuestionAnsweringOutputElement,  # noqa: F401
        DocumentQuestionAnsweringParameters,  # noqa: F401
        FeatureExtractionInput,  # noqa: F401
        FeatureExtractionInputTruncationDirection,  # noqa: F401
        FillMaskInput,  # noqa: F401
        FillMaskOutputElement,  # noqa: F401
        FillMaskParameters,  # noqa: F401
        ImageClassificationInput,  # noqa: F401
        ImageClassificationOutputElement,  # noqa: F401
        ImageClassificationOutputTransform,  # noqa: F401
        ImageClassificationParameters,  # noqa: F401
        ImageSegmentationInput,  # noqa: F401
        ImageSegmentationOutputElement,  # noqa: F401
        ImageSegmentationParameters,  # noqa: F401
        ImageSegmentationSubtask,  # noqa: F401
        ImageToImageInput,  # noqa: F401
        ImageToImageOutput,  # noqa: F401
        ImageToImageParameters,  # noqa: F401
        ImageToImageTargetSize,  # noqa: F401
        ImageToTextEarlyStoppingEnum,  # noqa: F401
        ImageToTextGenerationParameters,  # noqa: F401
        ImageToTextInput,  # noqa: F401
        ImageToTextOutput,  # noqa: F401
        ImageToTextParameters,  # noqa: F401
        ImageToVideoInput,  # noqa: F401
        ImageToVideoOutput,  # noqa: F401
        ImageToVideoParameters,  # noqa: F401
        ImageToVideoTargetSize,  # noqa: F401
        ObjectDetectionBoundingBox,  # noqa: F401
        ObjectDetectionInput,  # noqa: F401
        ObjectDetectionOutputElement,  # noqa: F401
        ObjectDetectionParameters,  # noqa: F401
        Padding,  # noqa: F401
        QuestionAnsweringInput,  # noqa: F401
        QuestionAnsweringInputData,  # noqa: F401
        QuestionAnsweringOutputElement,  # noqa: F401
        QuestionAnsweringParameters,  # noqa: F401
        SentenceSimilarityInput,  # noqa: F401
        SentenceSimilarityInputData,  # noqa: F401
        SummarizationInput,  # noqa: F401
        SummarizationOutput,  # noqa: F401
        SummarizationParameters,  # noqa: F401
        SummarizationTruncationStrategy,  # noqa: F401
        TableQuestionAnsweringInput,  # noqa: F401
        TableQuestionAnsweringInputData,  # noqa: F401
        TableQuestionAnsweringOutputElement,  # noqa: F401
        TableQuestionAnsweringParameters,  # noqa: F401
        Text2TextGenerationInput,  # noqa: F401
        Text2TextGenerationOutput,  # noqa: F401
        Text2TextGenerationParameters,  # noqa: F401
        Text2TextGenerationTruncationStrategy,  # noqa: F401
        TextClassificationInput,  # noqa: F401
        TextClassificationOutputElement,  # noqa: F401
        TextClassificationOutputTransform,  # noqa: F401
        TextClassificationParameters,  # noqa: F401
        TextGenerationInput,  # noqa: F401
        TextGenerationInputGenerateParameters,  # noqa: F401
        TextGenerationInputGrammarType,  # noqa: F401
        TextGenerationOutput,  # noqa: F401
        TextGenerationOutputBestOfSequence,  # noqa: F401
        TextGenerationOutputDetails,  # noqa: F401
        TextGenerationOutputFinishReason,  # noqa: F401
        TextGenerationOutputPrefillToken,  # noqa: F401
        TextGenerationOutputToken,  # noqa: F401
        TextGenerationStreamOutput,  # noqa: F401
        TextGenerationStreamOutputStreamDetails,  # noqa: F401
        TextGenerationStreamOutputToken,  # noqa: F401
        TextToAudioEarlyStoppingEnum,  # noqa: F401
        TextToAudioGenerationParameters,  # noqa: F401
        TextToAudioInput,  # noqa: F401
        TextToAudioOutput,  # noqa: F401
        TextToAudioParameters,  # noqa: F401
        TextToImageInput,  # noqa: F401
        TextToImageOutput,  # noqa: F401
        TextToImageParameters,  # noqa: F401
        TextToSpeechEarlyStoppingEnum,  # noqa: F401
        TextToSpeechGenerationParameters,  # noqa: F401
        TextToSpeechInput,  # noqa: F401
        TextToSpeechOutput,  # noqa: F401
        TextToSpeechParameters,  # noqa: F401
        TextToVideoInput,  # noqa: F401
        TextToVideoOutput,  # noqa: F401
        TextToVideoParameters,  # noqa: F401
        TokenClassificationAggregationStrategy,  # noqa: F401
        TokenClassificationInput,  # noqa: F401
        TokenClassificationOutputElement,  # noqa: F401
        TokenClassificationParameters,  # noqa: F401
        TranslationInput,  # noqa: F401
        TranslationOutput,  # noqa: F401
        TranslationParameters,  # noqa: F401
        TranslationTruncationStrategy,  # noqa: F401
        TypeEnum,  # noqa: F401
        VideoClassificationInput,  # noqa: F401
        VideoClassificationOutputElement,  # noqa: F401
        VideoClassificationOutputTransform,  # noqa: F401
        VideoClassificationParameters,  # noqa: F401
        VisualQuestionAnsweringInput,  # noqa: F401
        VisualQuestionAnsweringInputData,  # noqa: F401
        VisualQuestionAnsweringOutputElement,  # noqa: F401
        VisualQuestionAnsweringParameters,  # noqa: F401
        ZeroShotClassificationInput,  # noqa: F401
        ZeroShotClassificationOutputElement,  # noqa: F401
        ZeroShotClassificationParameters,  # noqa: F401
        ZeroShotImageClassificationInput,  # noqa: F401
        ZeroShotImageClassificationOutputElement,  # noqa: F401
        ZeroShotImageClassificationParameters,  # noqa: F401
        ZeroShotObjectDetectionBoundingBox,  # noqa: F401
        ZeroShotObjectDetectionInput,  # noqa: F401
        ZeroShotObjectDetectionOutputElement,  # noqa: F401
        ZeroShotObjectDetectionParameters,  # noqa: F401
    )
    from .inference._mcp.agent import Agent  # noqa: F401
    from .inference._mcp.mcp_client import MCPClient  # noqa: F401
    from .inference_api import InferenceApi  # noqa: F401
    from .keras_mixin import (
        KerasModelHubMixin,  # noqa: F401
        from_pretrained_keras,  # noqa: F401
        push_to_hub_keras,  # noqa: F401
        save_pretrained_keras,  # noqa: F401
    )
    from .repocard import (
        DatasetCard,  # noqa: F401
        ModelCard,  # noqa: F401
        RepoCard,  # noqa: F401
        SpaceCard,  # noqa: F401
        metadata_eval_result,  # noqa: F401
        metadata_load,  # noqa: F401
        metadata_save,  # noqa: F401
        metadata_update,  # noqa: F401
    )
    from .repocard_data import (
        CardData,  # noqa: F401
        DatasetCardData,  # noqa: F401
        EvalResult,  # noqa: F401
        ModelCardData,  # noqa: F401
        SpaceCardData,  # noqa: F401
    )
    from .repository import Repository  # noqa: F401
    from .serialization import (
        StateDictSplit,  # noqa: F401
        get_tf_storage_size,  # noqa: F401
        get_torch_storage_id,  # noqa: F401
        get_torch_storage_size,  # noqa: F401
        load_state_dict_from_file,  # noqa: F401
        load_torch_model,  # noqa: F401
        save_torch_model,  # noqa: F401
        save_torch_state_dict,  # noqa: F401
        split_state_dict_into_shards_factory,  # noqa: F401
        split_tf_state_dict_into_shards,  # noqa: F401
        split_torch_state_dict_into_shards,  # noqa: F401
    )
    from .serialization._dduf import (
        DDUFEntry,  # noqa: F401
        export_entries_as_dduf,  # noqa: F401
        export_folder_as_dduf,  # noqa: F401
        read_dduf_file,  # noqa: F401
    )
    from .utils import (
        CachedFileInfo,  # noqa: F401
        CachedRepoInfo,  # noqa: F401
        CachedRevisionInfo,  # noqa: F401
        CacheNotFound,  # noqa: F401
        CorruptedCacheException,  # noqa: F401
        DeleteCacheStrategy,  # noqa: F401
        HFCacheInfo,  # noqa: F401
        HfFolder,  # noqa: F401
        cached_assets_path,  # noqa: F401
        configure_http_backend,  # noqa: F401
        dump_environment_info,  # noqa: F401
        get_session,  # noqa: F401
        get_token,  # noqa: F401
        logging,  # noqa: F401
        scan_cache_dir,  # noqa: F401
    )
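The lazy-attach machinery above keeps `import huggingface_hub` cheap: nothing listed in `_SUBMOD_ATTRS` is imported until its name is first accessed, and the `EAGER_IMPORT` environment variable forces everything up front. A minimal sketch of the observable behavior, assuming only a fresh interpreter with `huggingface_hub` installed:

```python
import sys

import huggingface_hub

# In a fresh interpreter, importing the package does not yet import heavy
# submodules such as `hf_api`.
assert "huggingface_hub.hf_api" not in sys.modules

# First attribute access goes through the generated __getattr__, which runs
# importlib.import_module("huggingface_hub.hf_api") and caches the attribute
# on the package.
api = huggingface_hub.HfApi()
assert "huggingface_hub.hf_api" in sys.modules

# Setting EAGER_IMPORT=1 in the environment before import instead resolves
# every name in __all__ immediately (see the loop above).
```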
venv/lib/python3.13/site-packages/huggingface_hub/_commit_api.py
ADDED
@@ -0,0 +1,968 @@
"""
Type definitions and utilities for the `create_commit` API
"""

import base64
import io
import os
import warnings
from collections import defaultdict
from contextlib import contextmanager
from dataclasses import dataclass, field
from itertools import groupby
from pathlib import Path, PurePosixPath
from typing import TYPE_CHECKING, Any, BinaryIO, Dict, Iterable, Iterator, List, Literal, Optional, Tuple, Union

from tqdm.contrib.concurrent import thread_map

from . import constants
from .errors import EntryNotFoundError, HfHubHTTPError, XetAuthorizationError, XetRefreshTokenError
from .file_download import hf_hub_url
from .lfs import UploadInfo, lfs_upload, post_lfs_batch_info
from .utils import (
    FORBIDDEN_FOLDERS,
    XetTokenType,
    are_progress_bars_disabled,
    chunk_iterable,
    fetch_xet_connection_info_from_repo_info,
    get_session,
    hf_raise_for_status,
    logging,
    sha,
    tqdm_stream_file,
    validate_hf_hub_args,
)
from .utils import tqdm as hf_tqdm
from .utils._runtime import is_xet_available


if TYPE_CHECKING:
    from .hf_api import RepoFile


logger = logging.get_logger(__name__)


UploadMode = Literal["lfs", "regular"]

# Max is 1,000 per request on the Hub for HfApi.get_paths_info
# Otherwise we get:
# HfHubHTTPError: 413 Client Error: Payload Too Large for url: https://huggingface.co/api/datasets/xxx (Request ID: xxx)\n\ntoo many parameters
# See https://github.com/huggingface/huggingface_hub/issues/1503
FETCH_LFS_BATCH_SIZE = 500

UPLOAD_BATCH_MAX_NUM_FILES = 256

@dataclass
|
| 58 |
+
class CommitOperationDelete:
|
| 59 |
+
"""
|
| 60 |
+
Data structure holding necessary info to delete a file or a folder from a repository
|
| 61 |
+
on the Hub.
|
| 62 |
+
|
| 63 |
+
Args:
|
| 64 |
+
path_in_repo (`str`):
|
| 65 |
+
Relative filepath in the repo, for example: `"checkpoints/1fec34a/weights.bin"`
|
| 66 |
+
for a file or `"checkpoints/1fec34a/"` for a folder.
|
| 67 |
+
is_folder (`bool` or `Literal["auto"]`, *optional*)
|
| 68 |
+
Whether the Delete Operation applies to a folder or not. If "auto", the path
|
| 69 |
+
type (file or folder) is guessed automatically by looking if path ends with
|
| 70 |
+
a "/" (folder) or not (file). To explicitly set the path type, you can set
|
| 71 |
+
`is_folder=True` or `is_folder=False`.
|
| 72 |
+
"""
|
| 73 |
+
|
| 74 |
+
path_in_repo: str
|
| 75 |
+
is_folder: Union[bool, Literal["auto"]] = "auto"
|
| 76 |
+
|
| 77 |
+
def __post_init__(self):
|
| 78 |
+
self.path_in_repo = _validate_path_in_repo(self.path_in_repo)
|
| 79 |
+
|
| 80 |
+
if self.is_folder == "auto":
|
| 81 |
+
self.is_folder = self.path_in_repo.endswith("/")
|
| 82 |
+
if not isinstance(self.is_folder, bool):
|
| 83 |
+
raise ValueError(
|
| 84 |
+
f"Wrong value for `is_folder`. Must be one of [`True`, `False`, `'auto'`]. Got '{self.is_folder}'."
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
@dataclass
|
| 89 |
+
class CommitOperationCopy:
|
| 90 |
+
"""
|
| 91 |
+
Data structure holding necessary info to copy a file in a repository on the Hub.
|
| 92 |
+
|
| 93 |
+
Limitations:
|
| 94 |
+
- Only LFS files can be copied. To copy a regular file, you need to download it locally and re-upload it
|
| 95 |
+
- Cross-repository copies are not supported.
|
| 96 |
+
|
| 97 |
+
Note: you can combine a [`CommitOperationCopy`] and a [`CommitOperationDelete`] to rename an LFS file on the Hub.
|
| 98 |
+
|
| 99 |
+
Args:
|
| 100 |
+
src_path_in_repo (`str`):
|
| 101 |
+
Relative filepath in the repo of the file to be copied, e.g. `"checkpoints/1fec34a/weights.bin"`.
|
| 102 |
+
path_in_repo (`str`):
|
| 103 |
+
Relative filepath in the repo where to copy the file, e.g. `"checkpoints/1fec34a/weights_copy.bin"`.
|
| 104 |
+
src_revision (`str`, *optional*):
|
| 105 |
+
The git revision of the file to be copied. Can be any valid git revision.
|
| 106 |
+
Default to the target commit revision.
|
| 107 |
+
"""
|
| 108 |
+
|
| 109 |
+
src_path_in_repo: str
|
| 110 |
+
path_in_repo: str
|
| 111 |
+
src_revision: Optional[str] = None
|
| 112 |
+
# set to the OID of the file to be copied if it has already been uploaded
|
| 113 |
+
# useful to determine if a commit will be empty or not.
|
| 114 |
+
_src_oid: Optional[str] = None
|
| 115 |
+
# set to the OID of the file to copy to if it has already been uploaded
|
| 116 |
+
# useful to determine if a commit will be empty or not.
|
| 117 |
+
_dest_oid: Optional[str] = None
|
| 118 |
+
|
| 119 |
+
def __post_init__(self):
|
| 120 |
+
self.src_path_in_repo = _validate_path_in_repo(self.src_path_in_repo)
|
| 121 |
+
self.path_in_repo = _validate_path_in_repo(self.path_in_repo)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
@dataclass
|
| 125 |
+
class CommitOperationAdd:
|
| 126 |
+
"""
|
| 127 |
+
Data structure holding necessary info to upload a file to a repository on the Hub.
|
| 128 |
+
|
| 129 |
+
Args:
|
| 130 |
+
path_in_repo (`str`):
|
| 131 |
+
Relative filepath in the repo, for example: `"checkpoints/1fec34a/weights.bin"`
|
| 132 |
+
path_or_fileobj (`str`, `Path`, `bytes`, or `BinaryIO`):
|
| 133 |
+
Either:
|
| 134 |
+
- a path to a local file (as `str` or `pathlib.Path`) to upload
|
| 135 |
+
- a buffer of bytes (`bytes`) holding the content of the file to upload
|
| 136 |
+
- a "file object" (subclass of `io.BufferedIOBase`), typically obtained
|
| 137 |
+
with `open(path, "rb")`. It must support `seek()` and `tell()` methods.
|
| 138 |
+
|
| 139 |
+
Raises:
|
| 140 |
+
[`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
|
| 141 |
+
If `path_or_fileobj` is not one of `str`, `Path`, `bytes` or `io.BufferedIOBase`.
|
| 142 |
+
[`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
|
| 143 |
+
If `path_or_fileobj` is a `str` or `Path` but not a path to an existing file.
|
| 144 |
+
[`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
|
| 145 |
+
If `path_or_fileobj` is a `io.BufferedIOBase` but it doesn't support both
|
| 146 |
+
`seek()` and `tell()`.
|
| 147 |
+
"""
|
| 148 |
+
|
| 149 |
+
path_in_repo: str
|
| 150 |
+
path_or_fileobj: Union[str, Path, bytes, BinaryIO]
|
| 151 |
+
upload_info: UploadInfo = field(init=False, repr=False)
|
| 152 |
+
|
| 153 |
+
# Internal attributes
|
| 154 |
+
|
| 155 |
+
# set to "lfs" or "regular" once known
|
| 156 |
+
_upload_mode: Optional[UploadMode] = field(init=False, repr=False, default=None)
|
| 157 |
+
|
| 158 |
+
# set to True if .gitignore rules prevent the file from being uploaded as LFS
|
| 159 |
+
# (server-side check)
|
| 160 |
+
_should_ignore: Optional[bool] = field(init=False, repr=False, default=None)
|
| 161 |
+
|
| 162 |
+
# set to the remote OID of the file if it has already been uploaded
|
| 163 |
+
# useful to determine if a commit will be empty or not
|
| 164 |
+
_remote_oid: Optional[str] = field(init=False, repr=False, default=None)
|
| 165 |
+
|
| 166 |
+
# set to True once the file has been uploaded as LFS
|
| 167 |
+
_is_uploaded: bool = field(init=False, repr=False, default=False)
|
| 168 |
+
|
| 169 |
+
# set to True once the file has been committed
|
| 170 |
+
_is_committed: bool = field(init=False, repr=False, default=False)
|
| 171 |
+
|
| 172 |
+
def __post_init__(self) -> None:
|
| 173 |
+
"""Validates `path_or_fileobj` and compute `upload_info`."""
|
| 174 |
+
self.path_in_repo = _validate_path_in_repo(self.path_in_repo)
|
| 175 |
+
|
| 176 |
+
# Validate `path_or_fileobj` value
|
| 177 |
+
if isinstance(self.path_or_fileobj, Path):
|
| 178 |
+
self.path_or_fileobj = str(self.path_or_fileobj)
|
| 179 |
+
if isinstance(self.path_or_fileobj, str):
|
| 180 |
+
path_or_fileobj = os.path.normpath(os.path.expanduser(self.path_or_fileobj))
|
| 181 |
+
if not os.path.isfile(path_or_fileobj):
|
| 182 |
+
raise ValueError(f"Provided path: '{path_or_fileobj}' is not a file on the local file system")
|
| 183 |
+
elif not isinstance(self.path_or_fileobj, (io.BufferedIOBase, bytes)):
|
| 184 |
+
# ^^ Inspired from: https://stackoverflow.com/questions/44584829/how-to-determine-if-file-is-opened-in-binary-or-text-mode
|
| 185 |
+
raise ValueError(
|
| 186 |
+
"path_or_fileobj must be either an instance of str, bytes or"
|
| 187 |
+
" io.BufferedIOBase. If you passed a file-like object, make sure it is"
|
| 188 |
+
" in binary mode."
|
| 189 |
+
)
|
| 190 |
+
if isinstance(self.path_or_fileobj, io.BufferedIOBase):
|
| 191 |
+
try:
|
| 192 |
+
self.path_or_fileobj.tell()
|
| 193 |
+
self.path_or_fileobj.seek(0, os.SEEK_CUR)
|
| 194 |
+
except (OSError, AttributeError) as exc:
|
| 195 |
+
raise ValueError(
|
| 196 |
+
"path_or_fileobj is a file-like object but does not implement seek() and tell()"
|
| 197 |
+
) from exc
|
| 198 |
+
|
| 199 |
+
# Compute "upload_info" attribute
|
| 200 |
+
if isinstance(self.path_or_fileobj, str):
|
| 201 |
+
self.upload_info = UploadInfo.from_path(self.path_or_fileobj)
|
| 202 |
+
elif isinstance(self.path_or_fileobj, bytes):
|
| 203 |
+
self.upload_info = UploadInfo.from_bytes(self.path_or_fileobj)
|
| 204 |
+
else:
|
| 205 |
+
self.upload_info = UploadInfo.from_fileobj(self.path_or_fileobj)
|
| 206 |
+
|
| 207 |
+
@contextmanager
|
| 208 |
+
def as_file(self, with_tqdm: bool = False) -> Iterator[BinaryIO]:
|
| 209 |
+
"""
|
| 210 |
+
A context manager that yields a file-like object allowing to read the underlying
|
| 211 |
+
data behind `path_or_fileobj`.
|
| 212 |
+
|
| 213 |
+
Args:
|
| 214 |
+
with_tqdm (`bool`, *optional*, defaults to `False`):
|
| 215 |
+
If True, iterating over the file object will display a progress bar. Only
|
| 216 |
+
works if the file-like object is a path to a file. Pure bytes and buffers
|
| 217 |
+
are not supported.
|
| 218 |
+
|
| 219 |
+
Example:
|
| 220 |
+
|
| 221 |
+
```python
|
| 222 |
+
>>> operation = CommitOperationAdd(
|
| 223 |
+
... path_in_repo="remote/dir/weights.h5",
|
| 224 |
+
... path_or_fileobj="./local/weights.h5",
|
| 225 |
+
... )
|
| 226 |
+
CommitOperationAdd(path_in_repo='remote/dir/weights.h5', path_or_fileobj='./local/weights.h5')
|
| 227 |
+
|
| 228 |
+
>>> with operation.as_file() as file:
|
| 229 |
+
... content = file.read()
|
| 230 |
+
|
| 231 |
+
>>> with operation.as_file(with_tqdm=True) as file:
|
| 232 |
+
... while True:
|
| 233 |
+
... data = file.read(1024)
|
| 234 |
+
... if not data:
|
| 235 |
+
... break
|
| 236 |
+
config.json: 100%|█████████████████████████| 8.19k/8.19k [00:02<00:00, 3.72kB/s]
|
| 237 |
+
|
| 238 |
+
>>> with operation.as_file(with_tqdm=True) as file:
|
| 239 |
+
... requests.put(..., data=file)
|
| 240 |
+
config.json: 100%|█████████████████████████| 8.19k/8.19k [00:02<00:00, 3.72kB/s]
|
| 241 |
+
```
|
| 242 |
+
"""
|
| 243 |
+
if isinstance(self.path_or_fileobj, str) or isinstance(self.path_or_fileobj, Path):
|
| 244 |
+
if with_tqdm:
|
| 245 |
+
with tqdm_stream_file(self.path_or_fileobj) as file:
|
| 246 |
+
yield file
|
| 247 |
+
else:
|
| 248 |
+
with open(self.path_or_fileobj, "rb") as file:
|
| 249 |
+
yield file
|
| 250 |
+
elif isinstance(self.path_or_fileobj, bytes):
|
| 251 |
+
yield io.BytesIO(self.path_or_fileobj)
|
| 252 |
+
elif isinstance(self.path_or_fileobj, io.BufferedIOBase):
|
| 253 |
+
prev_pos = self.path_or_fileobj.tell()
|
| 254 |
+
yield self.path_or_fileobj
|
| 255 |
+
self.path_or_fileobj.seek(prev_pos, io.SEEK_SET)
|
| 256 |
+
|
| 257 |
+
def b64content(self) -> bytes:
|
| 258 |
+
"""
|
| 259 |
+
The base64-encoded content of `path_or_fileobj`
|
| 260 |
+
|
| 261 |
+
Returns: `bytes`
|
| 262 |
+
"""
|
| 263 |
+
with self.as_file() as file:
|
| 264 |
+
return base64.b64encode(file.read())
|
| 265 |
+
|
| 266 |
+
@property
|
| 267 |
+
def _local_oid(self) -> Optional[str]:
|
| 268 |
+
"""Return the OID of the local file.
|
| 269 |
+
|
| 270 |
+
This OID is then compared to `self._remote_oid` to check if the file has changed compared to the remote one.
|
| 271 |
+
If the file did not change, we won't upload it again to prevent empty commits.
|
| 272 |
+
|
| 273 |
+
For LFS files, the OID corresponds to the SHA256 of the file content (used a LFS ref).
|
| 274 |
+
For regular files, the OID corresponds to the SHA1 of the file content.
|
| 275 |
+
Note: this is slightly different to git OID computation since the oid of an LFS file is usually the git-SHA1 of the
|
| 276 |
+
pointer file content (not the actual file content). However, using the SHA256 is enough to detect changes
|
| 277 |
+
and more convenient client-side.
|
| 278 |
+
"""
|
| 279 |
+
if self._upload_mode is None:
|
| 280 |
+
return None
|
| 281 |
+
elif self._upload_mode == "lfs":
|
| 282 |
+
return self.upload_info.sha256.hex()
|
| 283 |
+
else:
|
| 284 |
+
# Regular file => compute sha1
|
| 285 |
+
# => no need to read by chunk since the file is guaranteed to be <=5MB.
|
| 286 |
+
with self.as_file() as file:
|
| 287 |
+
return sha.git_hash(file.read())
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def _validate_path_in_repo(path_in_repo: str) -> str:
|
| 291 |
+
# Validate `path_in_repo` value to prevent a server-side issue
|
| 292 |
+
if path_in_repo.startswith("/"):
|
| 293 |
+
path_in_repo = path_in_repo[1:]
|
| 294 |
+
if path_in_repo == "." or path_in_repo == ".." or path_in_repo.startswith("../"):
|
| 295 |
+
raise ValueError(f"Invalid `path_in_repo` in CommitOperation: '{path_in_repo}'")
|
| 296 |
+
if path_in_repo.startswith("./"):
|
| 297 |
+
path_in_repo = path_in_repo[2:]
|
| 298 |
+
for forbidden in FORBIDDEN_FOLDERS:
|
| 299 |
+
if any(part == forbidden for part in path_in_repo.split("/")):
|
| 300 |
+
raise ValueError(
|
| 301 |
+
f"Invalid `path_in_repo` in CommitOperation: cannot update files under a '{forbidden}/' folder (path:"
|
| 302 |
+
f" '{path_in_repo}')."
|
| 303 |
+
)
|
| 304 |
+
return path_in_repo
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
CommitOperation = Union[CommitOperationAdd, CommitOperationCopy, CommitOperationDelete]
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
def _warn_on_overwriting_operations(operations: List[CommitOperation]) -> None:
|
| 311 |
+
"""
|
| 312 |
+
Warn user when a list of operations is expected to overwrite itself in a single
|
| 313 |
+
commit.
|
| 314 |
+
|
| 315 |
+
Rules:
|
| 316 |
+
- If a filepath is updated by multiple `CommitOperationAdd` operations, a warning
|
| 317 |
+
message is triggered.
|
| 318 |
+
- If a filepath is updated at least once by a `CommitOperationAdd` and then deleted
|
| 319 |
+
by a `CommitOperationDelete`, a warning is triggered.
|
| 320 |
+
- If a `CommitOperationDelete` deletes a filepath that is then updated by a
|
| 321 |
+
`CommitOperationAdd`, no warning is triggered. This is usually useless (no need to
|
| 322 |
+
delete before upload) but can happen if a user deletes an entire folder and then
|
| 323 |
+
add new files to it.
|
| 324 |
+
"""
|
| 325 |
+
nb_additions_per_path: Dict[str, int] = defaultdict(int)
|
| 326 |
+
for operation in operations:
|
| 327 |
+
path_in_repo = operation.path_in_repo
|
| 328 |
+
if isinstance(operation, CommitOperationAdd):
|
| 329 |
+
if nb_additions_per_path[path_in_repo] > 0:
|
| 330 |
+
warnings.warn(
|
| 331 |
+
"About to update multiple times the same file in the same commit:"
|
| 332 |
+
f" '{path_in_repo}'. This can cause undesired inconsistencies in"
|
| 333 |
+
" your repo."
|
| 334 |
+
)
|
| 335 |
+
nb_additions_per_path[path_in_repo] += 1
|
| 336 |
+
for parent in PurePosixPath(path_in_repo).parents:
|
| 337 |
+
# Also keep track of number of updated files per folder
|
| 338 |
+
# => warns if deleting a folder overwrite some contained files
|
| 339 |
+
nb_additions_per_path[str(parent)] += 1
|
| 340 |
+
if isinstance(operation, CommitOperationDelete):
|
| 341 |
+
if nb_additions_per_path[str(PurePosixPath(path_in_repo))] > 0:
|
| 342 |
+
if operation.is_folder:
|
| 343 |
+
warnings.warn(
|
| 344 |
+
"About to delete a folder containing files that have just been"
|
| 345 |
+
f" updated within the same commit: '{path_in_repo}'. This can"
|
| 346 |
+
" cause undesired inconsistencies in your repo."
|
| 347 |
+
)
|
| 348 |
+
else:
|
| 349 |
+
warnings.warn(
|
| 350 |
+
"About to delete a file that have just been updated within the"
|
| 351 |
+
f" same commit: '{path_in_repo}'. This can cause undesired"
|
| 352 |
+
" inconsistencies in your repo."
|
| 353 |
+
)
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
@validate_hf_hub_args
|
| 357 |
+
def _upload_files(
|
| 358 |
+
*,
|
| 359 |
+
additions: List[CommitOperationAdd],
|
| 360 |
+
repo_type: str,
|
| 361 |
+
repo_id: str,
|
| 362 |
+
headers: Dict[str, str],
|
| 363 |
+
endpoint: Optional[str] = None,
|
| 364 |
+
num_threads: int = 5,
|
| 365 |
+
revision: Optional[str] = None,
|
| 366 |
+
create_pr: Optional[bool] = None,
|
| 367 |
+
):
|
| 368 |
+
"""
|
| 369 |
+
Negotiates per-file transfer (LFS vs Xet) and uploads in batches.
|
| 370 |
+
"""
|
| 371 |
+
xet_additions: List[CommitOperationAdd] = []
|
| 372 |
+
lfs_actions: List[Dict] = []
|
| 373 |
+
lfs_oid2addop: Dict[str, CommitOperationAdd] = {}
|
| 374 |
+
|
| 375 |
+
for chunk in chunk_iterable(additions, chunk_size=UPLOAD_BATCH_MAX_NUM_FILES):
|
| 376 |
+
chunk_list = [op for op in chunk]
|
| 377 |
+
|
| 378 |
+
transfers: List[str] = ["basic", "multipart"]
|
| 379 |
+
has_buffered_io_data = any(isinstance(op.path_or_fileobj, io.BufferedIOBase) for op in chunk_list)
|
| 380 |
+
if is_xet_available():
|
| 381 |
+
if not has_buffered_io_data:
|
| 382 |
+
transfers.append("xet")
|
| 383 |
+
else:
|
| 384 |
+
logger.warning(
|
| 385 |
+
"Uploading files as a binary IO buffer is not supported by Xet Storage. "
|
| 386 |
+
"Falling back to HTTP upload."
|
| 387 |
+
)
|
| 388 |
+
|
| 389 |
+
actions_chunk, errors_chunk, chosen_transfer = post_lfs_batch_info(
|
| 390 |
+
upload_infos=[op.upload_info for op in chunk_list],
|
| 391 |
+
repo_id=repo_id,
|
| 392 |
+
repo_type=repo_type,
|
| 393 |
+
revision=revision,
|
| 394 |
+
endpoint=endpoint,
|
| 395 |
+
headers=headers,
|
| 396 |
+
token=None, # already passed in 'headers'
|
| 397 |
+
transfers=transfers,
|
| 398 |
+
)
|
| 399 |
+
if errors_chunk:
|
| 400 |
+
message = "\n".join(
|
| 401 |
+
[
|
| 402 |
+
f"Encountered error for file with OID {err.get('oid')}: `{err.get('error', {}).get('message')}"
|
| 403 |
+
for err in errors_chunk
|
| 404 |
+
]
|
| 405 |
+
)
|
| 406 |
+
raise ValueError(f"LFS batch API returned errors:\n{message}")
|
| 407 |
+
|
| 408 |
+
# If server returns a transfer we didn't offer (e.g "xet" while uploading from BytesIO),
|
| 409 |
+
# fall back to LFS for this chunk.
|
| 410 |
+
if chosen_transfer == "xet" and ("xet" in transfers):
|
| 411 |
+
xet_additions.extend(chunk_list)
|
| 412 |
+
else:
|
| 413 |
+
lfs_actions.extend(actions_chunk)
|
| 414 |
+
for op in chunk_list:
|
| 415 |
+
lfs_oid2addop[op.upload_info.sha256.hex()] = op
|
| 416 |
+
|
| 417 |
+
if len(lfs_actions) > 0:
|
| 418 |
+
_upload_lfs_files(
|
| 419 |
+
actions=lfs_actions,
|
| 420 |
+
oid2addop=lfs_oid2addop,
|
| 421 |
+
headers=headers,
|
| 422 |
+
endpoint=endpoint,
|
| 423 |
+
num_threads=num_threads,
|
| 424 |
+
)
|
| 425 |
+
|
| 426 |
+
if len(xet_additions) > 0:
|
| 427 |
+
_upload_xet_files(
|
| 428 |
+
additions=xet_additions,
|
| 429 |
+
repo_type=repo_type,
|
| 430 |
+
repo_id=repo_id,
|
| 431 |
+
headers=headers,
|
| 432 |
+
endpoint=endpoint,
|
| 433 |
+
revision=revision,
|
| 434 |
+
create_pr=create_pr,
|
| 435 |
+
)
|
| 436 |
+
|
| 437 |
+
|
| 438 |
+
@validate_hf_hub_args
|
| 439 |
+
def _upload_lfs_files(
|
| 440 |
+
*,
|
| 441 |
+
actions: List[Dict],
|
| 442 |
+
oid2addop: Dict[str, CommitOperationAdd],
|
| 443 |
+
headers: Dict[str, str],
|
| 444 |
+
endpoint: Optional[str] = None,
|
| 445 |
+
num_threads: int = 5,
|
| 446 |
+
):
|
| 447 |
+
"""
|
| 448 |
+
Uploads the content of `additions` to the Hub using the large file storage protocol.
|
| 449 |
+
|
| 450 |
+
Relevant external documentation:
|
| 451 |
+
- LFS Batch API: https://github.com/git-lfs/git-lfs/blob/main/docs/api/batch.md
|
| 452 |
+
|
| 453 |
+
Args:
|
| 454 |
+
actions (`List[Dict]`):
|
| 455 |
+
LFS batch actions returned by the server.
|
| 456 |
+
oid2addop (`Dict[str, CommitOperationAdd]`):
|
| 457 |
+
A dictionary mapping the OID of the file to the corresponding `CommitOperationAdd` object.
|
| 458 |
+
headers (`Dict[str, str]`):
|
| 459 |
+
Headers to use for the request, including authorization headers and user agent.
|
| 460 |
+
endpoint (`str`, *optional*):
|
| 461 |
+
The endpoint to use for the request. Defaults to `constants.ENDPOINT`.
|
| 462 |
+
num_threads (`int`, *optional*):
|
| 463 |
+
The number of concurrent threads to use when uploading. Defaults to 5.
|
| 464 |
+
|
| 465 |
+
Raises:
|
| 466 |
+
[`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
|
| 467 |
+
If an upload failed for any reason
|
| 468 |
+
[`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
|
| 469 |
+
Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
|
| 470 |
+
repo_id (`str`):
|
| 471 |
+
A namespace (user or an organization) and a repo name separated
|
| 472 |
+
by a `/`.
|
| 473 |
+
headers (`Dict[str, str]`):
|
| 474 |
+
Headers to use for the request, including authorization headers and user agent.
|
| 475 |
+
num_threads (`int`, *optional*):
|
| 476 |
+
The number of concurrent threads to use when uploading. Defaults to 5.
|
| 477 |
+
revision (`str`, *optional*):
|
| 478 |
+
The git revision to upload to.
|
| 479 |
+
|
| 480 |
+
Raises:
|
| 481 |
+
[`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
|
| 482 |
+
If an upload failed for any reason
|
| 483 |
+
[`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
|
| 484 |
+
If the server returns malformed responses
|
| 485 |
+
[`HTTPError`](https://requests.readthedocs.io/en/latest/api/#requests.HTTPError)
|
| 486 |
+
If the LFS batch endpoint returned an HTTP error.
|
| 487 |
+
"""
|
| 488 |
+
# Filter out files already present upstream
|
| 489 |
+
filtered_actions = []
|
| 490 |
+
for action in actions:
|
| 491 |
+
if action.get("actions") is None:
|
| 492 |
+
logger.debug(
|
| 493 |
+
f"Content of file {oid2addop[action['oid']].path_in_repo} is already present upstream - skipping upload."
|
| 494 |
+
)
|
| 495 |
+
else:
|
| 496 |
+
filtered_actions.append(action)
|
| 497 |
+
|
| 498 |
+
# Upload according to server-provided actions
|
| 499 |
+
def _wrapped_lfs_upload(batch_action) -> None:
|
| 500 |
+
try:
|
| 501 |
+
operation = oid2addop[batch_action["oid"]]
|
| 502 |
+
lfs_upload(operation=operation, lfs_batch_action=batch_action, headers=headers, endpoint=endpoint)
|
| 503 |
+
except Exception as exc:
|
| 504 |
+
raise RuntimeError(f"Error while uploading '{operation.path_in_repo}' to the Hub.") from exc
|
| 505 |
+
|
| 506 |
+
if constants.HF_HUB_ENABLE_HF_TRANSFER:
|
| 507 |
+
logger.debug(f"Uploading {len(filtered_actions)} LFS files to the Hub using `hf_transfer`.")
|
| 508 |
+
for action in hf_tqdm(filtered_actions, name="huggingface_hub.lfs_upload"):
|
| 509 |
+
_wrapped_lfs_upload(action)
|
| 510 |
+
elif len(filtered_actions) == 1:
|
| 511 |
+
logger.debug("Uploading 1 LFS file to the Hub")
|
| 512 |
+
_wrapped_lfs_upload(filtered_actions[0])
|
| 513 |
+
else:
|
| 514 |
+
logger.debug(
|
| 515 |
+
f"Uploading {len(filtered_actions)} LFS files to the Hub using up to {num_threads} threads concurrently"
|
| 516 |
+
)
|
| 517 |
+
thread_map(
|
| 518 |
+
_wrapped_lfs_upload,
|
| 519 |
+
filtered_actions,
|
| 520 |
+
desc=f"Upload {len(filtered_actions)} LFS files",
|
| 521 |
+
max_workers=num_threads,
|
| 522 |
+
tqdm_class=hf_tqdm,
|
| 523 |
+
)
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
@validate_hf_hub_args
|
| 527 |
+
def _upload_xet_files(
|
| 528 |
+
*,
|
| 529 |
+
additions: List[CommitOperationAdd],
|
| 530 |
+
repo_type: str,
|
| 531 |
+
repo_id: str,
|
| 532 |
+
headers: Dict[str, str],
|
| 533 |
+
endpoint: Optional[str] = None,
|
| 534 |
+
revision: Optional[str] = None,
|
| 535 |
+
create_pr: Optional[bool] = None,
|
| 536 |
+
):
|
| 537 |
+
"""
|
| 538 |
+
Uploads the content of `additions` to the Hub using the xet storage protocol.
|
| 539 |
+
This chunks the files and deduplicates the chunks before uploading them to xetcas storage.
|
| 540 |
+
|
| 541 |
+
Args:
|
| 542 |
+
additions (`List` of `CommitOperationAdd`):
|
| 543 |
+
The files to be uploaded.
|
| 544 |
+
repo_type (`str`):
|
| 545 |
+
Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
|
| 546 |
+
repo_id (`str`):
|
| 547 |
+
A namespace (user or an organization) and a repo name separated
|
| 548 |
+
by a `/`.
|
| 549 |
+
headers (`Dict[str, str]`):
|
| 550 |
+
Headers to use for the request, including authorization headers and user agent.
|
| 551 |
+
endpoint: (`str`, *optional*):
|
| 552 |
+
The endpoint to use for the xetcas service. Defaults to `constants.ENDPOINT`.
|
| 553 |
+
revision (`str`, *optional*):
|
| 554 |
+
The git revision to upload to.
|
| 555 |
+
create_pr (`bool`, *optional*):
|
| 556 |
+
Whether or not to create a Pull Request with that commit.
|
| 557 |
+
|
| 558 |
+
Raises:
|
| 559 |
+
[`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
|
| 560 |
+
If an upload failed for any reason.
|
| 561 |
+
[`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
|
| 562 |
+
If the server returns malformed responses or if the user is unauthorized to upload to xet storage.
|
| 563 |
+
[`HTTPError`](https://requests.readthedocs.io/en/latest/api/#requests.HTTPError)
|
| 564 |
+
If the LFS batch endpoint returned an HTTP error.
|
| 565 |
+
|
| 566 |
+
**How it works:**
|
| 567 |
+
The file download system uses Xet storage, which is a content-addressable storage system that breaks files into chunks
|
| 568 |
+
for efficient storage and transfer.
|
| 569 |
+
|
| 570 |
+
`hf_xet.upload_files` manages uploading files by:
|
| 571 |
+
- Taking a list of file paths to upload
|
| 572 |
+
- Breaking files into smaller chunks for efficient storage
|
| 573 |
+
- Avoiding duplicate storage by recognizing identical chunks across files
|
| 574 |
+
- Connecting to a storage server (CAS server) that manages these chunks
|
| 575 |
+
|
| 576 |
+
The upload process works like this:
|
| 577 |
+
1. Create a local folder at ~/.cache/huggingface/xet/chunk-cache to store file chunks for reuse.
|
| 578 |
+
2. Process files in parallel (up to 8 files at once):
|
| 579 |
+
2.1. Read the file content.
|
| 580 |
+
2.2. Split the file content into smaller chunks based on content patterns: each chunk gets a unique ID based on what's in it.
|
| 581 |
+
2.3. For each chunk:
|
| 582 |
+
- Check if it already exists in storage.
|
| 583 |
+
- Skip uploading chunks that already exist.
|
| 584 |
+
2.4. Group chunks into larger blocks for efficient transfer.
|
| 585 |
+
2.5. Upload these blocks to the storage server.
|
| 586 |
+
2.6. Create and upload information about how the file is structured.
|
| 587 |
+
3. Return reference files that contain information about the uploaded files, which can be used later to download them.
|
| 588 |
+
"""
|
| 589 |
+
if len(additions) == 0:
|
| 590 |
+
return
|
| 591 |
+
|
| 592 |
+
# at this point, we know that hf_xet is installed
|
| 593 |
+
from hf_xet import upload_bytes, upload_files
|
| 594 |
+
|
| 595 |
+
from .utils._xet_progress_reporting import XetProgressReporter
|
| 596 |
+
|
| 597 |
+
try:
|
| 598 |
+
xet_connection_info = fetch_xet_connection_info_from_repo_info(
|
| 599 |
+
token_type=XetTokenType.WRITE,
|
| 600 |
+
repo_id=repo_id,
|
| 601 |
+
repo_type=repo_type,
|
| 602 |
+
revision=revision,
|
| 603 |
+
headers=headers,
|
| 604 |
+
endpoint=endpoint,
|
| 605 |
+
params={"create_pr": "1"} if create_pr else None,
|
| 606 |
+
)
|
| 607 |
+
except HfHubHTTPError as e:
|
| 608 |
+
if e.response.status_code == 401:
|
| 609 |
+
raise XetAuthorizationError(
|
| 610 |
+
f"You are unauthorized to upload to xet storage for {repo_type}/{repo_id}. "
|
| 611 |
+
f"Please check that you have configured your access token with write access to the repo."
|
| 612 |
+
) from e
|
| 613 |
+
raise
|
| 614 |
+
|
| 615 |
+
xet_endpoint = xet_connection_info.endpoint
|
| 616 |
+
access_token_info = (xet_connection_info.access_token, xet_connection_info.expiration_unix_epoch)
|
| 617 |
+
|
| 618 |
+
def token_refresher() -> Tuple[str, int]:
|
| 619 |
+
new_xet_connection = fetch_xet_connection_info_from_repo_info(
|
| 620 |
+
token_type=XetTokenType.WRITE,
|
| 621 |
+
repo_id=repo_id,
|
| 622 |
+
repo_type=repo_type,
|
| 623 |
+
revision=revision,
|
| 624 |
+
headers=headers,
|
| 625 |
+
endpoint=endpoint,
|
| 626 |
+
params={"create_pr": "1"} if create_pr else None,
|
| 627 |
+
)
|
| 628 |
+
if new_xet_connection is None:
|
| 629 |
+
raise XetRefreshTokenError("Failed to refresh xet token")
|
| 630 |
+
return new_xet_connection.access_token, new_xet_connection.expiration_unix_epoch
|
| 631 |
+
|
| 632 |
+
if not are_progress_bars_disabled():
|
| 633 |
+
progress = XetProgressReporter()
|
| 634 |
+
progress_callback = progress.update_progress
|
| 635 |
+
else:
|
| 636 |
+
progress, progress_callback = None, None
|
| 637 |
+
|
| 638 |
+
try:
|
| 639 |
+
all_bytes_ops = [op for op in additions if isinstance(op.path_or_fileobj, bytes)]
|
| 640 |
+
all_paths_ops = [op for op in additions if isinstance(op.path_or_fileobj, (str, Path))]
|
| 641 |
+
|
| 642 |
+
if len(all_paths_ops) > 0:
|
| 643 |
+
all_paths = [str(op.path_or_fileobj) for op in all_paths_ops]
|
| 644 |
+
upload_files(
|
| 645 |
+
all_paths,
|
| 646 |
+
xet_endpoint,
|
| 647 |
+
access_token_info,
|
| 648 |
+
token_refresher,
|
| 649 |
+
progress_callback,
|
| 650 |
+
repo_type,
|
| 651 |
+
)
|
| 652 |
+
|
| 653 |
+
if len(all_bytes_ops) > 0:
|
| 654 |
+
all_bytes = [op.path_or_fileobj for op in all_bytes_ops]
|
| 655 |
+
upload_bytes(
|
| 656 |
+
all_bytes,
|
| 657 |
+
xet_endpoint,
|
| 658 |
+
access_token_info,
|
| 659 |
+
token_refresher,
|
| 660 |
+
progress_callback,
|
| 661 |
+
repo_type,
|
| 662 |
+
)
|
| 663 |
+
|
| 664 |
+
finally:
|
| 665 |
+
if progress is not None:
|
| 666 |
+
progress.close(False)
|
| 667 |
+
|
| 668 |
+
return
|
| 669 |
+
|
| 670 |
+
|
| 671 |
+
def _validate_preupload_info(preupload_info: dict):
|
| 672 |
+
files = preupload_info.get("files")
|
| 673 |
+
if not isinstance(files, list):
|
| 674 |
+
raise ValueError("preupload_info is improperly formatted")
|
| 675 |
+
for file_info in files:
|
| 676 |
+
if not (
|
| 677 |
+
isinstance(file_info, dict)
|
| 678 |
+
and isinstance(file_info.get("path"), str)
|
| 679 |
+
and isinstance(file_info.get("uploadMode"), str)
|
| 680 |
+
and (file_info["uploadMode"] in ("lfs", "regular"))
|
| 681 |
+
):
|
| 682 |
+
raise ValueError("preupload_info is improperly formatted:")
|
| 683 |
+
return preupload_info
|
| 684 |
+
|
| 685 |
+
|
| 686 |
+
@validate_hf_hub_args
|
| 687 |
+
def _fetch_upload_modes(
|
| 688 |
+
additions: Iterable[CommitOperationAdd],
|
| 689 |
+
repo_type: str,
|
| 690 |
+
repo_id: str,
|
| 691 |
+
headers: Dict[str, str],
|
| 692 |
+
revision: str,
|
| 693 |
+
endpoint: Optional[str] = None,
|
| 694 |
+
create_pr: bool = False,
|
| 695 |
+
gitignore_content: Optional[str] = None,
|
| 696 |
+
) -> None:
|
| 697 |
+
"""
|
| 698 |
+
Requests the Hub "preupload" endpoint to determine whether each input file should be uploaded as a regular git blob,
|
| 699 |
+
as a git LFS blob, or as a XET file. Input `additions` are mutated in-place with the upload mode.
|
| 700 |
+
|
| 701 |
+
Args:
|
| 702 |
+
additions (`Iterable` of :class:`CommitOperationAdd`):
|
| 703 |
+
Iterable of :class:`CommitOperationAdd` describing the files to
|
| 704 |
+
upload to the Hub.
|
| 705 |
+
repo_type (`str`):
|
| 706 |
+
Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
|
| 707 |
+
repo_id (`str`):
|
| 708 |
+
A namespace (user or an organization) and a repo name separated
|
| 709 |
+
by a `/`.
|
| 710 |
+
headers (`Dict[str, str]`):
|
| 711 |
+
Headers to use for the request, including authorization headers and user agent.
|
| 712 |
+
revision (`str`):
|
| 713 |
+
The git revision to upload the files to. Can be any valid git revision.
|
| 714 |
+
gitignore_content (`str`, *optional*):
|
| 715 |
+
The content of the `.gitignore` file to know which files should be ignored. The order of priority
|
| 716 |
+
is to first check if `gitignore_content` is passed, then check if the `.gitignore` file is present
|
| 717 |
+
in the list of files to commit and finally default to the `.gitignore` file already hosted on the Hub
|
| 718 |
+
(if any).
|
| 719 |
+
Raises:
|
| 720 |
+
[`~utils.HfHubHTTPError`]
|
| 721 |
+
If the Hub API returned an error.
|
| 722 |
+
[`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
|
| 723 |
+
If the Hub API response is improperly formatted.
|
| 724 |
+
"""
|
| 725 |
+
endpoint = endpoint if endpoint is not None else constants.ENDPOINT
|
| 726 |
+
|
| 727 |
+
# Fetch upload mode (LFS or regular) chunk by chunk.
|
| 728 |
+
upload_modes: Dict[str, UploadMode] = {}
|
| 729 |
+
should_ignore_info: Dict[str, bool] = {}
|
| 730 |
+
oid_info: Dict[str, Optional[str]] = {}
|
| 731 |
+
|
| 732 |
+
for chunk in chunk_iterable(additions, 256):
|
| 733 |
+
payload: Dict = {
|
| 734 |
+
"files": [
|
| 735 |
+
{
|
| 736 |
+
"path": op.path_in_repo,
|
| 737 |
+
"sample": base64.b64encode(op.upload_info.sample).decode("ascii"),
|
| 738 |
+
"size": op.upload_info.size,
|
| 739 |
+
}
|
| 740 |
+
for op in chunk
|
| 741 |
+
]
|
| 742 |
+
}
|
| 743 |
+
if gitignore_content is not None:
|
| 744 |
+
payload["gitIgnore"] = gitignore_content
|
| 745 |
+
|
| 746 |
+
resp = get_session().post(
|
| 747 |
+
f"{endpoint}/api/{repo_type}s/{repo_id}/preupload/{revision}",
|
| 748 |
+
json=payload,
|
| 749 |
+
headers=headers,
|
| 750 |
+
params={"create_pr": "1"} if create_pr else None,
|
| 751 |
+
)
|
| 752 |
+
hf_raise_for_status(resp)
|
| 753 |
+
preupload_info = _validate_preupload_info(resp.json())
|
| 754 |
+
upload_modes.update(**{file["path"]: file["uploadMode"] for file in preupload_info["files"]})
|
| 755 |
+
should_ignore_info.update(**{file["path"]: file["shouldIgnore"] for file in preupload_info["files"]})
|
| 756 |
+
oid_info.update(**{file["path"]: file.get("oid") for file in preupload_info["files"]})
|
| 757 |
+
|
| 758 |
+
# Set upload mode for each addition operation
|
| 759 |
+
for addition in additions:
|
| 760 |
+
addition._upload_mode = upload_modes[addition.path_in_repo]
|
| 761 |
+
addition._should_ignore = should_ignore_info[addition.path_in_repo]
|
| 762 |
+
addition._remote_oid = oid_info[addition.path_in_repo]
|
| 763 |
+
|
| 764 |
+
# Empty files cannot be uploaded as LFS (S3 would fail with a 501 Not Implemented)
|
| 765 |
+
# => empty files are uploaded as "regular" to still allow users to commit them.
|
| 766 |
+
for addition in additions:
|
| 767 |
+
if addition.upload_info.size == 0:
|
| 768 |
+
addition._upload_mode = "regular"
|
| 769 |
+
|
| 770 |
+
|
| 771 |
+
@validate_hf_hub_args
|
| 772 |
+
def _fetch_files_to_copy(
|
| 773 |
+
copies: Iterable[CommitOperationCopy],
|
| 774 |
+
repo_type: str,
|
| 775 |
+
repo_id: str,
|
| 776 |
+
headers: Dict[str, str],
|
| 777 |
+
revision: str,
|
| 778 |
+
endpoint: Optional[str] = None,
|
| 779 |
+
) -> Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]]:
|
| 780 |
+
"""
|
| 781 |
+
Fetch information about the files to copy.
|
| 782 |
+
|
| 783 |
+
For LFS files, we only need their metadata (file size and sha256) while for regular files
|
| 784 |
+
we need to download the raw content from the Hub.
|
| 785 |
+
|
| 786 |
+
Args:
|
| 787 |
+
copies (`Iterable` of :class:`CommitOperationCopy`):
|
| 788 |
+
Iterable of :class:`CommitOperationCopy` describing the files to
|
| 789 |
+
copy on the Hub.
|
| 790 |
+
repo_type (`str`):
|
| 791 |
+
Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
|
| 792 |
+
repo_id (`str`):
|
| 793 |
+
A namespace (user or an organization) and a repo name separated
|
| 794 |
+
by a `/`.
|
| 795 |
+
headers (`Dict[str, str]`):
|
| 796 |
+
Headers to use for the request, including authorization headers and user agent.
|
| 797 |
+
revision (`str`):
|
| 798 |
+
The git revision to upload the files to. Can be any valid git revision.
|
| 799 |
+
|
| 800 |
+
Returns: `Dict[Tuple[str, Optional[str]], Union[RepoFile, bytes]]]`
|
| 801 |
+
Key is the file path and revision of the file to copy.
|
| 802 |
+
Value is the raw content as bytes (for regular files) or the file information as a RepoFile (for LFS files).
|
| 803 |
+
|
| 804 |
+
Raises:
|
| 805 |
+
[`~utils.HfHubHTTPError`]
|
| 806 |
+
If the Hub API returned an error.
|
| 807 |
+
[`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
|
| 808 |
+
If the Hub API response is improperly formatted.
|
| 809 |
+
"""
|
| 810 |
+
from .hf_api import HfApi, RepoFolder
|
| 811 |
+
|
| 812 |
+
hf_api = HfApi(endpoint=endpoint, headers=headers)
|
| 813 |
+
files_to_copy: Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]] = {}
|
| 814 |
+
# Store (path, revision) -> oid mapping
|
| 815 |
+
oid_info: Dict[Tuple[str, Optional[str]], Optional[str]] = {}
|
| 816 |
+
# 1. Fetch OIDs for destination paths in batches.
|
| 817 |
+
dest_paths = [op.path_in_repo for op in copies]
|
| 818 |
+
for offset in range(0, len(dest_paths), FETCH_LFS_BATCH_SIZE):
|
| 819 |
+
dest_repo_files = hf_api.get_paths_info(
|
| 820 |
+
repo_id=repo_id,
|
| 821 |
+
paths=dest_paths[offset : offset + FETCH_LFS_BATCH_SIZE],
|
| 822 |
+
revision=revision,
|
| 823 |
+
repo_type=repo_type,
|
| 824 |
+
)
|
| 825 |
+
for file in dest_repo_files:
|
| 826 |
+
if not isinstance(file, RepoFolder):
|
| 827 |
+
oid_info[(file.path, revision)] = file.blob_id
|
| 828 |
+
|
| 829 |
+
# 2. Group by source revision and fetch source file info in batches.
|
| 830 |
+
for src_revision, operations in groupby(copies, key=lambda op: op.src_revision):
|
| 831 |
+
operations = list(operations) # type: ignore
|
| 832 |
+
src_paths = [op.src_path_in_repo for op in operations]
|
| 833 |
+
for offset in range(0, len(src_paths), FETCH_LFS_BATCH_SIZE):
|
| 834 |
+
src_repo_files = hf_api.get_paths_info(
|
| 835 |
+
repo_id=repo_id,
|
| 836 |
+
paths=src_paths[offset : offset + FETCH_LFS_BATCH_SIZE],
|
| 837 |
+
revision=src_revision or revision,
|
| 838 |
+
repo_type=repo_type,
|
| 839 |
+
)
|
| 840 |
+
|
| 841 |
+
for src_repo_file in src_repo_files:
|
| 842 |
+
if isinstance(src_repo_file, RepoFolder):
|
| 843 |
+
raise NotImplementedError("Copying a folder is not implemented.")
|
| 844 |
+
oid_info[(src_repo_file.path, src_revision)] = src_repo_file.blob_id
|
| 845 |
+
# If it's an LFS file, store the RepoFile object. Otherwise, download raw bytes.
|
| 846 |
+
if src_repo_file.lfs:
|
| 847 |
+
files_to_copy[(src_repo_file.path, src_revision)] = src_repo_file
|
| 848 |
+
else:
|
| 849 |
+
# TODO: (optimization) download regular files to copy concurrently
|
| 850 |
+
url = hf_hub_url(
|
| 851 |
+
endpoint=endpoint,
|
| 852 |
+
repo_type=repo_type,
|
| 853 |
+
repo_id=repo_id,
|
| 854 |
+
revision=src_revision or revision,
|
| 855 |
+
filename=src_repo_file.path,
|
| 856 |
+
)
|
| 857 |
+
response = get_session().get(url, headers=headers)
|
| 858 |
+
hf_raise_for_status(response)
|
| 859 |
+
files_to_copy[(src_repo_file.path, src_revision)] = response.content
|
| 860 |
+
# 3. Ensure all operations found a corresponding file in the Hub
|
| 861 |
+
# and track src/dest OIDs for each operation.
|
| 862 |
+
for operation in operations:
|
| 863 |
+
if (operation.src_path_in_repo, src_revision) not in files_to_copy:
|
| 864 |
+
raise EntryNotFoundError(
|
| 865 |
+
f"Cannot copy {operation.src_path_in_repo} at revision "
|
| 866 |
+
f"{src_revision or revision}: file is missing on repo."
|
| 867 |
+
)
|
| 868 |
+
operation._src_oid = oid_info.get((operation.src_path_in_repo, operation.src_revision))
|
| 869 |
+
operation._dest_oid = oid_info.get((operation.path_in_repo, revision))
|
| 870 |
+
return files_to_copy
|
| 871 |
+
|
| 872 |
+
|
| 873 |
+
def _prepare_commit_payload(
|
| 874 |
+
operations: Iterable[CommitOperation],
|
| 875 |
+
files_to_copy: Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]],
|
| 876 |
+
commit_message: str,
|
| 877 |
+
commit_description: Optional[str] = None,
|
| 878 |
+
parent_commit: Optional[str] = None,
|
| 879 |
+
) -> Iterable[Dict[str, Any]]:
|
| 880 |
+
"""
|
| 881 |
+
Builds the payload to POST to the `/commit` API of the Hub.
|
| 882 |
+
|
| 883 |
+
Payload is returned as an iterator so that it can be streamed as a ndjson in the
|
| 884 |
+
POST request.
|
| 885 |
+
|
| 886 |
+
For more information, see:
|
| 887 |
+
- https://github.com/huggingface/huggingface_hub/issues/1085#issuecomment-1265208073
|
| 888 |
+
- http://ndjson.org/
|
| 889 |
+
"""
|
| 890 |
+
commit_description = commit_description if commit_description is not None else ""
|
| 891 |
+
|
| 892 |
+
# 1. Send a header item with the commit metadata
|
| 893 |
+
header_value = {"summary": commit_message, "description": commit_description}
|
| 894 |
+
if parent_commit is not None:
|
| 895 |
+
header_value["parentCommit"] = parent_commit
|
| 896 |
+
yield {"key": "header", "value": header_value}
|
| 897 |
+
|
| 898 |
+
nb_ignored_files = 0
|
| 899 |
+
|
| 900 |
+
# 2. Send operations, one per line
|
| 901 |
+
for operation in operations:
|
| 902 |
+
# Skip ignored files
|
| 903 |
+
if isinstance(operation, CommitOperationAdd) and operation._should_ignore:
|
| 904 |
+
logger.debug(f"Skipping file '{operation.path_in_repo}' in commit (ignored by gitignore file).")
|
| 905 |
+
nb_ignored_files += 1
|
| 906 |
+
continue
|
| 907 |
+
|
| 908 |
+
# 2.a. Case adding a regular file
|
| 909 |
+
if isinstance(operation, CommitOperationAdd) and operation._upload_mode == "regular":
|
| 910 |
+
yield {
|
| 911 |
+
"key": "file",
|
| 912 |
+
"value": {
|
| 913 |
+
"content": operation.b64content().decode(),
|
| 914 |
+
"path": operation.path_in_repo,
|
| 915 |
+
"encoding": "base64",
|
| 916 |
+
},
|
| 917 |
+
}
|
| 918 |
+
# 2.b. Case adding an LFS file
|
| 919 |
+
elif isinstance(operation, CommitOperationAdd) and operation._upload_mode == "lfs":
|
| 920 |
+
yield {
|
| 921 |
+
"key": "lfsFile",
|
| 922 |
+
"value": {
|
| 923 |
+
"path": operation.path_in_repo,
|
| 924 |
+
"algo": "sha256",
|
| 925 |
+
"oid": operation.upload_info.sha256.hex(),
|
| 926 |
+
"size": operation.upload_info.size,
|
| 927 |
+
},
|
| 928 |
+
}
|
| 929 |
+
# 2.c. Case deleting a file or folder
|
| 930 |
+
elif isinstance(operation, CommitOperationDelete):
|
| 931 |
+
yield {
|
| 932 |
+
"key": "deletedFolder" if operation.is_folder else "deletedFile",
|
| 933 |
+
"value": {"path": operation.path_in_repo},
|
| 934 |
+
}
|
| 935 |
+
# 2.d. Case copying a file or folder
|
| 936 |
+
elif isinstance(operation, CommitOperationCopy):
|
| 937 |
+
file_to_copy = files_to_copy[(operation.src_path_in_repo, operation.src_revision)]
|
| 938 |
+
if isinstance(file_to_copy, bytes):
|
| 939 |
+
yield {
|
| 940 |
+
"key": "file",
|
| 941 |
+
"value": {
|
| 942 |
+
"content": base64.b64encode(file_to_copy).decode(),
|
| 943 |
+
"path": operation.path_in_repo,
|
| 944 |
+
"encoding": "base64",
|
| 945 |
+
},
|
| 946 |
+
}
|
| 947 |
+
elif file_to_copy.lfs:
|
| 948 |
+
yield {
|
| 949 |
+
"key": "lfsFile",
|
| 950 |
+
"value": {
|
| 951 |
+
"path": operation.path_in_repo,
|
| 952 |
+
"algo": "sha256",
|
| 953 |
+
"oid": file_to_copy.lfs.sha256,
|
| 954 |
+
},
|
| 955 |
+
}
|
| 956 |
+
else:
|
| 957 |
+
raise ValueError(
|
| 958 |
+
"Malformed files_to_copy (should be raw file content as bytes or RepoFile objects with LFS info."
|
| 959 |
+
)
|
| 960 |
+
# 2.e. Never expected to happen
|
| 961 |
+
else:
|
| 962 |
+
raise ValueError(
|
| 963 |
+
f"Unknown operation to commit. Operation: {operation}. Upload mode:"
|
| 964 |
+
f" {getattr(operation, '_upload_mode', None)}"
|
| 965 |
+
)
|
| 966 |
+
|
| 967 |
+
if nb_ignored_files > 0:
|
| 968 |
+
logger.info(f"Skipped {nb_ignored_files} file(s) in commit (ignored by gitignore file).")
|
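The operation types above are not used directly against the Hub; they are composed into a single call to the public `HfApi.create_commit` entry point, which drives `_fetch_upload_modes`, `_upload_files` and `_prepare_commit_payload` internally. A minimal usage sketch (the repo id, paths and commit message are illustrative placeholders, not taken from this file):

```python
from huggingface_hub import (
    CommitOperationAdd,
    CommitOperationCopy,
    CommitOperationDelete,
    HfApi,
)

api = HfApi()
api.create_commit(
    repo_id="user/my-dataset",  # hypothetical repo
    repo_type="dataset",
    operations=[
        # Upload a local file, copy an existing LFS file, and delete an old
        # folder, all within a single atomic commit.
        CommitOperationAdd(path_in_repo="data/train.csv", path_or_fileobj="./train.csv"),
        CommitOperationCopy(src_path_in_repo="weights.bin", path_in_repo="backup/weights.bin"),
        CommitOperationDelete(path_in_repo="old-checkpoints/", is_folder=True),
    ],
    commit_message="Add data, back up weights, drop old checkpoints",
)
```

Batching operations into one commit also gives `_warn_on_overwriting_operations` a chance to flag self-overwriting sequences before anything is uploaded.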
venv/lib/python3.13/site-packages/huggingface_hub/_commit_scheduler.py
ADDED
@@ -0,0 +1,350 @@
import atexit
import logging
import os
import time
from concurrent.futures import Future
from dataclasses import dataclass
from io import SEEK_END, SEEK_SET, BytesIO
from pathlib import Path
from threading import Lock, Thread
from typing import Dict, List, Optional, Union

from .hf_api import DEFAULT_IGNORE_PATTERNS, CommitInfo, CommitOperationAdd, HfApi
from .utils import filter_repo_objects


logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class _FileToUpload:
    """Temporary dataclass to store info about files to upload. Not meant to be used directly."""

    local_path: Path
    path_in_repo: str
    size_limit: int
    last_modified: float


class CommitScheduler:
    """
    Scheduler to upload a local folder to the Hub at regular intervals (e.g. push to hub every 5 minutes).

    The recommended way to use the scheduler is to use it as a context manager. This ensures that the scheduler is
    properly stopped and the last commit is triggered when the script ends. The scheduler can also be stopped manually
    with the `stop` method. Check out the [upload guide](https://huggingface.co/docs/huggingface_hub/guides/upload#scheduled-uploads)
    to learn more about how to use it.

    Args:
        repo_id (`str`):
            The id of the repo to commit to.
        folder_path (`str` or `Path`):
            Path to the local folder to upload regularly.
        every (`int` or `float`, *optional*):
            The number of minutes between each commit. Defaults to 5 minutes.
        path_in_repo (`str`, *optional*):
            Relative path of the directory in the repo, for example: `"checkpoints/"`. Defaults to the root folder
            of the repository.
        repo_type (`str`, *optional*):
            The type of the repo to commit to. Defaults to `model`.
        revision (`str`, *optional*):
            The revision of the repo to commit to. Defaults to `main`.
        private (`bool`, *optional*):
            Whether to make the repo private. If `None` (default), the repo will be public unless the
            organization's default is private. This value is ignored if the repo already exists.
        token (`str`, *optional*):
            The token to use to commit to the repo. Defaults to the token saved on the machine.
        allow_patterns (`List[str]` or `str`, *optional*):
            If provided, only files matching at least one pattern are uploaded.
        ignore_patterns (`List[str]` or `str`, *optional*):
            If provided, files matching any of the patterns are not uploaded.
        squash_history (`bool`, *optional*):
            Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
            useful to avoid degraded performances on the repo when it grows too large.
        hf_api (`HfApi`, *optional*):
            The [`HfApi`] client to use to commit to the Hub. Can be set with custom settings (user agent, token,...).

    Example:
    ```py
    >>> from pathlib import Path
    >>> from huggingface_hub import CommitScheduler

    # Scheduler uploads every 10 minutes
    >>> csv_path = Path("watched_folder/data.csv")
    >>> CommitScheduler(repo_id="test_scheduler", repo_type="dataset", folder_path=csv_path.parent, every=10)

    >>> with csv_path.open("a") as f:
    ...     f.write("first line")

    # Some time later (...)
    >>> with csv_path.open("a") as f:
    ...     f.write("second line")
    ```

    Example using a context manager:
    ```py
    >>> from pathlib import Path
    >>> from huggingface_hub import CommitScheduler

    >>> with CommitScheduler(repo_id="test_scheduler", repo_type="dataset", folder_path="watched_folder", every=10) as scheduler:
    ...     csv_path = Path("watched_folder/data.csv")
    ...     with csv_path.open("a") as f:
    ...         f.write("first line")
    ...     (...)
    ...     with csv_path.open("a") as f:
    ...         f.write("second line")

    # Scheduler is now stopped and the last commit has been triggered
    ```
    """

    def __init__(
        self,
        *,
        repo_id: str,
        folder_path: Union[str, Path],
        every: Union[int, float] = 5,
        path_in_repo: Optional[str] = None,
        repo_type: Optional[str] = None,
        revision: Optional[str] = None,
        private: Optional[bool] = None,
        token: Optional[str] = None,
        allow_patterns: Optional[Union[List[str], str]] = None,
        ignore_patterns: Optional[Union[List[str], str]] = None,
        squash_history: bool = False,
        hf_api: Optional["HfApi"] = None,
    ) -> None:
        self.api = hf_api or HfApi(token=token)

        # Folder
        self.folder_path = Path(folder_path).expanduser().resolve()
        self.path_in_repo = path_in_repo or ""
        self.allow_patterns = allow_patterns

        if ignore_patterns is None:
            ignore_patterns = []
        elif isinstance(ignore_patterns, str):
            ignore_patterns = [ignore_patterns]
        self.ignore_patterns = ignore_patterns + DEFAULT_IGNORE_PATTERNS

        if self.folder_path.is_file():
            raise ValueError(f"'folder_path' must be a directory, not a file: '{self.folder_path}'.")
        self.folder_path.mkdir(parents=True, exist_ok=True)

        # Repository
        repo_url = self.api.create_repo(repo_id=repo_id, private=private, repo_type=repo_type, exist_ok=True)
        self.repo_id = repo_url.repo_id
        self.repo_type = repo_type
        self.revision = revision
        self.token = token

        # Keep track of already uploaded files
        self.last_uploaded: Dict[Path, float] = {}  # key is local path, value is timestamp

        # Scheduler
        if not every > 0:
            raise ValueError(f"'every' must be a positive integer, not '{every}'.")
        self.lock = Lock()
        self.every = every
        self.squash_history = squash_history

        logger.info(f"Scheduled job to push '{self.folder_path}' to '{self.repo_id}' every {self.every} minutes.")
        self._scheduler_thread = Thread(target=self._run_scheduler, daemon=True)
        self._scheduler_thread.start()
        atexit.register(self._push_to_hub)

        self.__stopped = False

    def stop(self) -> None:
        """Stop the scheduler.

        A stopped scheduler cannot be restarted. Mostly for testing purposes.
        """
        self.__stopped = True

    def __enter__(self) -> "CommitScheduler":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Upload last changes before exiting
        self.trigger().result()
        self.stop()
        return

    def _run_scheduler(self) -> None:
        """Dumb thread waiting between each scheduled push to Hub."""
        while True:
            self.last_future = self.trigger()
            time.sleep(self.every * 60)
            if self.__stopped:
                break

    def trigger(self) -> Future:
        """Trigger a `push_to_hub` and return a future.

        This method is automatically called every `every` minutes. You can also call it manually to trigger a commit
        immediately, without waiting for the next scheduled commit.
        """
        return self.api.run_as_future(self._push_to_hub)

    def _push_to_hub(self) -> Optional[CommitInfo]:
        if self.__stopped:  # If stopped, already scheduled commits are ignored
            return None

        logger.info("(Background) scheduled commit triggered.")
        try:
            value = self.push_to_hub()
            if self.squash_history:
                logger.info("(Background) squashing repo history.")
                self.api.super_squash_history(repo_id=self.repo_id, repo_type=self.repo_type, branch=self.revision)
            return value
        except Exception as e:
            logger.error(f"Error while pushing to Hub: {e}")  # Depending on the setup, error might be silenced
            raise

    def push_to_hub(self) -> Optional[CommitInfo]:
        """
        Push folder to the Hub and return the commit info.

        > [!WARNING]
        > This method is not meant to be called directly. It is run in the background by the scheduler, respecting a
        > queue mechanism to avoid concurrent commits. Making a direct call to the method might lead to concurrency
        > issues.

        The default behavior of `push_to_hub` is to assume an append-only folder. It lists all files in the folder and
        uploads only changed files. If no changes are found, the method returns without committing anything. If you want
        to change this behavior, you can inherit from [`CommitScheduler`] and override this method. This can be useful
        for example to compress data together in a single file before committing. For more details and examples, check
        out our [integration guide](https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#scheduled-uploads).
        """
        # Check files to upload (with lock)
        with self.lock:
            logger.debug("Listing files to upload for scheduled commit.")

            # List files from folder (taken from `_prepare_upload_folder_additions`)
            relpath_to_abspath = {
                path.relative_to(self.folder_path).as_posix(): path
                for path in sorted(self.folder_path.glob("**/*"))  # sorted to be deterministic
                if path.is_file()
            }
            prefix = f"{self.path_in_repo.strip('/')}/" if self.path_in_repo else ""

            # Filter with pattern + filter out unchanged files + retrieve current file size
            files_to_upload: List[_FileToUpload] = []
            for relpath in filter_repo_objects(
                relpath_to_abspath.keys(), allow_patterns=self.allow_patterns, ignore_patterns=self.ignore_patterns
            ):
                local_path = relpath_to_abspath[relpath]
                stat = local_path.stat()
                if self.last_uploaded.get(local_path) is None or self.last_uploaded[local_path] != stat.st_mtime:
                    files_to_upload.append(
                        _FileToUpload(
                            local_path=local_path,
                            path_in_repo=prefix + relpath,
                            size_limit=stat.st_size,
                            last_modified=stat.st_mtime,
                        )
                    )

        # Return if nothing to upload
        if len(files_to_upload) == 0:
            logger.debug("Dropping scheduled commit: no changed file to upload.")
            return None

        # Convert `_FileToUpload` to `CommitOperationAdd` (=> compute file shas + limit to file size)
        logger.debug("Removing unchanged files since previous scheduled commit.")
        add_operations = [
            CommitOperationAdd(
                # Cap the file to its current size, even if the user appends data to it while a scheduled commit is happening
                path_or_fileobj=PartialFileIO(file_to_upload.local_path, size_limit=file_to_upload.size_limit),
                path_in_repo=file_to_upload.path_in_repo,
            )
            for file_to_upload in files_to_upload
        ]

        # Upload files (append mode expected - no need for lock)
        logger.debug("Uploading files for scheduled commit.")
        commit_info = self.api.create_commit(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            operations=add_operations,
            commit_message="Scheduled Commit",
            revision=self.revision,
        )

        # Successful commit: keep track of the latest "last_modified" for each file
        for file in files_to_upload:
            self.last_uploaded[file.local_path] = file.last_modified
        return commit_info


class PartialFileIO(BytesIO):
    """A file-like object that reads only the first part of a file.

    Useful to upload a file to the Hub when the user might still be appending data to it. Only the first part of the
    file is uploaded (i.e. the part that was available when the filesystem was first scanned).

    In practice, only used internally by the CommitScheduler to regularly push a folder to the Hub with minimal
    disturbance for the user. The object is passed to `CommitOperationAdd`.

    Only supports `read`, `tell` and `seek` methods.

    Args:
        file_path (`str` or `Path`):
            Path to the file to read.
        size_limit (`int`):
            The maximum number of bytes to read from the file. If the file is larger than this, only the first part
            will be read (and uploaded).
    """

    def __init__(self, file_path: Union[str, Path], size_limit: int) -> None:
        self._file_path = Path(file_path)
        self._file = self._file_path.open("rb")
        self._size_limit = min(size_limit, os.fstat(self._file.fileno()).st_size)

    def __del__(self) -> None:
        self._file.close()
        return super().__del__()

    def __repr__(self) -> str:
        return f"<PartialFileIO file_path={self._file_path} size_limit={self._size_limit}>"

    def __len__(self) -> int:
        return self._size_limit

    def __getattribute__(self, name: str):
        if name.startswith("_") or name in ("read", "tell", "seek"):  # only 3 public methods supported
            return super().__getattribute__(name)
        raise NotImplementedError(f"PartialFileIO does not support '{name}'.")

    def tell(self) -> int:
        """Return the current file position."""
        return self._file.tell()

    def seek(self, __offset: int, __whence: int = SEEK_SET) -> int:
        """Change the stream position to the given offset.

        Behavior is the same as a regular file, except that the position is capped to the size limit.
        """
        if __whence == SEEK_END:
            # SEEK_END => set from the truncated end
            __offset = len(self) + __offset
            __whence = SEEK_SET

        pos = self._file.seek(__offset, __whence)
        if pos > self._size_limit:
            return self._file.seek(self._size_limit)
        return pos

    def read(self, __size: Optional[int] = -1) -> bytes:
        """Read at most `__size` bytes from the file.

        Behavior is the same as a regular file, except that reads are capped to the size limit.
        """
        current = self._file.tell()
        if __size is None or __size < 0:
            # Read until the (truncated) end of the file
            truncated_size = self._size_limit - current
        else:
            # Read until the (truncated) end of the file, or up to `__size` bytes
            truncated_size = min(__size, self._size_limit - current)
        return self._file.read(truncated_size)
+
|
| 341 |
+
Behavior is the same as a regular file, except that it is capped to the size limit.
|
| 342 |
+
"""
|
| 343 |
+
current = self._file.tell()
|
| 344 |
+
if __size is None or __size < 0:
|
| 345 |
+
# Read until file limit
|
| 346 |
+
truncated_size = self._size_limit - current
|
| 347 |
+
else:
|
| 348 |
+
# Read until file limit or __size
|
| 349 |
+
truncated_size = min(__size, self._size_limit - current)
|
| 350 |
+
return self._file.read(truncated_size)
|
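Taken together, `push_to_hub` records each file's size under the lock and `PartialFileIO` guarantees the upload never reads past that size, so bytes appended while a commit is in flight simply land in the next scheduled commit. A minimal sketch of the capping behavior, assuming the vendored module above is importable (`data.txt` is a hypothetical scratch file):

```python
# Sketch only: demonstrate that PartialFileIO ignores bytes beyond size_limit.
from huggingface_hub._commit_scheduler import PartialFileIO

with open("data.txt", "wb") as f:  # hypothetical scratch file
    f.write(b"hello world")  # 11 bytes on disk

partial = PartialFileIO("data.txt", size_limit=5)  # pretend the folder scan saw 5 bytes
assert len(partial) == 5  # capped length, not the on-disk size
assert partial.read() == b"hello"  # bytes past the recorded size are never read
```

Capping at read/seek time rather than copying the file keeps the scheduler cheap: the commit always matches the sizes recorded under the lock, without temporary copies.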
venv/lib/python3.13/site-packages/huggingface_hub/_inference_endpoints.py
ADDED
@@ -0,0 +1,413 @@
+import time
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from typing import TYPE_CHECKING, Dict, Optional, Union
+
+from huggingface_hub.errors import InferenceEndpointError, InferenceEndpointTimeoutError
+
+from .utils import get_session, logging, parse_datetime
+
+
+if TYPE_CHECKING:
+    from .hf_api import HfApi
+    from .inference._client import InferenceClient
+    from .inference._generated._async_client import AsyncInferenceClient
+
+logger = logging.get_logger(__name__)
+
+
+class InferenceEndpointStatus(str, Enum):
+    PENDING = "pending"
+    INITIALIZING = "initializing"
+    UPDATING = "updating"
+    UPDATE_FAILED = "updateFailed"
+    RUNNING = "running"
+    PAUSED = "paused"
+    FAILED = "failed"
+    SCALED_TO_ZERO = "scaledToZero"
+
+
+class InferenceEndpointType(str, Enum):
+    PUBLIC = "public"
+    PROTECTED = "protected"
+    PRIVATE = "private"
+
+
+@dataclass
+class InferenceEndpoint:
+    """
+    Contains information about a deployed Inference Endpoint.
+
+    Args:
+        name (`str`):
+            The unique name of the Inference Endpoint.
+        namespace (`str`):
+            The namespace where the Inference Endpoint is located.
+        repository (`str`):
+            The name of the model repository deployed on this Inference Endpoint.
+        status ([`InferenceEndpointStatus`]):
+            The current status of the Inference Endpoint.
+        url (`str`, *optional*):
+            The URL of the Inference Endpoint, if available. Only a deployed Inference Endpoint will have a URL.
+        framework (`str`):
+            The machine learning framework used for the model.
+        revision (`str`):
+            The specific model revision deployed on the Inference Endpoint.
+        task (`str`):
+            The task associated with the deployed model.
+        created_at (`datetime.datetime`):
+            The timestamp when the Inference Endpoint was created.
+        updated_at (`datetime.datetime`):
+            The timestamp of the last update of the Inference Endpoint.
+        type ([`InferenceEndpointType`]):
+            The type of the Inference Endpoint (public, protected, private).
+        raw (`Dict`):
+            The raw dictionary data returned from the API.
+        token (`str` or `bool`, *optional*):
+            Authentication token for the Inference Endpoint, if set when requesting the API. Will default to the
+            locally saved token if not provided. Pass `token=False` if you don't want to send your token to the server.
+
+    Example:
+    ```python
+    >>> from huggingface_hub import get_inference_endpoint
+    >>> endpoint = get_inference_endpoint("my-text-to-image")
+    >>> endpoint
+    InferenceEndpoint(name='my-text-to-image', ...)
+
+    # Get status
+    >>> endpoint.status
+    'running'
+    >>> endpoint.url
+    'https://my-text-to-image.region.vendor.endpoints.huggingface.cloud'
+
+    # Run inference
+    >>> endpoint.client.text_to_image(...)
+
+    # Pause endpoint to save $$$
+    >>> endpoint.pause()
+
+    # ...
+    # Resume and wait for deployment
+    >>> endpoint.resume()
+    >>> endpoint.wait()
+    >>> endpoint.client.text_to_image(...)
+    ```
+    """
+
+    # Fields in __repr__
+    name: str = field(init=False)
+    namespace: str
+    repository: str = field(init=False)
+    status: InferenceEndpointStatus = field(init=False)
+    health_route: str = field(init=False)
+    url: Optional[str] = field(init=False)
+
+    # Other fields
+    framework: str = field(repr=False, init=False)
+    revision: str = field(repr=False, init=False)
+    task: str = field(repr=False, init=False)
+    created_at: datetime = field(repr=False, init=False)
+    updated_at: datetime = field(repr=False, init=False)
+    type: InferenceEndpointType = field(repr=False, init=False)
+
+    # Raw dict from the API
+    raw: Dict = field(repr=False)
+
+    # Internal fields
+    _token: Union[str, bool, None] = field(repr=False, compare=False)
+    _api: "HfApi" = field(repr=False, compare=False)
+
+    @classmethod
+    def from_raw(
+        cls, raw: Dict, namespace: str, token: Union[str, bool, None] = None, api: Optional["HfApi"] = None
+    ) -> "InferenceEndpoint":
+        """Initialize object from raw dictionary."""
+        if api is None:
+            from .hf_api import HfApi
+
+            api = HfApi()
+        if token is None:
+            token = api.token
+
+        # All other fields are populated in __post_init__
+        return cls(raw=raw, namespace=namespace, _token=token, _api=api)
+
+    def __post_init__(self) -> None:
+        """Populate fields from raw dictionary."""
+        self._populate_from_raw()
+
+    @property
+    def client(self) -> "InferenceClient":
+        """Returns a client to make predictions on this Inference Endpoint.
+
+        Returns:
+            [`InferenceClient`]: an inference client pointing to the deployed endpoint.
+
+        Raises:
+            [`InferenceEndpointError`]: If the Inference Endpoint is not yet deployed.
+        """
+        if self.url is None:
+            raise InferenceEndpointError(
+                "Cannot create a client for this Inference Endpoint as it is not yet deployed. "
+                "Please wait for the Inference Endpoint to be deployed using `endpoint.wait()` and try again."
+            )
+        from .inference._client import InferenceClient
+
+        return InferenceClient(
+            model=self.url,
+            token=self._token,  # type: ignore[arg-type]  # boolean token shouldn't be possible. In practice it's ok.
+        )
+
+    @property
+    def async_client(self) -> "AsyncInferenceClient":
+        """Returns a client to make predictions on this Inference Endpoint.
+
+        Returns:
+            [`AsyncInferenceClient`]: an asyncio-compatible inference client pointing to the deployed endpoint.
+
+        Raises:
+            [`InferenceEndpointError`]: If the Inference Endpoint is not yet deployed.
+        """
+        if self.url is None:
+            raise InferenceEndpointError(
+                "Cannot create a client for this Inference Endpoint as it is not yet deployed. "
+                "Please wait for the Inference Endpoint to be deployed using `endpoint.wait()` and try again."
+            )
+        from .inference._generated._async_client import AsyncInferenceClient
+
+        return AsyncInferenceClient(
+            model=self.url,
+            token=self._token,  # type: ignore[arg-type]  # boolean token shouldn't be possible. In practice it's ok.
+        )
+
+    def wait(self, timeout: Optional[int] = None, refresh_every: int = 5) -> "InferenceEndpoint":
+        """Wait for the Inference Endpoint to be deployed.
+
+        Information from the server will be fetched every `refresh_every` seconds. If the Inference Endpoint is not
+        deployed after `timeout` seconds, an [`InferenceEndpointTimeoutError`] will be raised. The
+        [`InferenceEndpoint`] will be mutated in place with the latest data.
+
+        Args:
+            timeout (`int`, *optional*):
+                The maximum time to wait for the Inference Endpoint to be deployed, in seconds. If `None`, will wait
+                indefinitely.
+            refresh_every (`int`, *optional*):
+                The time to wait between each fetch of the Inference Endpoint status, in seconds. Defaults to 5s.
+
+        Returns:
+            [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
+
+        Raises:
+            [`InferenceEndpointError`]
+                If the Inference Endpoint ended up in a failed state.
+            [`InferenceEndpointTimeoutError`]
+                If the Inference Endpoint is not deployed after `timeout` seconds.
+        """
+        if timeout is not None and timeout < 0:
+            raise ValueError("`timeout` cannot be negative.")
+        if refresh_every <= 0:
+            raise ValueError("`refresh_every` must be positive.")
+
+        start = time.time()
+        while True:
+            if self.status == InferenceEndpointStatus.FAILED:
+                raise InferenceEndpointError(
+                    f"Inference Endpoint {self.name} failed to deploy. Please check the logs for more information."
+                )
+            if self.status == InferenceEndpointStatus.UPDATE_FAILED:
+                raise InferenceEndpointError(
+                    f"Inference Endpoint {self.name} failed to update. Please check the logs for more information."
+                )
+            if self.status == InferenceEndpointStatus.RUNNING and self.url is not None:
+                # Verify the endpoint is actually reachable
+                _health_url = f"{self.url.rstrip('/')}/{self.health_route.lstrip('/')}"
+                response = get_session().get(_health_url, headers=self._api._build_hf_headers(token=self._token))
+                if response.status_code == 200:
+                    logger.info("Inference Endpoint is ready to be used.")
+                    return self
+
+            if timeout is not None:
+                if time.time() - start > timeout:
+                    raise InferenceEndpointTimeoutError("Timeout while waiting for Inference Endpoint to be deployed.")
+            logger.info(f"Inference Endpoint is not deployed yet ({self.status}). Waiting {refresh_every}s...")
+            time.sleep(refresh_every)
+            self.fetch()
+
+    def fetch(self) -> "InferenceEndpoint":
+        """Fetch latest information about the Inference Endpoint.
+
+        Returns:
+            [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
+        """
+        obj = self._api.get_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token)  # type: ignore [arg-type]
+        self.raw = obj.raw
+        self._populate_from_raw()
+        return self
+
+    def update(
+        self,
+        *,
+        # Compute update
+        accelerator: Optional[str] = None,
+        instance_size: Optional[str] = None,
+        instance_type: Optional[str] = None,
+        min_replica: Optional[int] = None,
+        max_replica: Optional[int] = None,
+        scale_to_zero_timeout: Optional[int] = None,
+        # Model update
+        repository: Optional[str] = None,
+        framework: Optional[str] = None,
+        revision: Optional[str] = None,
+        task: Optional[str] = None,
+        custom_image: Optional[Dict] = None,
+        secrets: Optional[Dict[str, str]] = None,
+    ) -> "InferenceEndpoint":
+        """Update the Inference Endpoint.
+
+        This method allows the update of either the compute configuration, the deployed model, or both. All arguments
+        are optional but at least one must be provided.
+
+        This is an alias for [`HfApi.update_inference_endpoint`]. The current object is mutated in place with the
+        latest data from the server.
+
+        Args:
+            accelerator (`str`, *optional*):
+                The hardware accelerator to be used for inference (e.g. `"cpu"`).
+            instance_size (`str`, *optional*):
+                The size or type of the instance to be used for hosting the model (e.g. `"x4"`).
+            instance_type (`str`, *optional*):
+                The cloud instance type where the Inference Endpoint will be deployed (e.g. `"intel-icl"`).
+            min_replica (`int`, *optional*):
+                The minimum number of replicas (instances) to keep running for the Inference Endpoint.
+            max_replica (`int`, *optional*):
+                The maximum number of replicas (instances) to scale to for the Inference Endpoint.
+            scale_to_zero_timeout (`int`, *optional*):
+                The duration in minutes before an inactive endpoint is scaled to zero.
+
+            repository (`str`, *optional*):
+                The name of the model repository associated with the Inference Endpoint (e.g. `"gpt2"`).
+            framework (`str`, *optional*):
+                The machine learning framework used for the model (e.g. `"custom"`).
+            revision (`str`, *optional*):
+                The specific model revision to deploy on the Inference Endpoint (e.g. `"6c0e6080953db56375760c0471a8c5f2929baf11"`).
+            task (`str`, *optional*):
+                The task on which to deploy the model (e.g. `"text-classification"`).
+            custom_image (`Dict`, *optional*):
+                A custom Docker image to use for the Inference Endpoint. This is useful if you want to deploy an
+                Inference Endpoint running on the `text-generation-inference` (TGI) framework (see examples).
+            secrets (`Dict[str, str]`, *optional*):
+                Secret values to inject in the container environment.
+
+        Returns:
+            [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
+        """
+        # Make API call
+        obj = self._api.update_inference_endpoint(
+            name=self.name,
+            namespace=self.namespace,
+            accelerator=accelerator,
+            instance_size=instance_size,
+            instance_type=instance_type,
+            min_replica=min_replica,
+            max_replica=max_replica,
+            scale_to_zero_timeout=scale_to_zero_timeout,
+            repository=repository,
+            framework=framework,
+            revision=revision,
+            task=task,
+            custom_image=custom_image,
+            secrets=secrets,
+            token=self._token,  # type: ignore [arg-type]
+        )
+
+        # Mutate current object
+        self.raw = obj.raw
+        self._populate_from_raw()
+        return self
+
+    def pause(self) -> "InferenceEndpoint":
+        """Pause the Inference Endpoint.
+
+        A paused Inference Endpoint will not be charged. It can be resumed at any time using
+        [`InferenceEndpoint.resume`]. This is different from scaling the Inference Endpoint to zero with
+        [`InferenceEndpoint.scale_to_zero`], which would be automatically restarted when a request is made to it.
+
+        This is an alias for [`HfApi.pause_inference_endpoint`]. The current object is mutated in place with the
+        latest data from the server.
+
+        Returns:
+            [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
+        """
+        obj = self._api.pause_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token)  # type: ignore [arg-type]
+        self.raw = obj.raw
+        self._populate_from_raw()
+        return self
+
+    def resume(self, running_ok: bool = True) -> "InferenceEndpoint":
+        """Resume the Inference Endpoint.
+
+        This is an alias for [`HfApi.resume_inference_endpoint`]. The current object is mutated in place with the
+        latest data from the server.
+
+        Args:
+            running_ok (`bool`, *optional*):
+                If `True`, the method will not raise an error if the Inference Endpoint is already running. Defaults
+                to `True`.
+
+        Returns:
+            [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
+        """
+        obj = self._api.resume_inference_endpoint(
+            name=self.name, namespace=self.namespace, running_ok=running_ok, token=self._token
+        )  # type: ignore [arg-type]
+        self.raw = obj.raw
+        self._populate_from_raw()
+        return self
+
+    def scale_to_zero(self) -> "InferenceEndpoint":
+        """Scale the Inference Endpoint to zero.
+
+        An Inference Endpoint scaled to zero will not be charged. It will be resumed on the next request to it, with
+        a cold start delay. This is different from pausing the Inference Endpoint with [`InferenceEndpoint.pause`],
+        which would require a manual resume with [`InferenceEndpoint.resume`].
+
+        This is an alias for [`HfApi.scale_to_zero_inference_endpoint`]. The current object is mutated in place with
+        the latest data from the server.
+
+        Returns:
+            [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
+        """
+        obj = self._api.scale_to_zero_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token)  # type: ignore [arg-type]
+        self.raw = obj.raw
+        self._populate_from_raw()
+        return self
+
+    def delete(self) -> None:
+        """Delete the Inference Endpoint.
+
+        This operation is not reversible. If you don't want to be charged for an Inference Endpoint, it is preferable
+        to pause it with [`InferenceEndpoint.pause`] or scale it to zero with [`InferenceEndpoint.scale_to_zero`].
+
+        This is an alias for [`HfApi.delete_inference_endpoint`].
+        """
+        self._api.delete_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token)  # type: ignore [arg-type]
+
+    def _populate_from_raw(self) -> None:
+        """Populate fields from raw dictionary.
+
+        Called in __post_init__ + each time the Inference Endpoint is updated.
+        """
+        # Repr fields
+        self.name = self.raw["name"]
+        self.repository = self.raw["model"]["repository"]
+        self.status = self.raw["status"]["state"]
+        self.url = self.raw["status"].get("url")
+        self.health_route = self.raw["healthRoute"]
+
+        # Other fields
+        self.framework = self.raw["model"]["framework"]
+        self.revision = self.raw["model"]["revision"]
+        self.task = self.raw["model"]["task"]
+        self.created_at = parse_datetime(self.raw["status"]["createdAt"])
+        self.updated_at = parse_datetime(self.raw["status"]["updatedAt"])
+        self.type = self.raw["type"]
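For orientation, this is roughly how the class above is driven end to end. A hedged sketch: `get_inference_endpoint` is the public accessor re-exported by `huggingface_hub`, while the endpoint name and the `text_generation` call are illustrative assumptions that depend on what is actually deployed:

```python
# Sketch only: typical lifecycle of an InferenceEndpoint object.
from huggingface_hub import get_inference_endpoint

endpoint = get_inference_endpoint("my-endpoint")  # hypothetical endpoint name
try:
    # Poll until status is RUNNING and the health route answers 200 (see wait() above).
    endpoint.wait(timeout=300, refresh_every=10)
    # .client is an InferenceClient bound to endpoint.url; the call must match the deployed task.
    print(endpoint.client.text_generation("Hello"))
finally:
    endpoint.pause()  # stop billing; restart later with endpoint.resume() + endpoint.wait()
```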
venv/lib/python3.13/site-packages/huggingface_hub/_jobs_api.py
ADDED
@@ -0,0 +1,301 @@
+# coding=utf-8
+# Copyright 2025-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from datetime import datetime
+from enum import Enum
+from typing import Any, Dict, List, Optional, Union
+
+from huggingface_hub import constants
+from huggingface_hub._space_api import SpaceHardware
+from huggingface_hub.utils._datetime import parse_datetime
+
+
+class JobStage(str, Enum):
+    """
+    Enumeration of the possible stages of a Job on the Hub.
+
+    Value can be compared to a string:
+    ```py
+    assert JobStage.COMPLETED == "COMPLETED"
+    ```
+    Possible values are: `COMPLETED`, `CANCELED`, `ERROR`, `DELETED`, `RUNNING`.
+    Taken from https://github.com/huggingface/moon-landing/blob/main/server/job_types/JobInfo.ts#L61 (private url).
+    """
+
+    # Copied from moon-landing > server > lib > Job.ts
+    COMPLETED = "COMPLETED"
+    CANCELED = "CANCELED"
+    ERROR = "ERROR"
+    DELETED = "DELETED"
+    RUNNING = "RUNNING"
+
+
+@dataclass
+class JobStatus:
+    stage: JobStage
+    message: Optional[str]
+
+
+@dataclass
+class JobOwner:
+    id: str
+    name: str
+    type: str
+
+
+@dataclass
+class JobInfo:
+    """
+    Contains information about a Job.
+
+    Args:
+        id (`str`):
+            Job ID.
+        created_at (`datetime` or `None`):
+            When the Job was created.
+        docker_image (`str` or `None`):
+            The Docker image from Docker Hub used for the Job.
+            Can be None if space_id is present instead.
+        space_id (`str` or `None`):
+            The Docker image from Hugging Face Spaces used for the Job.
+            Can be None if docker_image is present instead.
+        command (`List[str]` or `None`):
+            Command of the Job, e.g. `["python", "-c", "print('hello world')"]`
+        arguments (`List[str]` or `None`):
+            Arguments passed to the command.
+        environment (`Dict[str, Any]` or `None`):
+            Environment variables of the Job as a dictionary.
+        secrets (`Dict[str, Any]` or `None`):
+            Secret environment variables of the Job (encrypted).
+        flavor (`str` or `None`):
+            Flavor for the hardware, as in Hugging Face Spaces. See [`SpaceHardware`] for possible values.
+            E.g. `"cpu-basic"`.
+        status (`JobStatus` or `None`):
+            Status of the Job, e.g. `JobStatus(stage="RUNNING", message=None)`.
+            See [`JobStage`] for possible stage values.
+        owner (`JobOwner` or `None`):
+            Owner of the Job, e.g. `JobOwner(id="5e9ecfc04957053f60648a3e", name="lhoestq", type="user")`
+
+    Example:
+
+    ```python
+    >>> from huggingface_hub import run_job
+    >>> job = run_job(
+    ...     image="python:3.12",
+    ...     command=["python", "-c", "print('Hello from the cloud!')"]
+    ... )
+    >>> job
+    JobInfo(id='687fb701029421ae5549d998', created_at=datetime.datetime(2025, 7, 22, 16, 6, 25, 79000, tzinfo=datetime.timezone.utc), docker_image='python:3.12', space_id=None, command=['python', '-c', "print('Hello from the cloud!')"], arguments=[], environment={}, secrets={}, flavor='cpu-basic', status=JobStatus(stage='RUNNING', message=None), owner=JobOwner(id='5e9ecfc04957053f60648a3e', name='lhoestq', type='user'), endpoint='https://huggingface.co', url='https://huggingface.co/jobs/lhoestq/687fb701029421ae5549d998')
+    >>> job.id
+    '687fb701029421ae5549d998'
+    >>> job.url
+    'https://huggingface.co/jobs/lhoestq/687fb701029421ae5549d998'
+    >>> job.status.stage
+    'RUNNING'
+    ```
+    """
+
+    id: str
+    created_at: Optional[datetime]
+    docker_image: Optional[str]
+    space_id: Optional[str]
+    command: Optional[List[str]]
+    arguments: Optional[List[str]]
+    environment: Optional[Dict[str, Any]]
+    secrets: Optional[Dict[str, Any]]
+    flavor: Optional[SpaceHardware]
+    status: JobStatus
+    owner: JobOwner
+
+    # Inferred fields
+    endpoint: str
+    url: str
+
+    def __init__(self, **kwargs) -> None:
+        self.id = kwargs["id"]
+        created_at = kwargs.get("createdAt") or kwargs.get("created_at")
+        self.created_at = parse_datetime(created_at) if created_at else None
+        self.docker_image = kwargs.get("dockerImage") or kwargs.get("docker_image")
+        self.space_id = kwargs.get("spaceId") or kwargs.get("space_id")
+        owner = kwargs.get("owner", {})
+        self.owner = JobOwner(id=owner["id"], name=owner["name"], type=owner["type"])
+        self.command = kwargs.get("command")
+        self.arguments = kwargs.get("arguments")
+        self.environment = kwargs.get("environment")
+        self.secrets = kwargs.get("secrets")
+        self.flavor = kwargs.get("flavor")
+        status = kwargs.get("status", {})
+        self.status = JobStatus(stage=status["stage"], message=status.get("message"))
+
+        # Inferred fields
+        self.endpoint = kwargs.get("endpoint", constants.ENDPOINT)
+        self.url = f"{self.endpoint}/jobs/{self.owner.name}/{self.id}"
+
+
+@dataclass
+class JobSpec:
+    docker_image: Optional[str]
+    space_id: Optional[str]
+    command: Optional[List[str]]
+    arguments: Optional[List[str]]
+    environment: Optional[Dict[str, Any]]
+    secrets: Optional[Dict[str, Any]]
+    flavor: Optional[SpaceHardware]
+    timeout: Optional[int]
+    tags: Optional[List[str]]
+    arch: Optional[str]
+
+    def __init__(self, **kwargs) -> None:
+        self.docker_image = kwargs.get("dockerImage") or kwargs.get("docker_image")
+        self.space_id = kwargs.get("spaceId") or kwargs.get("space_id")
+        self.command = kwargs.get("command")
+        self.arguments = kwargs.get("arguments")
+        self.environment = kwargs.get("environment")
+        self.secrets = kwargs.get("secrets")
+        self.flavor = kwargs.get("flavor")
+        self.timeout = kwargs.get("timeout")
+        self.tags = kwargs.get("tags")
+        self.arch = kwargs.get("arch")
+
+
+@dataclass
+class LastJobInfo:
+    id: str
+    at: datetime
+
+    def __init__(self, **kwargs) -> None:
+        self.id = kwargs["id"]
+        self.at = parse_datetime(kwargs["at"])
+
+
+@dataclass
+class ScheduledJobStatus:
+    last_job: Optional[LastJobInfo]
+    next_job_run_at: Optional[datetime]
+
+    def __init__(self, **kwargs) -> None:
+        last_job = kwargs.get("lastJob") or kwargs.get("last_job")
+        self.last_job = LastJobInfo(**last_job) if last_job else None
+        next_job_run_at = kwargs.get("nextJobRunAt") or kwargs.get("next_job_run_at")
+        self.next_job_run_at = parse_datetime(str(next_job_run_at)) if next_job_run_at else None
+
+
+@dataclass
+class ScheduledJobInfo:
+    """
+    Contains information about a scheduled Job.
+
+    Args:
+        id (`str`):
+            Scheduled Job ID.
+        created_at (`datetime` or `None`):
+            When the scheduled Job was created.
+        tags (`List[str]` or `None`):
+            The tags of the scheduled Job.
+        schedule (`str` or `None`):
+            One of "@annually", "@yearly", "@monthly", "@weekly", "@daily", "@hourly", or a
+            CRON schedule expression (e.g., '0 9 * * 1' for 9 AM every Monday).
+        suspend (`bool` or `None`):
+            Whether the scheduled Job is suspended (paused).
+        concurrency (`bool` or `None`):
+            Whether multiple instances of this Job can run concurrently.
+        status (`ScheduledJobStatus` or `None`):
+            Status of the scheduled Job.
+        owner (`JobOwner` or `None`):
+            Owner of the scheduled Job, e.g. `JobOwner(id="5e9ecfc04957053f60648a3e", name="lhoestq", type="user")`
+        job_spec (`JobSpec` or `None`):
+            Specifications of the Job.
+
+    Example:
+
+    ```python
+    >>> from huggingface_hub import create_scheduled_job
+    >>> scheduled_job = create_scheduled_job(
+    ...     image="python:3.12",
+    ...     command=["python", "-c", "print('Hello from the cloud!')"],
+    ...     schedule="@hourly",
+    ... )
+    >>> scheduled_job.id
+    '687fb701029421ae5549d999'
+    >>> scheduled_job.status.next_job_run_at
+    datetime.datetime(2025, 7, 22, 17, 6, 25, 79000, tzinfo=datetime.timezone.utc)
+    ```
+    """
+
+    id: str
+    created_at: Optional[datetime]
+    job_spec: JobSpec
+    schedule: Optional[str]
+    suspend: Optional[bool]
+    concurrency: Optional[bool]
+    status: ScheduledJobStatus
+    owner: JobOwner
+
+    def __init__(self, **kwargs) -> None:
+        self.id = kwargs["id"]
+        created_at = kwargs.get("createdAt") or kwargs.get("created_at")
+        self.created_at = parse_datetime(created_at) if created_at else None
+        self.job_spec = JobSpec(**(kwargs.get("job_spec") or kwargs.get("jobSpec", {})))
+        self.schedule = kwargs.get("schedule")
+        self.suspend = kwargs.get("suspend")
+        self.concurrency = kwargs.get("concurrency")
+        status = kwargs.get("status", {})
+        self.status = ScheduledJobStatus(
+            last_job=status.get("last_job") or status.get("lastJob"),
+            next_job_run_at=status.get("next_job_run_at") or status.get("nextJobRunAt"),
+        )
+        owner = kwargs.get("owner", {})
+        self.owner = JobOwner(id=owner["id"], name=owner["name"], type=owner["type"])
+
+
+def _create_job_spec(
+    *,
+    image: str,
+    command: List[str],
+    env: Optional[Dict[str, Any]],
+    secrets: Optional[Dict[str, Any]],
+    flavor: Optional[SpaceHardware],
+    timeout: Optional[Union[int, float, str]],
+) -> Dict[str, Any]:
+    # prepare job spec to send to HF Jobs API
+    job_spec: Dict[str, Any] = {
+        "command": command,
+        "arguments": [],
+        "environment": env or {},
+        "flavor": flavor or SpaceHardware.CPU_BASIC,
+    }
+    # secrets are optional
+    if secrets:
+        job_spec["secrets"] = secrets
+    # timeout is optional
+    if timeout:
+        time_units_factors = {"s": 1, "m": 60, "h": 3600, "d": 3600 * 24}
+        if isinstance(timeout, str) and timeout[-1] in time_units_factors:
+            job_spec["timeoutSeconds"] = int(float(timeout[:-1]) * time_units_factors[timeout[-1]])
+        else:
+            job_spec["timeoutSeconds"] = int(timeout)
+    # image is either from Docker Hub or from HF Spaces
+    for prefix in (
+        "https://huggingface.co/spaces/",
+        "https://hf.co/spaces/",
+        "huggingface.co/spaces/",
+        "hf.co/spaces/",
+    ):
+        if image.startswith(prefix):
+            job_spec["spaceId"] = image[len(prefix) :]
+            break
+    else:
+        job_spec["dockerImage"] = image
+    return job_spec
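`_create_job_spec` is where user input gets normalized before reaching the Jobs API: suffixed timeouts (`"90s"`, `"2h"`, ...) are converted to seconds, and the `image` string is routed to either `spaceId` or `dockerImage`. A small sketch exercising the function above (values are illustrative):

```python
# Sketch only: exercise the timeout and image normalization of _create_job_spec.
from huggingface_hub._jobs_api import _create_job_spec

spec = _create_job_spec(
    image="python:3.12",  # not a Spaces URL => routed to "dockerImage"
    command=["python", "-c", "print('hi')"],
    env=None,
    secrets=None,
    flavor=None,  # falls back to SpaceHardware.CPU_BASIC
    timeout="2h",  # "2h" => 2 * 3600 seconds
)
assert spec["timeoutSeconds"] == 7200
assert spec["dockerImage"] == "python:3.12"
```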
venv/lib/python3.13/site-packages/huggingface_hub/_local_folder.py
ADDED
@@ -0,0 +1,447 @@
+# coding=utf-8
+# Copyright 2024-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains utilities to handle the `.cache/huggingface` folder in local directories.
+
+First discussed in https://github.com/huggingface/huggingface_hub/issues/1738 to store
+download metadata when downloading files from the hub to a local directory (without
+using the cache).
+
+./.cache/huggingface folder structure:
+[4.0K]  data
+├── [4.0K]  .cache
+│   └── [4.0K]  huggingface
+│       └── [4.0K]  download
+│           ├── [  16]  file.parquet.metadata
+│           ├── [  16]  file.txt.metadata
+│           └── [4.0K]  folder
+│               └── [  16]  file.parquet.metadata
+│
+├── [6.5G]  file.parquet
+├── [1.5K]  file.txt
+└── [4.0K]  folder
+    └── [  16]  file.parquet
+
+
+Download metadata file structure:
+```
+# file.txt.metadata
+11c5a3d5811f50298f278a704980280950aedb10
+a16a55fda99d2f2e7b69cce5cf93ff4ad3049930
+1712656091.123
+
+# file.parquet.metadata
+11c5a3d5811f50298f278a704980280950aedb10
+7c5d3f4b8b76583b422fcb9189ad6c89d5d97a094541ce8932dce3ecabde1421
+1712656091.123
+```
+"""
+
+import base64
+import hashlib
+import logging
+import os
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+
+from .utils import WeakFileLock
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class LocalDownloadFilePaths:
+    """
+    Paths to the files related to a download process in a local dir.
+
+    Returned by [`get_local_download_paths`].
+
+    Attributes:
+        file_path (`Path`):
+            Path where the file will be saved.
+        lock_path (`Path`):
+            Path to the lock file used to ensure atomicity when reading/writing metadata.
+        metadata_path (`Path`):
+            Path to the metadata file.
+    """
+
+    file_path: Path
+    lock_path: Path
+    metadata_path: Path
+
+    def incomplete_path(self, etag: str) -> Path:
+        """Return the path where a file will be temporarily downloaded before being moved to `file_path`."""
+        path = self.metadata_path.parent / f"{_short_hash(self.metadata_path.name)}.{etag}.incomplete"
+        resolved_path = str(path.resolve())
+        # Some Windows versions do not allow for paths longer than 255 characters.
+        # In this case, we must specify it as an extended path by using the "\\?\" prefix.
+        if os.name == "nt" and len(resolved_path) > 255 and not resolved_path.startswith("\\\\?\\"):
+            path = Path("\\\\?\\" + resolved_path)
+        return path
+
+
+@dataclass(frozen=True)
+class LocalUploadFilePaths:
+    """
+    Paths to the files related to an upload process in a local dir.
+
+    Returned by [`get_local_upload_paths`].
+
+    Attributes:
+        path_in_repo (`str`):
+            Path of the file in the repo.
+        file_path (`Path`):
+            Path where the file will be saved.
+        lock_path (`Path`):
+            Path to the lock file used to ensure atomicity when reading/writing metadata.
+        metadata_path (`Path`):
+            Path to the metadata file.
+    """
+
+    path_in_repo: str
+    file_path: Path
+    lock_path: Path
+    metadata_path: Path
+
+
+@dataclass
+class LocalDownloadFileMetadata:
+    """
+    Metadata about a file in the local directory related to a download process.
+
+    Attributes:
+        filename (`str`):
+            Path of the file in the repo.
+        commit_hash (`str`):
+            Commit hash of the file in the repo.
+        etag (`str`):
+            ETag of the file in the repo. Used to check if the file has changed.
+            For LFS files, this is the sha256 of the file. For regular files, it corresponds to the git hash.
+        timestamp (`float`):
+            Unix timestamp of when the metadata was saved i.e. when the metadata was accurate.
+    """
+
+    filename: str
+    commit_hash: str
+    etag: str
+    timestamp: float
+
+
+@dataclass
+class LocalUploadFileMetadata:
+    """
+    Metadata about a file in the local directory related to an upload process.
+    """
+
+    size: int
+
+    # Default values correspond to "we don't know yet"
+    timestamp: Optional[float] = None
+    should_ignore: Optional[bool] = None
+    sha256: Optional[str] = None
+    upload_mode: Optional[str] = None
+    remote_oid: Optional[str] = None
+    is_uploaded: bool = False
+    is_committed: bool = False
+
+    def save(self, paths: LocalUploadFilePaths) -> None:
+        """Save the metadata to disk."""
+        with WeakFileLock(paths.lock_path):
+            with paths.metadata_path.open("w") as f:
+                new_timestamp = time.time()
+                f.write(str(new_timestamp) + "\n")
+
+                f.write(str(self.size))  # never None
+                f.write("\n")
+
+                if self.should_ignore is not None:
+                    f.write(str(int(self.should_ignore)))
+                f.write("\n")
+
+                if self.sha256 is not None:
+                    f.write(self.sha256)
+                f.write("\n")
+
+                if self.upload_mode is not None:
+                    f.write(self.upload_mode)
+                f.write("\n")
+
+                if self.remote_oid is not None:
+                    f.write(self.remote_oid)
+                f.write("\n")
+
+                f.write(str(int(self.is_uploaded)) + "\n")
+                f.write(str(int(self.is_committed)) + "\n")
+
+            self.timestamp = new_timestamp
+
+
+def get_local_download_paths(local_dir: Path, filename: str) -> LocalDownloadFilePaths:
+    """Compute paths to the files related to a download process.
+
+    Folders containing the paths are all guaranteed to exist.
+
+    Args:
+        local_dir (`Path`):
+            Path to the local directory in which files are downloaded.
+        filename (`str`):
+            Path of the file in the repo.
+
+    Return:
+        [`LocalDownloadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path, incomplete_path).
+    """
+    # filename is the path in the Hub repository (separated by '/')
+    # make sure to have a cross platform transcription
+    sanitized_filename = os.path.join(*filename.split("/"))
+    if os.name == "nt":
+        if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename:
+            raise ValueError(
+                f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
+                " owner to rename this file."
+            )
+    file_path = local_dir / sanitized_filename
+    metadata_path = _huggingface_dir(local_dir) / "download" / f"{sanitized_filename}.metadata"
+    lock_path = metadata_path.with_suffix(".lock")
+
+    # Some Windows versions do not allow for paths longer than 255 characters.
+    # In this case, we must specify it as an extended path by using the "\\?\" prefix
+    if os.name == "nt":
+        if not str(local_dir).startswith("\\\\?\\") and len(os.path.abspath(lock_path)) > 255:
+            file_path = Path("\\\\?\\" + os.path.abspath(file_path))
+            lock_path = Path("\\\\?\\" + os.path.abspath(lock_path))
+            metadata_path = Path("\\\\?\\" + os.path.abspath(metadata_path))
+
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+    metadata_path.parent.mkdir(parents=True, exist_ok=True)
+    return LocalDownloadFilePaths(file_path=file_path, lock_path=lock_path, metadata_path=metadata_path)
+
+
+def get_local_upload_paths(local_dir: Path, filename: str) -> LocalUploadFilePaths:
+    """Compute paths to the files related to an upload process.
+
+    Folders containing the paths are all guaranteed to exist.
+
+    Args:
+        local_dir (`Path`):
+            Path to the local directory that is uploaded.
+        filename (`str`):
+            Path of the file in the repo.
+
+    Return:
+        [`LocalUploadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path).
+    """
+    # filename is the path in the Hub repository (separated by '/')
+    # make sure to have a cross platform transcription
+    sanitized_filename = os.path.join(*filename.split("/"))
+    if os.name == "nt":
+        if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename:
+            raise ValueError(
+                f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
+                " owner to rename this file."
+            )
+    file_path = local_dir / sanitized_filename
+    metadata_path = _huggingface_dir(local_dir) / "upload" / f"{sanitized_filename}.metadata"
+    lock_path = metadata_path.with_suffix(".lock")
+
+    # Some Windows versions do not allow for paths longer than 255 characters.
+    # In this case, we must specify it as an extended path by using the "\\?\" prefix
+    if os.name == "nt":
+        if not str(local_dir).startswith("\\\\?\\") and len(os.path.abspath(lock_path)) > 255:
+            file_path = Path("\\\\?\\" + os.path.abspath(file_path))
+            lock_path = Path("\\\\?\\" + os.path.abspath(lock_path))
+            metadata_path = Path("\\\\?\\" + os.path.abspath(metadata_path))
+
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+    metadata_path.parent.mkdir(parents=True, exist_ok=True)
+    return LocalUploadFilePaths(
+        path_in_repo=filename, file_path=file_path, lock_path=lock_path, metadata_path=metadata_path
+    )
+
+
+def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDownloadFileMetadata]:
+    """Read metadata about a file in the local directory related to a download process.
+
+    Args:
+        local_dir (`Path`):
+            Path to the local directory in which files are downloaded.
+        filename (`str`):
+            Path of the file in the repo.
+
+    Return:
+        `[LocalDownloadFileMetadata]` or `None`: the metadata if it exists, `None` otherwise.
+    """
+    paths = get_local_download_paths(local_dir, filename)
+    with WeakFileLock(paths.lock_path):
+        if paths.metadata_path.exists():
+            try:
+                with paths.metadata_path.open() as f:
+                    commit_hash = f.readline().strip()
+                    etag = f.readline().strip()
+                    timestamp = float(f.readline().strip())
+                metadata = LocalDownloadFileMetadata(
+                    filename=filename,
+                    commit_hash=commit_hash,
+                    etag=etag,
+                    timestamp=timestamp,
+                )
+            except Exception as e:
+                # remove the metadata file if it is corrupted / not the right format
+                logger.warning(
+                    f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continuing."
+                )
+                try:
+                    paths.metadata_path.unlink()
+                except Exception as e:
+                    logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}")
+
+            try:
+                # check if the file exists and hasn't been modified since the metadata was saved
+                stat = paths.file_path.stat()
+                if (
+                    stat.st_mtime - 1 <= metadata.timestamp
+                ):  # allow 1s difference as stat.st_mtime might not be precise
+                    return metadata
+                logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
+            except FileNotFoundError:
+                # file does not exist => metadata is outdated
+                return None
+    return None
+
+
+def read_upload_metadata(local_dir: Path, filename: str) -> LocalUploadFileMetadata:
+    """Read metadata about a file in the local directory related to an upload process.
+
+    TODO: factorize logic with `read_download_metadata`.
+
+    Args:
+        local_dir (`Path`):
+            Path to the local directory in which files are downloaded.
+        filename (`str`):
+            Path of the file in the repo.
+
+    Return:
+        `[LocalUploadFileMetadata]` or `None`: the metadata if it exists, `None` otherwise.
+    """
+    paths = get_local_upload_paths(local_dir, filename)
+    with WeakFileLock(paths.lock_path):
+        if paths.metadata_path.exists():
+            try:
+                with paths.metadata_path.open() as f:
+                    timestamp = float(f.readline().strip())
+
+                    size = int(f.readline().strip())  # never None
+
+                    _should_ignore = f.readline().strip()
+                    should_ignore = None if _should_ignore == "" else bool(int(_should_ignore))
+
+                    _sha256 = f.readline().strip()
+                    sha256 = None if _sha256 == "" else _sha256
+
+                    _upload_mode = f.readline().strip()
+                    upload_mode = None if _upload_mode == "" else _upload_mode
+                    if upload_mode not in (None, "regular", "lfs"):
+                        raise ValueError(f"Invalid upload mode in metadata {paths.path_in_repo}: {upload_mode}")
+
+                    _remote_oid = f.readline().strip()
+                    remote_oid = None if _remote_oid == "" else _remote_oid
+
+                    is_uploaded = bool(int(f.readline().strip()))
+                    is_committed = bool(int(f.readline().strip()))
+
+                metadata = LocalUploadFileMetadata(
+                    timestamp=timestamp,
+                    size=size,
+                    should_ignore=should_ignore,
+                    sha256=sha256,
+                    upload_mode=upload_mode,
+                    remote_oid=remote_oid,
+                    is_uploaded=is_uploaded,
+                    is_committed=is_committed,
+                )
+            except Exception as e:
+                # remove the metadata file if it is corrupted / not the right format
+                logger.warning(
+                    f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continuing."
+                )
+                try:
+                    paths.metadata_path.unlink()
+                except Exception as e:
+                    logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}")
+
+            # TODO: can we do better?
+            if (
+                metadata.timestamp is not None
+                and metadata.is_uploaded  # file was uploaded
+                and not metadata.is_committed  # but not committed
+                and time.time() - metadata.timestamp > 20 * 3600  # and it's been more than 20 hours
+            ):  # => we consider it as garbage-collected by S3
+                metadata.is_uploaded = False
+
+            # check if the file exists and hasn't been modified since the metadata was saved
+            try:
+                if metadata.timestamp is not None and paths.file_path.stat().st_mtime <= metadata.timestamp:
+                    return metadata
+                logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
|
| 400 |
+
except FileNotFoundError:
|
| 401 |
+
# file does not exist => metadata is outdated
|
| 402 |
+
pass
|
| 403 |
+
|
| 404 |
+
# empty metadata => we don't know anything expect its size
|
| 405 |
+
return LocalUploadFileMetadata(size=paths.file_path.stat().st_size)
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
def write_download_metadata(local_dir: Path, filename: str, commit_hash: str, etag: str) -> None:
|
| 409 |
+
"""Write metadata about a file in the local directory related to a download process.
|
| 410 |
+
|
| 411 |
+
Args:
|
| 412 |
+
local_dir (`Path`):
|
| 413 |
+
Path to the local directory in which files are downloaded.
|
| 414 |
+
"""
|
| 415 |
+
paths = get_local_download_paths(local_dir, filename)
|
| 416 |
+
with WeakFileLock(paths.lock_path):
|
| 417 |
+
with paths.metadata_path.open("w") as f:
|
| 418 |
+
f.write(f"{commit_hash}\n{etag}\n{time.time()}\n")
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
def _huggingface_dir(local_dir: Path) -> Path:
|
| 422 |
+
"""Return the path to the `.cache/huggingface` directory in a local directory."""
|
| 423 |
+
# Wrap in lru_cache to avoid overwriting the .gitignore file if called multiple times
|
| 424 |
+
path = local_dir / ".cache" / "huggingface"
|
| 425 |
+
path.mkdir(exist_ok=True, parents=True)
|
| 426 |
+
|
| 427 |
+
# Create a .gitignore file in the .cache/huggingface directory if it doesn't exist
|
| 428 |
+
# Should be thread-safe enough like this.
|
| 429 |
+
gitignore = path / ".gitignore"
|
| 430 |
+
gitignore_lock = path / ".gitignore.lock"
|
| 431 |
+
if not gitignore.exists():
|
| 432 |
+
try:
|
| 433 |
+
with WeakFileLock(gitignore_lock, timeout=0.1):
|
| 434 |
+
gitignore.write_text("*")
|
| 435 |
+
except IndexError:
|
| 436 |
+
pass
|
| 437 |
+
except OSError: # TimeoutError, FileNotFoundError, PermissionError, etc.
|
| 438 |
+
pass
|
| 439 |
+
try:
|
| 440 |
+
gitignore_lock.unlink()
|
| 441 |
+
except OSError:
|
| 442 |
+
pass
|
| 443 |
+
return path
|
| 444 |
+
|
| 445 |
+
|
| 446 |
+
def _short_hash(filename: str) -> str:
|
| 447 |
+
return base64.urlsafe_b64encode(hashlib.sha1(filename.encode()).digest()).decode()
|
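A minimal usage sketch of the download-metadata round-trip above (not part of the diff; the directory and file names are hypothetical, and `_local_folder` is a private module whose helpers may change between releases):

    from pathlib import Path

    from huggingface_hub._local_folder import read_download_metadata, write_download_metadata

    local_dir = Path("./my-local-repo")  # hypothetical directory standing in for a real download target
    local_dir.mkdir(exist_ok=True)
    (local_dir / "config.json").write_text("{}")  # simulate a file downloaded from the Hub

    # Persist the commit hash + etag for the file, then read the metadata back under the same lock.
    write_download_metadata(local_dir, "config.json", commit_hash="abc123", etag="deadbeef")
    meta = read_download_metadata(local_dir, "config.json")
    assert meta is not None and meta.etag == "deadbeef"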
venv/lib/python3.13/site-packages/huggingface_hub/_login.py
ADDED
@@ -0,0 +1,514 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains methods to log in to the Hub."""

import os
import subprocess
from getpass import getpass
from pathlib import Path
from typing import Optional

from . import constants
from .commands._cli_utils import ANSI
from .utils import (
    capture_output,
    get_token,
    is_google_colab,
    is_notebook,
    list_credential_helpers,
    logging,
    run_subprocess,
    set_git_credential,
    unset_git_credential,
)
from .utils._auth import (
    _get_token_by_name,
    _get_token_from_environment,
    _get_token_from_file,
    _get_token_from_google_colab,
    _save_stored_tokens,
    _save_token,
    get_stored_tokens,
)
from .utils._deprecation import _deprecate_arguments, _deprecate_positional_args


logger = logging.get_logger(__name__)

_HF_LOGO_ASCII = """
_|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
_|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
_|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
_|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
_|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
"""


@_deprecate_arguments(
    version="1.0",
    deprecated_args="write_permission",
    custom_message="Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.",
)
@_deprecate_positional_args(version="1.0")
def login(
    token: Optional[str] = None,
    *,
    add_to_git_credential: bool = False,
    new_session: bool = True,
    write_permission: bool = False,
) -> None:
    """Login the machine to access the Hub.

    The `token` is persisted in cache and set as a git credential. Once done, the machine
    is logged in and the access token will be available across all `huggingface_hub`
    components. If `token` is not provided, it will be prompted to the user either with
    a widget (in a notebook) or via the terminal.

    To log in from outside of a script, one can also use `hf auth login` which is
    a cli command that wraps [`login`].

    > [!TIP]
    > [`login`] is a drop-in replacement method for [`notebook_login`] as it wraps and
    > extends its capabilities.

    > [!TIP]
    > When the token is not passed, [`login`] will automatically detect if the script runs
    > in a notebook or not. However, this detection might not be accurate due to the
    > variety of notebooks that exist nowadays. If that is the case, you can always force
    > the UI by using [`notebook_login`] or [`interpreter_login`].

    Args:
        token (`str`, *optional*):
            User access token to generate from https://huggingface.co/settings/token.
        add_to_git_credential (`bool`, defaults to `False`):
            If `True`, token will be set as git credential. If no git credential helper
            is configured, a warning will be displayed to the user. If `token` is `None`,
            the value of `add_to_git_credential` is ignored and will be prompted again
            to the end user.
        new_session (`bool`, defaults to `True`):
            If `True`, will request a token even if one is already saved on the machine.
        write_permission (`bool`):
            Ignored and deprecated argument.
    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If an organization token is passed. Only personal account tokens are valid
            to log in.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If token is invalid.
        [`ImportError`](https://docs.python.org/3/library/exceptions.html#ImportError)
            If running in a notebook but `ipywidgets` is not installed.
    """
    if token is not None:
        if not add_to_git_credential:
            logger.info(
                "The token has not been saved to the git credentials helper. Pass "
                "`add_to_git_credential=True` in this function directly or "
                "`--add-to-git-credential` if using the `hf` CLI if "
                "you want to set the git credential as well."
            )
        _login(token, add_to_git_credential=add_to_git_credential)
    elif is_notebook():
        notebook_login(new_session=new_session)
    else:
        interpreter_login(new_session=new_session)


def logout(token_name: Optional[str] = None) -> None:
    """Logout the machine from the Hub.

    Token is deleted from the machine and removed from git credential.

    Args:
        token_name (`str`, *optional*):
            Name of the access token to logout from. If `None`, will logout from all saved access tokens.
    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError):
            If the access token name is not found.
    """
    if get_token() is None and not get_stored_tokens():  # No active token and no saved access tokens
        logger.warning("Not logged in!")
        return
    if not token_name:
        # Delete all saved access tokens and token
        for file_path in (constants.HF_TOKEN_PATH, constants.HF_STORED_TOKENS_PATH):
            try:
                Path(file_path).unlink()
            except FileNotFoundError:
                pass
        logger.info("Successfully logged out from all access tokens.")
    else:
        _logout_from_token(token_name)
        logger.info(f"Successfully logged out from access token: {token_name}.")

    unset_git_credential()

    # Check if still logged in
    if _get_token_from_google_colab() is not None:
        raise EnvironmentError(
            "You are automatically logged in using a Google Colab secret.\n"
            "To log out, you must unset the `HF_TOKEN` secret in your Colab settings."
        )
    if _get_token_from_environment() is not None:
        raise EnvironmentError(
            "Token has been deleted from your machine but you are still logged in.\n"
            "To log out, you must clear out both `HF_TOKEN` and `HUGGING_FACE_HUB_TOKEN` environment variables."
        )


def auth_switch(token_name: str, add_to_git_credential: bool = False) -> None:
    """Switch to a different access token.

    Args:
        token_name (`str`):
            Name of the access token to switch to.
        add_to_git_credential (`bool`, defaults to `False`):
            If `True`, token will be set as git credential. If no git credential helper
            is configured, a warning will be displayed to the user. If `token` is `None`,
            the value of `add_to_git_credential` is ignored and will be prompted again
            to the end user.

    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError):
            If the access token name is not found.
    """
    token = _get_token_by_name(token_name)
    if not token:
        raise ValueError(f"Access token {token_name} not found in {constants.HF_STORED_TOKENS_PATH}")
    # Write token to HF_TOKEN_PATH
    _set_active_token(token_name, add_to_git_credential)
    logger.info(f"The current active token is: {token_name}")
    token_from_environment = _get_token_from_environment()
    if token_from_environment is not None and token_from_environment != token:
        logger.warning(
            "The environment variable `HF_TOKEN` is set and will override the access token you've just switched to."
        )


def auth_list() -> None:
    """List all stored access tokens."""
    tokens = get_stored_tokens()

    if not tokens:
        logger.info("No access tokens found.")
        return
    # Find current token
    current_token = get_token()
    current_token_name = None
    for token_name in tokens:
        if tokens.get(token_name) == current_token:
            current_token_name = token_name
    # Print header
    max_offset = max(len("token"), max(len(token) for token in tokens)) + 2
    print(f" {{:<{max_offset}}}| {{:<15}}".format("name", "token"))
    print("-" * (max_offset + 2) + "|" + "-" * 15)

    # Print saved access tokens
    for token_name in tokens:
        token = tokens.get(token_name, "<not set>")
        masked_token = f"{token[:3]}****{token[-4:]}" if token != "<not set>" else token
        is_current = "*" if token == current_token else " "

        print(f"{is_current} {{:<{max_offset}}}| {{:<15}}".format(token_name, masked_token))

    if _get_token_from_environment():
        logger.warning(
            "\nNote: Environment variable `HF_TOKEN` is set and is the current active token independently from the stored tokens listed above."
        )
    elif current_token_name is None:
        logger.warning(
            "\nNote: No active token is set and no environment variable `HF_TOKEN` is found. Use `hf auth login` to log in."
        )


###
# Interpreter-based login (text)
###


@_deprecate_arguments(
    version="1.0",
    deprecated_args="write_permission",
    custom_message="Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.",
)
@_deprecate_positional_args(version="1.0")
def interpreter_login(*, new_session: bool = True, write_permission: bool = False) -> None:
    """
    Displays a prompt to log in to the HF website and store the token.

    This is equivalent to [`login`] without passing a token when not run in a notebook.
    [`interpreter_login`] is useful if you want to force the use of the terminal prompt
    instead of a notebook widget.

    For more details, see [`login`].

    Args:
        new_session (`bool`, defaults to `True`):
            If `True`, will request a token even if one is already saved on the machine.
        write_permission (`bool`):
            Ignored and deprecated argument.
    """
    if not new_session and get_token() is not None:
        logger.info("User is already logged in.")
        return

    from .commands.delete_cache import _ask_for_confirmation_no_tui

    print(_HF_LOGO_ASCII)
    if get_token() is not None:
        logger.info(
            " A token is already saved on your machine. Run `hf auth whoami`"
            " to get more information or `hf auth logout` if you want"
            " to log out."
        )
        logger.info(" Setting a new token will erase the existing one.")

    logger.info(
        " To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens ."
    )
    if os.name == "nt":
        logger.info("Token can be pasted using 'Right-Click'.")
    token = getpass("Enter your token (input will not be visible): ")
    add_to_git_credential = _ask_for_confirmation_no_tui("Add token as git credential?")

    _login(token=token, add_to_git_credential=add_to_git_credential)


###
# Notebook-based login (widget)
###

NOTEBOOK_LOGIN_PASSWORD_HTML = """<center> <img
src=https://huggingface.co/front/assets/huggingface_logo-noborder.svg
alt='Hugging Face'> <br> Immediately click login after typing your password or
it might be stored in plain text in this notebook file. </center>"""


NOTEBOOK_LOGIN_TOKEN_HTML_START = """<center> <img
src=https://huggingface.co/front/assets/huggingface_logo-noborder.svg
alt='Hugging Face'> <br> Copy a token from <a
href="https://huggingface.co/settings/tokens" target="_blank">your Hugging Face
tokens page</a> and paste it below. <br> Immediately click login after copying
your token or it might be stored in plain text in this notebook file. </center>"""


NOTEBOOK_LOGIN_TOKEN_HTML_END = """
<b>Pro Tip:</b> If you don't already have one, you can create a dedicated
'notebooks' token with 'write' access, that you can then easily reuse for all
notebooks. </center>"""


@_deprecate_arguments(
    version="1.0",
    deprecated_args="write_permission",
    custom_message="Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.",
)
@_deprecate_positional_args(version="1.0")
def notebook_login(*, new_session: bool = True, write_permission: bool = False) -> None:
    """
    Displays a widget to log in to the HF website and store the token.

    This is equivalent to [`login`] without passing a token when run in a notebook.
    [`notebook_login`] is useful if you want to force the use of the notebook widget
    instead of a prompt in the terminal.

    For more details, see [`login`].

    Args:
        new_session (`bool`, defaults to `True`):
            If `True`, will request a token even if one is already saved on the machine.
        write_permission (`bool`):
            Ignored and deprecated argument.
    """
    try:
        import ipywidgets.widgets as widgets  # type: ignore
        from IPython.display import display  # type: ignore
    except ImportError:
        raise ImportError(
            "The `notebook_login` function can only be used in a notebook (Jupyter or"
            " Colab) and you need the `ipywidgets` module: `pip install ipywidgets`."
        )
    if not new_session and get_token() is not None:
        logger.info("User is already logged in.")
        return

    box_layout = widgets.Layout(display="flex", flex_flow="column", align_items="center", width="50%")

    token_widget = widgets.Password(description="Token:")
    git_checkbox_widget = widgets.Checkbox(value=True, description="Add token as git credential?")
    token_finish_button = widgets.Button(description="Login")

    login_token_widget = widgets.VBox(
        [
            widgets.HTML(NOTEBOOK_LOGIN_TOKEN_HTML_START),
            token_widget,
            git_checkbox_widget,
            token_finish_button,
            widgets.HTML(NOTEBOOK_LOGIN_TOKEN_HTML_END),
        ],
        layout=box_layout,
    )
    display(login_token_widget)

    # On click events
    def login_token_event(t):
        """Event handler for the login button."""
        token = token_widget.value
        add_to_git_credential = git_checkbox_widget.value
        # Erase token and clear value to make sure it's not saved in the notebook.
        token_widget.value = ""
        # Hide inputs
        login_token_widget.children = [widgets.Label("Connecting...")]
        try:
            with capture_output() as captured:
                _login(token, add_to_git_credential=add_to_git_credential)
            message = captured.getvalue()
        except Exception as error:
            message = str(error)
        # Print result (success message or error)
        login_token_widget.children = [widgets.Label(line) for line in message.split("\n") if line.strip()]

    token_finish_button.on_click(login_token_event)


###
# Login private helpers
###


def _login(
    token: str,
    add_to_git_credential: bool,
) -> None:
    from .hf_api import whoami  # avoid circular import

    if token.startswith("api_org"):
        raise ValueError("You must use your personal account token, not an organization token.")

    token_info = whoami(token)
    permission = token_info["auth"]["accessToken"]["role"]
    logger.info(f"Token is valid (permission: {permission}).")

    token_name = token_info["auth"]["accessToken"]["displayName"]
    # Store token locally
    _save_token(token=token, token_name=token_name)
    # Set active token
    _set_active_token(token_name=token_name, add_to_git_credential=add_to_git_credential)
    logger.info("Login successful.")
    if _get_token_from_environment():
        logger.warning(
            "Note: Environment variable `HF_TOKEN` is set and is the current active token independently from the token you've just configured."
        )
    else:
        logger.info(f"The current active token is: `{token_name}`")


def _logout_from_token(token_name: str) -> None:
    """Logout from a specific access token.

    Args:
        token_name (`str`):
            The name of the access token to logout from.
    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError):
            If the access token name is not found.
    """
    stored_tokens = get_stored_tokens()
    # If there are no access tokens saved or the access token name is not found, do nothing
    if not stored_tokens or token_name not in stored_tokens:
        return

    token = stored_tokens.pop(token_name)
    _save_stored_tokens(stored_tokens)

    if token == _get_token_from_file():
        logger.warning(f"Active token '{token_name}' has been deleted.")
        Path(constants.HF_TOKEN_PATH).unlink(missing_ok=True)


def _set_active_token(
    token_name: str,
    add_to_git_credential: bool,
) -> None:
    """Set the active access token.

    Args:
        token_name (`str`):
            The name of the token to set as active.
    """
    token = _get_token_by_name(token_name)
    if not token:
        raise ValueError(f"Token {token_name} not found in {constants.HF_STORED_TOKENS_PATH}")
    if add_to_git_credential:
        if _is_git_credential_helper_configured():
            set_git_credential(token)
            logger.info(
                "Your token has been saved in your configured git credential helpers"
                + f" ({','.join(list_credential_helpers())})."
            )
        else:
            logger.warning("Token has not been saved to git credential helper.")
    # Write token to HF_TOKEN_PATH
    path = Path(constants.HF_TOKEN_PATH)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(token)
    logger.info(f"Your token has been saved to {constants.HF_TOKEN_PATH}")


def _is_git_credential_helper_configured() -> bool:
    """Check if a git credential helper is configured.

    Warns user if not the case (except for Google Colab where "store" is set by default
    by `huggingface_hub`).
    """
    helpers = list_credential_helpers()
    if len(helpers) > 0:
        return True  # Do not warn: at least 1 helper is set

    # Only in Google Colab to avoid the warning message
    # See https://github.com/huggingface/huggingface_hub/issues/1043#issuecomment-1247010710
    if is_google_colab():
        _set_store_as_git_credential_helper_globally()
        return True  # Do not warn: "store" is used by default in Google Colab

    # Otherwise, warn user
    print(
        ANSI.red(
            "Cannot authenticate through git-credential as no helper is defined on your"
            " machine.\nYou might have to re-authenticate when pushing to the Hugging"
            " Face Hub.\nRun the following command in your terminal in case you want to"
            " set the 'store' credential helper as default.\n\ngit config --global"
            " credential.helper store\n\nRead"
            " https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more"
            " details."
        )
    )
    return False


def _set_store_as_git_credential_helper_globally() -> None:
    """Set globally the credential.helper to `store`.

    To be used only in Google Colab as we assume the user doesn't care about the git
    credential config. It is the only particular case where we don't want to display the
    warning message in [`notebook_login()`].

    Related:
    - https://github.com/huggingface/huggingface_hub/issues/1043
    - https://github.com/huggingface/huggingface_hub/issues/1051
    - https://git-scm.com/docs/git-credential-store
    """
    try:
        run_subprocess("git config --global credential.helper store")
    except subprocess.CalledProcessError as exc:
        raise EnvironmentError(exc.stderr)
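A minimal usage sketch of the login flow above (not part of the diff; the token value is a placeholder). `login` and `logout` are re-exported at the top level of `huggingface_hub`:

    from huggingface_hub import login, logout

    # Programmatic login (e.g. in CI): passing a token skips the terminal prompt and notebook widget.
    login(token="hf_xxx", add_to_git_credential=False)  # "hf_xxx" is a placeholder token

    # ... authenticated calls to the Hub ...

    logout()  # with no token_name, deletes all stored access tokens and unsets the git credential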
venv/lib/python3.13/site-packages/huggingface_hub/_oauth.py
ADDED
@@ -0,0 +1,460 @@
import datetime
import hashlib
import logging
import os
import time
import urllib.parse
import warnings
from dataclasses import dataclass
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union

from . import constants
from .hf_api import whoami
from .utils import experimental, get_token


logger = logging.getLogger(__name__)

if TYPE_CHECKING:
    import fastapi


@dataclass
class OAuthOrgInfo:
    """
    Information about an organization linked to a user logged in with OAuth.

    Attributes:
        sub (`str`):
            Unique identifier for the org. OpenID Connect field.
        name (`str`):
            The org's full name. OpenID Connect field.
        preferred_username (`str`):
            The org's username. OpenID Connect field.
        picture (`str`):
            The org's profile picture URL. OpenID Connect field.
        is_enterprise (`bool`):
            Whether the org is an enterprise org. Hugging Face field.
        can_pay (`Optional[bool]`, *optional*):
            Whether the org has a payment method set up. Hugging Face field.
        role_in_org (`Optional[str]`, *optional*):
            The user's role in the org. Hugging Face field.
        security_restrictions (`Optional[List[Literal["ip", "token-policy", "mfa", "sso"]]]`, *optional*):
            Array of security restrictions that the user hasn't completed for this org. Possible values: "ip", "token-policy", "mfa", "sso". Hugging Face field.
    """

    sub: str
    name: str
    preferred_username: str
    picture: str
    is_enterprise: bool
    can_pay: Optional[bool] = None
    role_in_org: Optional[str] = None
    security_restrictions: Optional[List[Literal["ip", "token-policy", "mfa", "sso"]]] = None


@dataclass
class OAuthUserInfo:
    """
    Information about a user logged in with OAuth.

    Attributes:
        sub (`str`):
            Unique identifier for the user, even in case of rename. OpenID Connect field.
        name (`str`):
            The user's full name. OpenID Connect field.
        preferred_username (`str`):
            The user's username. OpenID Connect field.
        email_verified (`Optional[bool]`, *optional*):
            Indicates if the user's email is verified. OpenID Connect field.
        email (`Optional[str]`, *optional*):
            The user's email address. OpenID Connect field.
        picture (`str`):
            The user's profile picture URL. OpenID Connect field.
        profile (`str`):
            The user's profile URL. OpenID Connect field.
        website (`Optional[str]`, *optional*):
            The user's website URL. OpenID Connect field.
        is_pro (`bool`):
            Whether the user is a pro user. Hugging Face field.
        can_pay (`Optional[bool]`, *optional*):
            Whether the user has a payment method set up. Hugging Face field.
        orgs (`Optional[List[OAuthOrgInfo]]`, *optional*):
            List of organizations the user is part of. Hugging Face field.
    """

    sub: str
    name: str
    preferred_username: str
    email_verified: Optional[bool]
    email: Optional[str]
    picture: str
    profile: str
    website: Optional[str]
    is_pro: bool
    can_pay: Optional[bool]
    orgs: Optional[List[OAuthOrgInfo]]


@dataclass
class OAuthInfo:
    """
    Information about the OAuth login.

    Attributes:
        access_token (`str`):
            The access token.
        access_token_expires_at (`datetime.datetime`):
            The expiration date of the access token.
        user_info ([`OAuthUserInfo`]):
            The user information.
        state (`str`, *optional*):
            State passed to the OAuth provider in the original request to the OAuth provider.
        scope (`str`):
            Granted scope.
    """

    access_token: str
    access_token_expires_at: datetime.datetime
    user_info: OAuthUserInfo
    state: Optional[str]
    scope: str


@experimental
def attach_huggingface_oauth(app: "fastapi.FastAPI", route_prefix: str = "/"):
    """
    Add OAuth endpoints to a FastAPI app to enable OAuth login with Hugging Face.

    How to use:
    - Call this method on your FastAPI app to add the OAuth endpoints.
    - Inside your route handlers, call `parse_huggingface_oauth(request)` to retrieve the OAuth info.
    - If the user is logged in, an [`OAuthInfo`] object is returned with the user's info. If not, `None` is returned.
    - In your app, make sure to add links to `/oauth/huggingface/login` and `/oauth/huggingface/logout` for the user to log in and out.

    Example:
    ```py
    from fastapi import FastAPI, Request

    from huggingface_hub import attach_huggingface_oauth, parse_huggingface_oauth

    # Create a FastAPI app
    app = FastAPI()

    # Add OAuth endpoints to the FastAPI app
    attach_huggingface_oauth(app)

    # Add a route that greets the user if they are logged in
    @app.get("/")
    def greet_json(request: Request):
        # Retrieve the OAuth info from the request
        oauth_info = parse_huggingface_oauth(request)  # e.g. OAuthInfo dataclass
        if oauth_info is None:
            return {"msg": "Not logged in!"}
        return {"msg": f"Hello, {oauth_info.user_info.preferred_username}!"}
    ```
    """
    # TODO: handle generic case (handling OAuth in a non-Space environment with custom dev values) (low priority)

    # Add SessionMiddleware to the FastAPI app to store the OAuth info in the session.
    # SessionMiddleware requires a secret key to sign the cookies. Let's use a hash
    # of the OAuth secret key to make it unique to the Space + updated in case the OAuth
    # config gets updated. When run locally, we use an empty string as a secret key.
    try:
        from starlette.middleware.sessions import SessionMiddleware
    except ImportError as e:
        raise ImportError(
            "Cannot initialize OAuth due to a missing library. Please run `pip install huggingface_hub[oauth]` or add "
            "`huggingface_hub[oauth]` to your requirements.txt file in order to install the required dependencies."
        ) from e
    session_secret = (constants.OAUTH_CLIENT_SECRET or "") + "-v1"
    app.add_middleware(
        SessionMiddleware,  # type: ignore[arg-type]
        secret_key=hashlib.sha256(session_secret.encode()).hexdigest(),
        same_site="none",
        https_only=True,
    )  # type: ignore

    # Add OAuth endpoints to the FastAPI app:
    # - {route_prefix}/oauth/huggingface/login
    # - {route_prefix}/oauth/huggingface/callback
    # - {route_prefix}/oauth/huggingface/logout
    # If the app is running in a Space, OAuth is enabled normally.
    # Otherwise, we mock the endpoints to make the user log in with a fake user profile - without any calls to hf.co.
    route_prefix = route_prefix.strip("/")
    if os.getenv("SPACE_ID") is not None:
        logger.info("OAuth is enabled in the Space. Adding OAuth routes.")
        _add_oauth_routes(app, route_prefix=route_prefix)
    else:
        logger.info("App is not running in a Space. Adding mocked OAuth routes.")
        _add_mocked_oauth_routes(app, route_prefix=route_prefix)


def parse_huggingface_oauth(request: "fastapi.Request") -> Optional[OAuthInfo]:
    """
    Returns the information from a logged in user as an [`OAuthInfo`] object.

    For flexibility and future-proofing, this method is very lax in its parsing and does not raise errors.
    Missing fields are set to `None` without a warning.

    Returns `None` if the user is not logged in (no info in session cookie).

    See [`attach_huggingface_oauth`] for an example on how to use this method.
    """
    if "oauth_info" not in request.session:
        logger.debug("No OAuth info in session.")
        return None

    logger.debug("Parsing OAuth info from session.")
    oauth_data = request.session["oauth_info"]
    user_data = oauth_data.get("userinfo", {})
    orgs_data = user_data.get("orgs", [])

    orgs = (
        [
            OAuthOrgInfo(
                sub=org.get("sub"),
                name=org.get("name"),
                preferred_username=org.get("preferred_username"),
                picture=org.get("picture"),
                is_enterprise=org.get("isEnterprise"),
                can_pay=org.get("canPay"),
                role_in_org=org.get("roleInOrg"),
                security_restrictions=org.get("securityRestrictions"),
            )
            for org in orgs_data
        ]
        if orgs_data
        else None
    )

    user_info = OAuthUserInfo(
        sub=user_data.get("sub"),
        name=user_data.get("name"),
        preferred_username=user_data.get("preferred_username"),
        email_verified=user_data.get("email_verified"),
        email=user_data.get("email"),
        picture=user_data.get("picture"),
        profile=user_data.get("profile"),
        website=user_data.get("website"),
        is_pro=user_data.get("isPro"),
        can_pay=user_data.get("canPay"),
        orgs=orgs,
    )

    return OAuthInfo(
        access_token=oauth_data.get("access_token"),
        access_token_expires_at=datetime.datetime.fromtimestamp(oauth_data.get("expires_at")),
        user_info=user_info,
        state=oauth_data.get("state"),
        scope=oauth_data.get("scope"),
    )


def _add_oauth_routes(app: "fastapi.FastAPI", route_prefix: str) -> None:
    """Add OAuth routes to the FastAPI app (login, callback handler and logout)."""
    try:
        import fastapi
        from authlib.integrations.base_client.errors import MismatchingStateError
        from authlib.integrations.starlette_client import OAuth
        from fastapi.responses import RedirectResponse
    except ImportError as e:
        raise ImportError(
            "Cannot initialize OAuth due to a missing library. Please run `pip install huggingface_hub[oauth]` or add "
            "`huggingface_hub[oauth]` to your requirements.txt file."
        ) from e

    # Check environment variables
    msg = (
        "OAuth is required but '{}' environment variable is not set. Make sure you've enabled OAuth in your Space by"
        " setting `hf_oauth: true` in the Space metadata."
    )
    if constants.OAUTH_CLIENT_ID is None:
        raise ValueError(msg.format("OAUTH_CLIENT_ID"))
    if constants.OAUTH_CLIENT_SECRET is None:
        raise ValueError(msg.format("OAUTH_CLIENT_SECRET"))
    if constants.OAUTH_SCOPES is None:
        raise ValueError(msg.format("OAUTH_SCOPES"))
    if constants.OPENID_PROVIDER_URL is None:
        raise ValueError(msg.format("OPENID_PROVIDER_URL"))

    # Register OAuth server
    oauth = OAuth()
    oauth.register(
        name="huggingface",
        client_id=constants.OAUTH_CLIENT_ID,
        client_secret=constants.OAUTH_CLIENT_SECRET,
        client_kwargs={"scope": constants.OAUTH_SCOPES},
        server_metadata_url=constants.OPENID_PROVIDER_URL + "/.well-known/openid-configuration",
    )

    login_uri, callback_uri, logout_uri = _get_oauth_uris(route_prefix)

    # Register OAuth endpoints
    @app.get(login_uri)
    async def oauth_login(request: fastapi.Request) -> RedirectResponse:
        """Endpoint that redirects to HF OAuth page."""
        redirect_uri = _generate_redirect_uri(request)
        return await oauth.huggingface.authorize_redirect(request, redirect_uri)  # type: ignore

    @app.get(callback_uri)
    async def oauth_redirect_callback(request: fastapi.Request) -> RedirectResponse:
        """Endpoint that handles the OAuth callback."""
        try:
            oauth_info = await oauth.huggingface.authorize_access_token(request)  # type: ignore
        except MismatchingStateError:
            # Parse query params
            nb_redirects = int(request.query_params.get("_nb_redirects", 0))
            target_url = request.query_params.get("_target_url")

            # Build redirect URI with the same query params as before and bump nb_redirects count
            query_params: Dict[str, Union[int, str]] = {"_nb_redirects": nb_redirects + 1}
            if target_url:
                query_params["_target_url"] = target_url

            redirect_uri = f"{login_uri}?{urllib.parse.urlencode(query_params)}"

            # If the user is redirected more than 3 times, it is very likely that the cookie is not working properly
            # (e.g. the browser is blocking third-party cookies in an iframe). In this case, redirect the user to the
            # non-iframe view.
            if nb_redirects > constants.OAUTH_MAX_REDIRECTS:
                host = os.environ.get("SPACE_HOST")
                if host is None:  # cannot happen in a Space
                    raise RuntimeError(
                        "App is not running in a Space (SPACE_HOST environment variable is not set). Cannot redirect to non-iframe view."
                    ) from None
                host_url = "https://" + host.rstrip("/")
                return RedirectResponse(host_url + redirect_uri)

            # Redirect the user to the login page again
            return RedirectResponse(redirect_uri)

        # OAuth login worked => store the user info in the session and redirect
        logger.debug("Successfully logged in with OAuth. Storing user info in session.")
        request.session["oauth_info"] = oauth_info
        return RedirectResponse(_get_redirect_target(request))

    @app.get(logout_uri)
    async def oauth_logout(request: fastapi.Request) -> RedirectResponse:
        """Endpoint that logs out the user (e.g. delete info from cookie session)."""
        logger.debug("Logged out with OAuth. Removing user info from session.")
        request.session.pop("oauth_info", None)
        return RedirectResponse(_get_redirect_target(request))


def _add_mocked_oauth_routes(app: "fastapi.FastAPI", route_prefix: str = "/") -> None:
    """Add fake oauth routes if app is run locally and OAuth is enabled.

    Using OAuth will have the same behavior as in a Space but instead of authenticating with HF, a mocked user profile
    is added to the session.
    """
    try:
        import fastapi
        from fastapi.responses import RedirectResponse
        from starlette.datastructures import URL
    except ImportError as e:
        raise ImportError(
            "Cannot initialize OAuth due to a missing library. Please run `pip install huggingface_hub[oauth]` or add "
            "`huggingface_hub[oauth]` to your requirements.txt file."
        ) from e

    warnings.warn(
        "OAuth is not supported outside of a Space environment. To help you debug your app locally, the oauth endpoints"
        " are mocked to return your profile and token. To make it work, your machine must be logged in to Hugging Face."
    )
    mocked_oauth_info = _get_mocked_oauth_info()

    login_uri, callback_uri, logout_uri = _get_oauth_uris(route_prefix)

    # Define OAuth routes
    @app.get(login_uri)
    async def oauth_login(request: fastapi.Request) -> RedirectResponse:
        """Fake endpoint that redirects to HF OAuth page."""
        # Define target (where to redirect after login)
        redirect_uri = _generate_redirect_uri(request)
        return RedirectResponse(callback_uri + "?" + urllib.parse.urlencode({"_target_url": redirect_uri}))

    @app.get(callback_uri)
    async def oauth_redirect_callback(request: fastapi.Request) -> RedirectResponse:
        """Endpoint that handles the OAuth callback."""
        request.session["oauth_info"] = mocked_oauth_info
        return RedirectResponse(_get_redirect_target(request))

    @app.get(logout_uri)
    async def oauth_logout(request: fastapi.Request) -> RedirectResponse:
        """Endpoint that logs out the user (e.g. delete cookie session)."""
        request.session.pop("oauth_info", None)
        logout_url = URL("/").include_query_params(**request.query_params)
        return RedirectResponse(url=logout_url, status_code=302)  # see https://github.com/gradio-app/gradio/pull/9659


def _generate_redirect_uri(request: "fastapi.Request") -> str:
    if "_target_url" in request.query_params:
        # if `_target_url` already in query params => respect it
        target = request.query_params["_target_url"]
    else:
        # otherwise => keep query params
        target = "/?" + urllib.parse.urlencode(request.query_params)

    redirect_uri = request.url_for("oauth_redirect_callback").include_query_params(_target_url=target)
    redirect_uri_as_str = str(redirect_uri)
    if redirect_uri.netloc.endswith(".hf.space"):
        # In a Space, FastAPI redirects as http but we want https
        redirect_uri_as_str = redirect_uri_as_str.replace("http://", "https://")
    return redirect_uri_as_str


def _get_redirect_target(request: "fastapi.Request", default_target: str = "/") -> str:
    return request.query_params.get("_target_url", default_target)


def _get_mocked_oauth_info() -> Dict:
    token = get_token()
    if token is None:
        raise ValueError(
            "Your machine must be logged in to HF to debug an OAuth app locally. Please"
            " run `hf auth login` or set `HF_TOKEN` as an environment variable "
            "with one of your access tokens. You can generate a new token in your "
            "settings page (https://huggingface.co/settings/tokens)."
        )

    user = whoami()
    if user["type"] != "user":
        raise ValueError(
            "Your machine is not logged in with a personal account. Please use a "
            "personal access token. You can generate a new token in your settings page"
            " (https://huggingface.co/settings/tokens)."
        )

    return {
        "access_token": token,
        "token_type": "bearer",
        "expires_in": 8 * 60 * 60,  # 8 hours
        "id_token": "FOOBAR",
        "scope": "openid profile",
        "refresh_token": "hf_oauth__refresh_token",
        "expires_at": int(time.time()) + 8 * 60 * 60,  # 8 hours
        "userinfo": {
            "sub": "0123456789",
            "name": user["fullname"],
            "preferred_username": user["name"],
            "profile": f"https://huggingface.co/{user['name']}",
            "picture": user["avatarUrl"],
            "website": "",
            "aud": "00000000-0000-0000-0000-000000000000",
            "auth_time": 1691672844,
            "nonce": "aaaaaaaaaaaaaaaaaaa",
            "iat": 1691672844,
            "exp": 1691676444,
            "iss": "https://huggingface.co",
        },
    }


def _get_oauth_uris(route_prefix: str = "/") -> Tuple[str, str, str]:
    route_prefix = route_prefix.strip("/")
    if route_prefix:
        route_prefix = f"/{route_prefix}"
    return (
        f"{route_prefix}/oauth/huggingface/login",
        f"{route_prefix}/oauth/huggingface/callback",
        f"{route_prefix}/oauth/huggingface/logout",
    )
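A minimal sketch of wiring these OAuth helpers into an app (not part of the diff; the `/whoami` route is hypothetical and `huggingface_hub[oauth]` is assumed to be installed). Run outside a Space, the mocked routes above log you in with your local `hf auth login` profile:

    import fastapi
    from fastapi.responses import RedirectResponse

    from huggingface_hub import attach_huggingface_oauth, parse_huggingface_oauth

    app = fastapi.FastAPI()
    attach_huggingface_oauth(app)  # registers the login/callback/logout routes shown above

    @app.get("/whoami")
    def whoami_route(request: fastapi.Request):
        oauth_info = parse_huggingface_oauth(request)
        if oauth_info is None:
            # Not logged in yet: send the user to the login route registered above.
            return RedirectResponse("/oauth/huggingface/login")
        return {"user": oauth_info.user_info.preferred_username}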
venv/lib/python3.13/site-packages/huggingface_hub/_snapshot_download.py
ADDED
@@ -0,0 +1,343 @@
import os
from pathlib import Path
from typing import Dict, Iterable, List, Literal, Optional, Type, Union

import requests
from tqdm.auto import tqdm as base_tqdm
from tqdm.contrib.concurrent import thread_map

from . import constants
from .errors import (
    GatedRepoError,
    HfHubHTTPError,
    LocalEntryNotFoundError,
    RepositoryNotFoundError,
    RevisionNotFoundError,
)
from .file_download import REGEX_COMMIT_HASH, hf_hub_download, repo_folder_name
from .hf_api import DatasetInfo, HfApi, ModelInfo, RepoFile, SpaceInfo
from .utils import OfflineModeIsEnabled, filter_repo_objects, logging, validate_hf_hub_args
from .utils import tqdm as hf_tqdm


logger = logging.get_logger(__name__)

VERY_LARGE_REPO_THRESHOLD = 50000  # After this limit, we don't consider `repo_info.siblings` to be reliable enough


@validate_hf_hub_args
def snapshot_download(
    repo_id: str,
    *,
    repo_type: Optional[str] = None,
    revision: Optional[str] = None,
    cache_dir: Union[str, Path, None] = None,
    local_dir: Union[str, Path, None] = None,
    library_name: Optional[str] = None,
    library_version: Optional[str] = None,
    user_agent: Optional[Union[Dict, str]] = None,
    proxies: Optional[Dict] = None,
    etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
    force_download: bool = False,
    token: Optional[Union[bool, str]] = None,
    local_files_only: bool = False,
    allow_patterns: Optional[Union[List[str], str]] = None,
    ignore_patterns: Optional[Union[List[str], str]] = None,
    max_workers: int = 8,
    tqdm_class: Optional[Type[base_tqdm]] = None,
    headers: Optional[Dict[str, str]] = None,
    endpoint: Optional[str] = None,
    # Deprecated args
    local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
    resume_download: Optional[bool] = None,
) -> str:
    """Download repo files.

    Download a whole snapshot of a repo's files at the specified revision. This is useful when you want all files from
    a repo, because you don't know which ones you will need a priori. All files are nested inside a folder in order
    to keep their actual filename relative to that folder. You can also filter which files to download using
    `allow_patterns` and `ignore_patterns`.

    If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this
    option, the `cache_dir` will not be used and a `.cache/huggingface/` folder will be created at the root of `local_dir`
    to store some metadata related to the downloaded files. While this mechanism is not as robust as the main
    cache-system, it's optimized for regularly pulling the latest version of a repository.

    An alternative would be to clone the repo but this requires git and git-lfs to be installed and properly
    configured. It is also not possible to filter which files to download when cloning a repository using git.

    Args:
        repo_id (`str`):
            A user or an organization name and a repo name separated by a `/`.
        repo_type (`str`, *optional*):
            Set to `"dataset"` or `"space"` if downloading from a dataset or space,
            `None` or `"model"` if downloading from a model. Default is `None`.
        revision (`str`, *optional*):
            An optional Git revision id which can be a branch name, a tag, or a
            commit hash.
        cache_dir (`str`, `Path`, *optional*):
            Path to the folder where cached files are stored.
        local_dir (`str` or `Path`, *optional*):
            If provided, the downloaded files will be placed under this directory.
        library_name (`str`, *optional*):
            The name of the library to which the object corresponds.
        library_version (`str`, *optional*):
            The version of the library.
        user_agent (`str`, `dict`, *optional*):
            The user-agent info in the form of a dictionary or a string.
        proxies (`dict`, *optional*):
            Dictionary mapping protocol to the URL of the proxy passed to
            `requests.request`.
        etag_timeout (`float`, *optional*, defaults to `10`):
            When fetching ETag, how many seconds to wait for the server to send
            data before giving up, which is passed to `requests.request`.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether the file should be downloaded even if it already exists in the local cache.
        token (`str`, `bool`, *optional*):
            A token to be used for the download.
            - If `True`, the token is read from the HuggingFace config
              folder.
            - If a string, it's used as the authentication token.
        headers (`dict`, *optional*):
            Additional headers to include in the request. Those headers take precedence over the others.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, avoid downloading the file and return the path to the
            local cached file if it exists.
        allow_patterns (`List[str]` or `str`, *optional*):
            If provided, only files matching at least one pattern are downloaded.
        ignore_patterns (`List[str]` or `str`, *optional*):
            If provided, files matching any of the patterns are not downloaded.
        max_workers (`int`, *optional*):
            Number of concurrent threads to download files (1 thread = 1 file download).
            Defaults to 8.
        tqdm_class (`tqdm`, *optional*):
            If provided, overwrites the default behavior for the progress bar. Passed
            argument must inherit from `tqdm.auto.tqdm` or at least mimic its behavior.
            Note that the `tqdm_class` is not passed to each individual download.
            Defaults to the custom HF progress bar that can be disabled by setting
            `HF_HUB_DISABLE_PROGRESS_BARS` environment variable.

    Returns:
        `str`: folder path of the repo snapshot.

    Raises:
        [`~utils.RepositoryNotFoundError`]
            If the repository to download from cannot be found. This may be because it doesn't exist,
            or because it is set to `private` and you do not have access.
        [`~utils.RevisionNotFoundError`]
            If the revision to download from cannot be found.
        [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
            If `token=True` and the token cannot be found.
        [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if
            ETag cannot be determined.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            if some parameter value is invalid.
    """
    if cache_dir is None:
        cache_dir = constants.HF_HUB_CACHE
    if revision is None:
        revision = constants.DEFAULT_REVISION
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    if repo_type is None:
        repo_type = "model"
    if repo_type not in constants.REPO_TYPES:
        raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(constants.REPO_TYPES)}")

    storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))

    api = HfApi(
        library_name=library_name,
        library_version=library_version,
        user_agent=user_agent,
        endpoint=endpoint,
        headers=headers,
        token=token,
    )

    repo_info: Union[ModelInfo, DatasetInfo, SpaceInfo, None] = None
    api_call_error: Optional[Exception] = None
    if not local_files_only:
        # try/except logic to handle different errors => taken from `hf_hub_download`
        try:
            # if we have internet connection we want to list files to download
            repo_info = api.repo_info(repo_id=repo_id, repo_type=repo_type, revision=revision)
        except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
            # Actually raise for those subclasses of ConnectionError
            raise
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
            OfflineModeIsEnabled,
        ) as error:
            # Internet connection is down
            # => will try to use local files only
            api_call_error = error
            pass
        except RevisionNotFoundError:
            # The repo was found but the revision doesn't exist on the Hub (never existed or got deleted)
            raise
        except requests.HTTPError as error:
            # Multiple reasons for an http error:
            # - Repository is private and invalid/missing token sent
            # - Repository is gated and invalid/missing token sent
            # - Hub is down (error 500 or 504)
            # => let's switch to 'local_files_only=True' to check if the files are already cached.
            #    (if it's not the case, the error will be re-raised)
            api_call_error = error
            pass

    # At this stage, if `repo_info` is None it means either:
    # - internet connection is down
    # - internet connection is deactivated (local_files_only=True or HF_HUB_OFFLINE=True)
    # - repo is private/gated and invalid/missing token sent
    # - Hub is down
    # => let's look if we can find the appropriate folder in the cache:
    #    - if the specified revision is a commit hash, look inside "snapshots".
    #    - if the specified revision is a branch or tag, look inside "refs".
    # => if local_dir is not None, we will return the path to the local folder if it exists.
    if repo_info is None:
        # Try to get which commit hash corresponds to the specified revision
        commit_hash = None
        if REGEX_COMMIT_HASH.match(revision):
            commit_hash = revision
        else:
            ref_path = os.path.join(storage_folder, "refs", revision)
            if os.path.exists(ref_path):
                # retrieve commit_hash from refs file
                with open(ref_path) as f:
                    commit_hash = f.read()

        # Try to locate snapshot folder for this commit hash
        if commit_hash is not None and local_dir is None:
            snapshot_folder = os.path.join(storage_folder, "snapshots", commit_hash)
            if os.path.exists(snapshot_folder):
                # Snapshot folder exists => let's return it
                # (but we can't check if all the files are actually there)
                return snapshot_folder

        # If local_dir is not None, return it if it exists and is not empty
        if local_dir is not None:
            local_dir = Path(local_dir)
            if local_dir.is_dir() and any(local_dir.iterdir()):
                logger.warning(
                    f"Returning existing local_dir `{local_dir}` as remote repo cannot be accessed in `snapshot_download` ({api_call_error})."
                )
                return str(local_dir.resolve())
        # If we couldn't find the appropriate folder on disk, raise an error.
        if local_files_only:
            raise LocalEntryNotFoundError(
                "Cannot find an appropriate cached snapshot folder for the specified revision on the local disk and "
                "outgoing traffic has been disabled. To enable repo look-ups and downloads online, pass "
                "'local_files_only=False' as input."
            )
        elif isinstance(api_call_error, OfflineModeIsEnabled):
            raise LocalEntryNotFoundError(
                "Cannot find an appropriate cached snapshot folder for the specified revision on the local disk and "
                "outgoing traffic has been disabled. To enable repo look-ups and downloads online, set "
                "'HF_HUB_OFFLINE=0' as environment variable."
            ) from api_call_error
        elif isinstance(api_call_error, (RepositoryNotFoundError, GatedRepoError)) or (
            isinstance(api_call_error, HfHubHTTPError) and api_call_error.response.status_code == 401
        ):
            # Repo not found, gated, or specific authentication error => let's raise the actual error
            raise api_call_error
        else:
            # Otherwise: most likely a connection issue or Hub downtime => let's warn the user
            raise LocalEntryNotFoundError(
                "An error happened while trying to locate the files on the Hub and we cannot find the appropriate"
                " snapshot folder for the specified revision on the local disk. Please check your internet connection"
                " and try again."
            ) from api_call_error

    # At this stage, internet connection is up and running
    # => let's download the files!
    assert repo_info.sha is not None, "Repo info returned from server must have a revision sha."

    # Corner case: on very large repos, the siblings list in `repo_info` might not contain all files.
    # In that case, we need to use the `list_repo_tree` method to prevent caching issues.
    repo_files: Iterable[str] = [f.rfilename for f in repo_info.siblings] if repo_info.siblings is not None else []
    unreliable_nb_files = (
        repo_info.siblings is None
        or len(repo_info.siblings) == 0
        or len(repo_info.siblings) > VERY_LARGE_REPO_THRESHOLD
    )
    if unreliable_nb_files:
        logger.info(
            "Number of files in the repo is unreliable. Using `list_repo_tree` to ensure all files are listed."
        )
        repo_files = (
            f.rfilename
            for f in api.list_repo_tree(repo_id=repo_id, recursive=True, revision=revision, repo_type=repo_type)
            if isinstance(f, RepoFile)
        )

    filtered_repo_files: Iterable[str] = filter_repo_objects(
        items=repo_files,
        allow_patterns=allow_patterns,
        ignore_patterns=ignore_patterns,
    )

    if not unreliable_nb_files:
        filtered_repo_files = list(filtered_repo_files)
        tqdm_desc = f"Fetching {len(filtered_repo_files)} files"
    else:
        tqdm_desc = "Fetching ... files"

    commit_hash = repo_info.sha
    snapshot_folder = os.path.join(storage_folder, "snapshots", commit_hash)
    # if passed revision is not identical to commit_hash
    # then revision has to be a branch name or tag name.
    # In that case store a ref.
    if revision != commit_hash:
        ref_path = os.path.join(storage_folder, "refs", revision)
        try:
            os.makedirs(os.path.dirname(ref_path), exist_ok=True)
            with open(ref_path, "w") as f:
                f.write(commit_hash)
        except OSError as e:
            logger.warning(f"Ignored error while writing commit hash to {ref_path}: {e}.")

    # we pass the commit_hash to hf_hub_download
    # so no network call happens if we already
    # have the file locally.
    def _inner_hf_hub_download(repo_file: str):
        return hf_hub_download(
            repo_id,
            filename=repo_file,
            repo_type=repo_type,
            revision=commit_hash,
            endpoint=endpoint,
            cache_dir=cache_dir,
            local_dir=local_dir,
            local_dir_use_symlinks=local_dir_use_symlinks,
            library_name=library_name,
            library_version=library_version,
            user_agent=user_agent,
            proxies=proxies,
            etag_timeout=etag_timeout,
            resume_download=resume_download,
            force_download=force_download,
            token=token,
            headers=headers,
        )

    if constants.HF_HUB_ENABLE_HF_TRANSFER:
        # when using hf_transfer we don't want extra parallelism
        # from the one hf_transfer provides
        for file in filtered_repo_files:
            _inner_hf_hub_download(file)
    else:
        thread_map(
            _inner_hf_hub_download,
            filtered_repo_files,
            desc=tqdm_desc,
            max_workers=max_workers,
            # User can use its own tqdm class or the default one from `huggingface_hub.utils`
            tqdm_class=tqdm_class or hf_tqdm,
        )

    if local_dir is not None:
        return str(os.path.realpath(local_dir))
    return snapshot_folder
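As a usage sketch of `snapshot_download` (the repo id and patterns below are illustrative; any Hub repo works the same way):

```py
from huggingface_hub import snapshot_download

# Download only the JSON files of a model repo into the shared cache:
local_path = snapshot_download(repo_id="gpt2", allow_patterns=["*.json"])
print(local_path)  # .../models--gpt2/snapshots/<commit-hash>

# Or mirror the full repo into a plain folder instead of the cache:
local_dir_path = snapshot_download(repo_id="gpt2", local_dir="./gpt2-copy")
```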
venv/lib/python3.13/site-packages/huggingface_hub/_space_api.py
ADDED
@@ -0,0 +1,168 @@
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from typing import Dict, Optional

from huggingface_hub.utils import parse_datetime


class SpaceStage(str, Enum):
    """
    Enumeration of the possible stages of a Space on the Hub.

    Value can be compared to a string:
    ```py
    assert SpaceStage.BUILDING == "BUILDING"
    ```

    Taken from https://github.com/huggingface/moon-landing/blob/main/server/repo_types/SpaceInfo.ts#L61 (private url).
    """

    # Copied from moon-landing > server > repo_types > SpaceInfo.ts (private repo)
    NO_APP_FILE = "NO_APP_FILE"
    CONFIG_ERROR = "CONFIG_ERROR"
    BUILDING = "BUILDING"
    BUILD_ERROR = "BUILD_ERROR"
    RUNNING = "RUNNING"
    RUNNING_BUILDING = "RUNNING_BUILDING"
    RUNTIME_ERROR = "RUNTIME_ERROR"
    DELETING = "DELETING"
    STOPPED = "STOPPED"
    PAUSED = "PAUSED"


class SpaceHardware(str, Enum):
    """
    Enumeration of the hardware available to run your Space on the Hub.

    Value can be compared to a string:
    ```py
    assert SpaceHardware.CPU_BASIC == "cpu-basic"
    ```

    Taken from https://github.com/huggingface-internal/moon-landing/blob/main/server/repo_types/SpaceHardwareFlavor.ts (private url).
    """

    # CPU
    CPU_BASIC = "cpu-basic"
    CPU_UPGRADE = "cpu-upgrade"
    CPU_XL = "cpu-xl"

    # ZeroGPU
    ZERO_A10G = "zero-a10g"

    # GPU
    T4_SMALL = "t4-small"
    T4_MEDIUM = "t4-medium"
    L4X1 = "l4x1"
    L4X4 = "l4x4"
    L40SX1 = "l40sx1"
    L40SX4 = "l40sx4"
    L40SX8 = "l40sx8"
    A10G_SMALL = "a10g-small"
    A10G_LARGE = "a10g-large"
    A10G_LARGEX2 = "a10g-largex2"
    A10G_LARGEX4 = "a10g-largex4"
    A100_LARGE = "a100-large"
    H100 = "h100"
    H100X8 = "h100x8"


class SpaceStorage(str, Enum):
    """
    Enumeration of persistent storage available for your Space on the Hub.

    Value can be compared to a string:
    ```py
    assert SpaceStorage.SMALL == "small"
    ```

    Taken from https://github.com/huggingface/moon-landing/blob/main/server/repo_types/SpaceHardwareFlavor.ts#L24 (private url).
    """

    SMALL = "small"
    MEDIUM = "medium"
    LARGE = "large"


@dataclass
class SpaceRuntime:
    """
    Contains information about the current runtime of a Space.

    Args:
        stage (`str`):
            Current stage of the space. Example: RUNNING.
        hardware (`str` or `None`):
            Current hardware of the space. Example: "cpu-basic". Can be `None` if Space
            is `BUILDING` for the first time.
        requested_hardware (`str` or `None`):
            Requested hardware. Can be different from `hardware`, especially if the request
            has just been made. Example: "t4-medium". Can be `None` if no hardware has
            been requested yet.
        sleep_time (`int` or `None`):
            Number of seconds the Space will be kept alive after the last request. By default (if value is `None`), the
            Space will never go to sleep if it's running on an upgraded hardware, while it will go to sleep after 48
            hours on a free 'cpu-basic' hardware. For more details, see https://huggingface.co/docs/hub/spaces-gpus#sleep-time.
        storage (`str` or `None`):
            Current persistent storage of the Space, if any. Example: "small".
        raw (`dict`):
            Raw response from the server. Contains more information about the Space
            runtime like number of replicas, number of cpu, memory size, ...
    """

    stage: SpaceStage
    hardware: Optional[SpaceHardware]
    requested_hardware: Optional[SpaceHardware]
    sleep_time: Optional[int]
    storage: Optional[SpaceStorage]
    raw: Dict

    def __init__(self, data: Dict) -> None:
        self.stage = data["stage"]
        self.hardware = data.get("hardware", {}).get("current")
        self.requested_hardware = data.get("hardware", {}).get("requested")
        self.sleep_time = data.get("gcTimeout")
        self.storage = data.get("storage")
        self.raw = data


@dataclass
class SpaceVariable:
    """
    Contains information about the current variables of a Space.

    Args:
        key (`str`):
            Variable key. Example: `"MODEL_REPO_ID"`
        value (`str`):
            Variable value. Example: `"the_model_repo_id"`.
        description (`str` or `None`):
            Description of the variable. Example: `"Model Repo ID of the implemented model"`.
        updated_at (`datetime` or `None`):
            Datetime of the last update of the variable (if the variable has been updated at least once).
    """

    key: str
    value: str
    description: Optional[str]
    updated_at: Optional[datetime]

    def __init__(self, key: str, values: Dict) -> None:
        self.key = key
        self.value = values["value"]
        self.description = values.get("description")
        updated_at = values.get("updatedAt")
        self.updated_at = parse_datetime(updated_at) if updated_at is not None else None
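A minimal sketch of how `SpaceRuntime` consumes the raw server payload described above. The payload values here are invented for illustration; only the key names (`stage`, `hardware.current`, `hardware.requested`, `gcTimeout`) come from the parsing code itself:

```py
from huggingface_hub import SpaceRuntime, SpaceStage

data = {
    "stage": "RUNNING",
    "hardware": {"current": "cpu-basic", "requested": "t4-small"},
    "gcTimeout": 3600,
}
runtime = SpaceRuntime(data)
assert runtime.stage == SpaceStage.RUNNING  # str-enum compares equal to "RUNNING"
assert runtime.hardware == "cpu-basic"
assert runtime.requested_hardware == "t4-small"
assert runtime.sleep_time == 3600
assert runtime.raw is data  # full payload kept around for extra fields
```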
venv/lib/python3.13/site-packages/huggingface_hub/_tensorboard_logger.py
ADDED
@@ -0,0 +1,190 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains a logger to push training logs to the Hub, using Tensorboard."""

from pathlib import Path
from typing import List, Optional, Union

from ._commit_scheduler import CommitScheduler
from .errors import EntryNotFoundError
from .repocard import ModelCard
from .utils import experimental


# Depending on user's setup, SummaryWriter can come either from 'tensorboardX'
# or from 'torch.utils.tensorboard'. Both are compatible so let's try to load
# from either of them.
try:
    from tensorboardX import SummaryWriter as _RuntimeSummaryWriter

    is_summary_writer_available = True
except ImportError:
    try:
        from torch.utils.tensorboard import SummaryWriter as _RuntimeSummaryWriter

        is_summary_writer_available = True
    except ImportError:
        # Dummy class to avoid failing at import. Will raise on instance creation.
        class _DummySummaryWriter:
            pass

        _RuntimeSummaryWriter = _DummySummaryWriter  # type: ignore[assignment]
        is_summary_writer_available = False


class HFSummaryWriter(_RuntimeSummaryWriter):
    """
    Wrapper around tensorboard's `SummaryWriter` to push training logs to the Hub.

    Data is logged locally and then pushed to the Hub asynchronously. Pushing data to the Hub is done in a separate
    thread to avoid blocking the training script. In particular, if the upload fails for any reason (e.g. a connection
    issue), the main script will not be interrupted. Data is automatically pushed to the Hub every `commit_every`
    minutes (defaults to every 5 minutes).

    > [!WARNING]
    > `HFSummaryWriter` is experimental. Its API is subject to change in the future without prior notice.

    Args:
        repo_id (`str`):
            The id of the repo to which the logs will be pushed.
        logdir (`str`, *optional*):
            The directory where the logs will be written. If not specified, a local directory will be created by the
            underlying `SummaryWriter` object.
        commit_every (`int` or `float`, *optional*):
            The frequency (in minutes) at which the logs will be pushed to the Hub. Defaults to 5 minutes.
        squash_history (`bool`, *optional*):
            Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
            useful to avoid degraded performances on the repo when it grows too large.
        repo_type (`str`, *optional*):
            The type of the repo to which the logs will be pushed. Defaults to "model".
        repo_revision (`str`, *optional*):
            The revision of the repo to which the logs will be pushed. Defaults to "main".
        repo_private (`bool`, *optional*):
            Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists.
        path_in_repo (`str`, *optional*):
            The path to the folder in the repo where the logs will be pushed. Defaults to "tensorboard/".
        repo_allow_patterns (`List[str]` or `str`, *optional*):
            A list of patterns to include in the upload. Defaults to `"*.tfevents.*"`. Check out the
            [upload guide](https://huggingface.co/docs/huggingface_hub/guides/upload#upload-a-folder) for more details.
        repo_ignore_patterns (`List[str]` or `str`, *optional*):
            A list of patterns to exclude in the upload. Check out the
            [upload guide](https://huggingface.co/docs/huggingface_hub/guides/upload#upload-a-folder) for more details.
        token (`str`, *optional*):
            Authentication token. Will default to the stored token. See https://huggingface.co/settings/token for more
            details.
        kwargs:
            Additional keyword arguments passed to `SummaryWriter`.

    Examples:
    ```diff
    # Taken from https://pytorch.org/docs/stable/tensorboard.html
    - from torch.utils.tensorboard import SummaryWriter
    + from huggingface_hub import HFSummaryWriter

    import numpy as np

    - writer = SummaryWriter()
    + writer = HFSummaryWriter(repo_id="username/my-trained-model")

    for n_iter in range(100):
        writer.add_scalar('Loss/train', np.random.random(), n_iter)
        writer.add_scalar('Loss/test', np.random.random(), n_iter)
        writer.add_scalar('Accuracy/train', np.random.random(), n_iter)
        writer.add_scalar('Accuracy/test', np.random.random(), n_iter)
    ```

    ```py
    >>> from huggingface_hub import HFSummaryWriter

    # Logs are automatically pushed every 15 minutes (5 by default) + when exiting the context manager
    >>> with HFSummaryWriter(repo_id="test_hf_logger", commit_every=15) as logger:
    ...     logger.add_scalar("a", 1)
    ...     logger.add_scalar("b", 2)
    ```
    """

    @experimental
    def __new__(cls, *args, **kwargs) -> "HFSummaryWriter":
        if not is_summary_writer_available:
            raise ImportError(
                "You must have `tensorboard` installed to use `HFSummaryWriter`. Please run `pip install --upgrade"
                " tensorboardX` first."
            )
        return super().__new__(cls)

    def __init__(
        self,
        repo_id: str,
        *,
        logdir: Optional[str] = None,
        commit_every: Union[int, float] = 5,
        squash_history: bool = False,
        repo_type: Optional[str] = None,
        repo_revision: Optional[str] = None,
        repo_private: Optional[bool] = None,
        path_in_repo: Optional[str] = "tensorboard",
        repo_allow_patterns: Optional[Union[List[str], str]] = "*.tfevents.*",
        repo_ignore_patterns: Optional[Union[List[str], str]] = None,
        token: Optional[str] = None,
        **kwargs,
    ):
        # Initialize SummaryWriter
        super().__init__(logdir=logdir, **kwargs)

        # Check logdir has been correctly initialized and fail early otherwise. In practice, SummaryWriter takes care of it.
        if not isinstance(self.logdir, str):
            raise ValueError(f"`self.logdir` must be a string. Got '{self.logdir}' of type {type(self.logdir)}.")

        # Append logdir name to `path_in_repo`
        if path_in_repo is None or path_in_repo == "":
            path_in_repo = Path(self.logdir).name
        else:
            path_in_repo = path_in_repo.strip("/") + "/" + Path(self.logdir).name

        # Initialize scheduler
        self.scheduler = CommitScheduler(
            folder_path=self.logdir,
            path_in_repo=path_in_repo,
            repo_id=repo_id,
            repo_type=repo_type,
            revision=repo_revision,
            private=repo_private,
            token=token,
            allow_patterns=repo_allow_patterns,
            ignore_patterns=repo_ignore_patterns,
            every=commit_every,
            squash_history=squash_history,
        )

        # Exposing some high-level info at root level
        self.repo_id = self.scheduler.repo_id
        self.repo_type = self.scheduler.repo_type
        self.repo_revision = self.scheduler.revision

        # Add `hf-summary-writer` tag to the model card metadata
        try:
            card = ModelCard.load(repo_id_or_path=self.repo_id, repo_type=self.repo_type)
        except EntryNotFoundError:
            card = ModelCard("")
        tags = card.data.get("tags", [])
        if "hf-summary-writer" not in tags:
            tags.append("hf-summary-writer")
        card.data["tags"] = tags
        card.push_to_hub(repo_id=self.repo_id, repo_type=self.repo_type)

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Push to hub in a non-blocking way when exiting the logger's context manager."""
        super().__exit__(exc_type, exc_val, exc_tb)
        future = self.scheduler.trigger()
        future.result()
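Beyond the docstring examples above, a hedged sketch of flushing logs without the context manager, via the `CommitScheduler` exposed as `writer.scheduler` (the repo id is illustrative, and this requires a logged-in machine since the writer pushes to the Hub):

```py
from huggingface_hub import HFSummaryWriter

writer = HFSummaryWriter(repo_id="username/my-logs", commit_every=5)
writer.add_scalar("loss", 0.42, 1)
# Force an immediate commit instead of waiting for the next scheduled push;
# trigger() returns a Future, so .result() blocks until the commit lands.
writer.scheduler.trigger().result()
```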
venv/lib/python3.13/site-packages/huggingface_hub/_upload_large_folder.py
ADDED
@@ -0,0 +1,755 @@
| 1 |
+
# coding=utf-8
|
| 2 |
+
# Copyright 2024-present, the HuggingFace Inc. team.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
import enum
|
| 16 |
+
import logging
|
| 17 |
+
import os
|
| 18 |
+
import queue
|
| 19 |
+
import shutil
|
| 20 |
+
import sys
|
| 21 |
+
import threading
|
| 22 |
+
import time
|
| 23 |
+
import traceback
|
| 24 |
+
from datetime import datetime
|
| 25 |
+
from pathlib import Path
|
| 26 |
+
from threading import Lock
|
| 27 |
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
| 28 |
+
from urllib.parse import quote
|
| 29 |
+
|
| 30 |
+
from . import constants
|
| 31 |
+
from ._commit_api import CommitOperationAdd, UploadInfo, _fetch_upload_modes
|
| 32 |
+
from ._local_folder import LocalUploadFileMetadata, LocalUploadFilePaths, get_local_upload_paths, read_upload_metadata
|
| 33 |
+
from .constants import DEFAULT_REVISION, REPO_TYPES
|
| 34 |
+
from .utils import DEFAULT_IGNORE_PATTERNS, filter_repo_objects, tqdm
|
| 35 |
+
from .utils._cache_manager import _format_size
|
| 36 |
+
from .utils._runtime import is_xet_available
|
| 37 |
+
from .utils.sha import sha_fileobj
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
if TYPE_CHECKING:
|
| 41 |
+
from .hf_api import HfApi
|
| 42 |
+
|
| 43 |
+
logger = logging.getLogger(__name__)
|
| 44 |
+
|
| 45 |
+
WAITING_TIME_IF_NO_TASKS = 10 # seconds
|
| 46 |
+
MAX_NB_FILES_FETCH_UPLOAD_MODE = 100
|
| 47 |
+
COMMIT_SIZE_SCALE: List[int] = [20, 50, 75, 100, 125, 200, 250, 400, 600, 1000]
|
| 48 |
+
|
| 49 |
+
UPLOAD_BATCH_SIZE_XET = 256 # Max 256 files per upload batch for XET-enabled repos
|
| 50 |
+
UPLOAD_BATCH_SIZE_LFS = 1 # Otherwise, batches of 1 for regular LFS upload
|
| 51 |
+
|
| 52 |
+
# Repository limits (from https://huggingface.co/docs/hub/repositories-recommendations)
|
| 53 |
+
MAX_FILES_PER_REPO = 100_000 # Recommended maximum number of files per repository
|
| 54 |
+
MAX_FILES_PER_FOLDER = 10_000 # Recommended maximum number of files per folder
|
| 55 |
+
MAX_FILE_SIZE_GB = 50 # Hard limit for individual file size
|
| 56 |
+
RECOMMENDED_FILE_SIZE_GB = 20 # Recommended maximum for individual file size
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _validate_upload_limits(paths_list: List[LocalUploadFilePaths]) -> None:
|
| 60 |
+
"""
|
| 61 |
+
Validate upload against repository limits and warn about potential issues.
|
| 62 |
+
|
| 63 |
+
Args:
|
| 64 |
+
paths_list: List of file paths to be uploaded
|
| 65 |
+
|
| 66 |
+
Warns about:
|
| 67 |
+
- Too many files in the repository (>100k)
|
| 68 |
+
- Too many entries (files or subdirectories) in a single folder (>10k)
|
| 69 |
+
- Files exceeding size limits (>20GB recommended, >50GB hard limit)
|
| 70 |
+
"""
|
| 71 |
+
logger.info("Running validation checks on files to upload...")
|
| 72 |
+
|
| 73 |
+
# Check 1: Total file count
|
| 74 |
+
if len(paths_list) > MAX_FILES_PER_REPO:
|
| 75 |
+
logger.warning(
|
| 76 |
+
f"You are about to upload {len(paths_list):,} files. "
|
| 77 |
+
f"This exceeds the recommended limit of {MAX_FILES_PER_REPO:,} files per repository.\n"
|
| 78 |
+
f"Consider:\n"
|
| 79 |
+
f" - Splitting your data into multiple repositories\n"
|
| 80 |
+
f" - Using fewer, larger files (e.g., parquet files)\n"
|
| 81 |
+
f" - See: https://huggingface.co/docs/hub/repositories-recommendations"
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
# Check 2: Files and subdirectories per folder
|
| 85 |
+
# Track immediate children (files and subdirs) for each folder
|
| 86 |
+
from collections import defaultdict
|
| 87 |
+
|
| 88 |
+
entries_per_folder: Dict[str, Any] = defaultdict(lambda: {"files": 0, "subdirs": set()})
|
| 89 |
+
|
| 90 |
+
for paths in paths_list:
|
| 91 |
+
path = Path(paths.path_in_repo)
|
| 92 |
+
parts = path.parts
|
| 93 |
+
|
| 94 |
+
# Count this file in its immediate parent directory
|
| 95 |
+
parent = str(path.parent) if str(path.parent) != "." else "."
|
| 96 |
+
entries_per_folder[parent]["files"] += 1
|
| 97 |
+
|
| 98 |
+
# Track immediate subdirectories for each parent folder
|
| 99 |
+
# Walk through the path components to track parent-child relationships
|
| 100 |
+
for i, child in enumerate(parts[:-1]):
|
| 101 |
+
parent = "." if i == 0 else "/".join(parts[:i])
|
| 102 |
+
entries_per_folder[parent]["subdirs"].add(child)
|
| 103 |
+
|
| 104 |
+
# Check limits for each folder
|
| 105 |
+
for folder, data in entries_per_folder.items():
|
| 106 |
+
file_count = data["files"]
|
| 107 |
+
subdir_count = len(data["subdirs"])
|
| 108 |
+
total_entries = file_count + subdir_count
|
| 109 |
+
|
| 110 |
+
if total_entries > MAX_FILES_PER_FOLDER:
|
| 111 |
+
folder_display = "root" if folder == "." else folder
|
| 112 |
+
logger.warning(
|
| 113 |
+
f"Folder '{folder_display}' contains {total_entries:,} entries "
|
| 114 |
+
f"({file_count:,} files and {subdir_count:,} subdirectories). "
|
| 115 |
+
f"This exceeds the recommended {MAX_FILES_PER_FOLDER:,} entries per folder.\n"
|
| 116 |
+
"Consider reorganising into sub-folders."
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
# Check 3: File sizes
|
| 120 |
+
large_files = []
|
| 121 |
+
very_large_files = []
|
| 122 |
+
|
| 123 |
+
for paths in paths_list:
|
| 124 |
+
size = paths.file_path.stat().st_size
|
| 125 |
+
size_gb = size / 1_000_000_000 # Use decimal GB as per Hub limits
|
| 126 |
+
|
| 127 |
+
if size_gb > MAX_FILE_SIZE_GB:
|
| 128 |
+
very_large_files.append((paths.path_in_repo, size_gb))
|
| 129 |
+
elif size_gb > RECOMMENDED_FILE_SIZE_GB:
|
| 130 |
+
large_files.append((paths.path_in_repo, size_gb))
|
| 131 |
+
|
| 132 |
+
# Warn about very large files (>50GB)
|
| 133 |
+
if very_large_files:
|
| 134 |
+
files_str = "\n - ".join(f"{path}: {size:.1f}GB" for path, size in very_large_files[:5])
|
| 135 |
+
more_str = f"\n ... and {len(very_large_files) - 5} more files" if len(very_large_files) > 5 else ""
|
| 136 |
+
logger.warning(
|
| 137 |
+
f"Found {len(very_large_files)} files exceeding the {MAX_FILE_SIZE_GB}GB hard limit:\n"
|
| 138 |
+
f" - {files_str}{more_str}\n"
|
| 139 |
+
f"These files may fail to upload. Consider splitting them into smaller chunks."
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
# Warn about large files (>20GB)
|
| 143 |
+
if large_files:
|
| 144 |
+
files_str = "\n - ".join(f"{path}: {size:.1f}GB" for path, size in large_files[:5])
|
| 145 |
+
more_str = f"\n ... and {len(large_files) - 5} more files" if len(large_files) > 5 else ""
|
| 146 |
+
logger.warning(
|
| 147 |
+
f"Found {len(large_files)} files larger than {RECOMMENDED_FILE_SIZE_GB}GB (recommended limit):\n"
|
| 148 |
+
f" - {files_str}{more_str}\n"
|
| 149 |
+
f"Large files may slow down loading and processing."
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
logger.info("Validation checks complete.")
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def upload_large_folder_internal(
|
| 156 |
+
api: "HfApi",
|
| 157 |
+
repo_id: str,
|
| 158 |
+
folder_path: Union[str, Path],
|
| 159 |
+
*,
|
| 160 |
+
repo_type: str, # Repo type is required!
|
| 161 |
+
revision: Optional[str] = None,
|
| 162 |
+
private: Optional[bool] = None,
|
| 163 |
+
allow_patterns: Optional[Union[List[str], str]] = None,
|
| 164 |
+
ignore_patterns: Optional[Union[List[str], str]] = None,
|
| 165 |
+
num_workers: Optional[int] = None,
|
| 166 |
+
print_report: bool = True,
|
| 167 |
+
print_report_every: int = 60,
|
| 168 |
+
):
|
| 169 |
+
"""Upload a large folder to the Hub in the most resilient way possible.
|
| 170 |
+
|
| 171 |
+
See [`HfApi.upload_large_folder`] for the full documentation.
|
| 172 |
+
"""
|
| 173 |
+
# 1. Check args and setup
|
| 174 |
+
if repo_type is None:
|
| 175 |
+
raise ValueError(
|
| 176 |
+
"For large uploads, `repo_type` is explicitly required. Please set it to `model`, `dataset` or `space`."
|
| 177 |
+
" If you are using the CLI, pass it as `--repo-type=model`."
|
| 178 |
+
)
|
| 179 |
+
if repo_type not in REPO_TYPES:
|
| 180 |
+
raise ValueError(f"Invalid repo type, must be one of {REPO_TYPES}")
|
| 181 |
+
if revision is None:
|
| 182 |
+
revision = DEFAULT_REVISION
|
| 183 |
+
|
| 184 |
+
folder_path = Path(folder_path).expanduser().resolve()
|
| 185 |
+
if not folder_path.is_dir():
|
| 186 |
+
raise ValueError(f"Provided path: '{folder_path}' is not a directory")
|
| 187 |
+
|
| 188 |
+
if ignore_patterns is None:
|
| 189 |
+
ignore_patterns = []
|
| 190 |
+
elif isinstance(ignore_patterns, str):
|
| 191 |
+
ignore_patterns = [ignore_patterns]
|
| 192 |
+
ignore_patterns += DEFAULT_IGNORE_PATTERNS
|
| 193 |
+
|
| 194 |
+
if num_workers is None:
|
| 195 |
+
nb_cores = os.cpu_count() or 1
|
| 196 |
+
num_workers = max(nb_cores - 2, 2) # Use all but 2 cores, or at least 2 cores
|
| 197 |
+
|
| 198 |
+
# 2. Create repo if missing
|
| 199 |
+
repo_url = api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private, exist_ok=True)
|
| 200 |
+
logger.info(f"Repo created: {repo_url}")
|
| 201 |
+
repo_id = repo_url.repo_id
|
| 202 |
+
# 2.1 Check if xet is enabled to set batch file upload size
|
| 203 |
+
is_xet_enabled = (
|
| 204 |
+
is_xet_available()
|
| 205 |
+
and api.repo_info(
|
| 206 |
+
repo_id=repo_id,
|
| 207 |
+
repo_type=repo_type,
|
| 208 |
+
revision=revision,
|
| 209 |
+
expand="xetEnabled",
|
| 210 |
+
).xet_enabled
|
| 211 |
+
)
|
| 212 |
+
upload_batch_size = UPLOAD_BATCH_SIZE_XET if is_xet_enabled else UPLOAD_BATCH_SIZE_LFS
|
| 213 |
+
|
| 214 |
+
# 3. List files to upload
|
| 215 |
+
filtered_paths_list = filter_repo_objects(
|
| 216 |
+
(path.relative_to(folder_path).as_posix() for path in folder_path.glob("**/*") if path.is_file()),
|
| 217 |
+
allow_patterns=allow_patterns,
|
| 218 |
+
ignore_patterns=ignore_patterns,
|
| 219 |
+
)
|
| 220 |
+
paths_list = [get_local_upload_paths(folder_path, relpath) for relpath in filtered_paths_list]
|
| 221 |
+
logger.info(f"Found {len(paths_list)} candidate files to upload")
|
| 222 |
+
|
| 223 |
+
# Validate upload against repository limits
|
| 224 |
+
_validate_upload_limits(paths_list)
|
| 225 |
+
|
| 226 |
+
logger.info("Starting upload...")
|
| 227 |
+
|
| 228 |
+
# Read metadata for each file
|
| 229 |
+
items = [
|
| 230 |
+
(paths, read_upload_metadata(folder_path, paths.path_in_repo))
|
| 231 |
+
for paths in tqdm(paths_list, desc="Recovering from metadata files")
|
| 232 |
+
]
|
| 233 |
+
|
| 234 |
+
# 4. Start workers
|
| 235 |
+
status = LargeUploadStatus(items, upload_batch_size)
|
| 236 |
+
threads = [
|
| 237 |
+
threading.Thread(
|
| 238 |
+
target=_worker_job,
|
| 239 |
+
kwargs={
|
| 240 |
+
"status": status,
|
| 241 |
+
"api": api,
|
| 242 |
+
"repo_id": repo_id,
|
| 243 |
+
"repo_type": repo_type,
|
| 244 |
+
"revision": revision,
|
| 245 |
+
},
|
| 246 |
+
)
|
| 247 |
+
for _ in range(num_workers)
|
| 248 |
+
]
|
| 249 |
+
|
| 250 |
+
for thread in threads:
|
| 251 |
+
thread.start()
|
| 252 |
+
|
| 253 |
+
# 5. Print regular reports
|
| 254 |
+
if print_report:
|
| 255 |
+
print("\n\n" + status.current_report())
|
| 256 |
+
last_report_ts = time.time()
|
| 257 |
+
while True:
|
| 258 |
+
time.sleep(1)
|
| 259 |
+
if time.time() - last_report_ts >= print_report_every:
|
| 260 |
+
if print_report:
|
| 261 |
+
_print_overwrite(status.current_report())
|
| 262 |
+
last_report_ts = time.time()
|
| 263 |
+
if status.is_done():
|
| 264 |
+
logging.info("Is done: exiting main loop")
|
| 265 |
+
break
|
| 266 |
+
|
| 267 |
+
for thread in threads:
|
| 268 |
+
thread.join()
|
| 269 |
+
|
| 270 |
+
logger.info(status.current_report())
|
| 271 |
+
logging.info("Upload is complete!")
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
####################
# Logic to manage workers and synchronize tasks
####################


class WorkerJob(enum.Enum):
    SHA256 = enum.auto()
    GET_UPLOAD_MODE = enum.auto()
    PREUPLOAD_LFS = enum.auto()
    COMMIT = enum.auto()
    WAIT = enum.auto()  # if no tasks are available but we don't want to exit


JOB_ITEM_T = Tuple[LocalUploadFilePaths, LocalUploadFileMetadata]


class LargeUploadStatus:
    """Contains information, queues and tasks for a large upload process."""

    def __init__(self, items: List[JOB_ITEM_T], upload_batch_size: int = 1):
        self.items = items
        self.queue_sha256: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
        self.queue_get_upload_mode: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
        self.queue_preupload_lfs: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
        self.queue_commit: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
        self.lock = Lock()

        self.nb_workers_sha256: int = 0
        self.nb_workers_get_upload_mode: int = 0
        self.nb_workers_preupload_lfs: int = 0
        self.upload_batch_size: int = upload_batch_size
        self.nb_workers_commit: int = 0
        self.nb_workers_waiting: int = 0
        self.last_commit_attempt: Optional[float] = None

        self._started_at = datetime.now()
        self._chunk_idx: int = 1
        self._chunk_lock: Lock = Lock()

        # Setup queues
        for item in self.items:
            paths, metadata = item
            if metadata.sha256 is None:
                self.queue_sha256.put(item)
            elif metadata.upload_mode is None:
                self.queue_get_upload_mode.put(item)
            elif metadata.upload_mode == "lfs" and not metadata.is_uploaded:
                self.queue_preupload_lfs.put(item)
            elif not metadata.is_committed:
                self.queue_commit.put(item)
            else:
                logger.debug(f"Skipping file {paths.path_in_repo} (already uploaded and committed)")

    def target_chunk(self) -> int:
        with self._chunk_lock:
            return COMMIT_SIZE_SCALE[self._chunk_idx]

    def update_chunk(self, success: bool, nb_items: int, duration: float) -> None:
        with self._chunk_lock:
            if not success:
                logger.warning(f"Failed to commit {nb_items} files at once. Will retry with fewer files in next batch.")
                self._chunk_idx -= 1
            elif nb_items >= COMMIT_SIZE_SCALE[self._chunk_idx] and duration < 40:
                logger.info(f"Successfully committed {nb_items} files at once. Increasing the limit for next batch.")
                self._chunk_idx += 1

            self._chunk_idx = max(0, min(self._chunk_idx, len(COMMIT_SIZE_SCALE) - 1))

    def current_report(self) -> str:
        """Generate a report of the current status of the large upload."""
        nb_hashed = 0
        size_hashed = 0
        nb_preuploaded = 0
        nb_lfs = 0
        nb_lfs_unsure = 0
        size_preuploaded = 0
        nb_committed = 0
        size_committed = 0
        total_size = 0
        ignored_files = 0
        total_files = 0

        with self.lock:
            for _, metadata in self.items:
                if metadata.should_ignore:
                    ignored_files += 1
                    continue
                total_size += metadata.size
                total_files += 1
                if metadata.sha256 is not None:
                    nb_hashed += 1
                    size_hashed += metadata.size
                if metadata.upload_mode == "lfs":
                    nb_lfs += 1
                if metadata.upload_mode is None:
                    nb_lfs_unsure += 1
                if metadata.is_uploaded:
                    nb_preuploaded += 1
                    size_preuploaded += metadata.size
                if metadata.is_committed:
                    nb_committed += 1
                    size_committed += metadata.size
        total_size_str = _format_size(total_size)

        now = datetime.now()
        now_str = now.strftime("%Y-%m-%d %H:%M:%S")
        elapsed = now - self._started_at
        elapsed_str = str(elapsed).split(".")[0]  # remove milliseconds

        message = "\n" + "-" * 10
        message += f" {now_str} ({elapsed_str}) "
        message += "-" * 10 + "\n"

        message += "Files: "
        message += f"hashed {nb_hashed}/{total_files} ({_format_size(size_hashed)}/{total_size_str}) | "
        message += f"pre-uploaded: {nb_preuploaded}/{nb_lfs} ({_format_size(size_preuploaded)}/{total_size_str})"
        if nb_lfs_unsure > 0:
            message += f" (+{nb_lfs_unsure} unsure)"
        message += f" | committed: {nb_committed}/{total_files} ({_format_size(size_committed)}/{total_size_str})"
        message += f" | ignored: {ignored_files}\n"

        message += "Workers: "
        message += f"hashing: {self.nb_workers_sha256} | "
        message += f"get upload mode: {self.nb_workers_get_upload_mode} | "
        message += f"pre-uploading: {self.nb_workers_preupload_lfs} | "
        message += f"committing: {self.nb_workers_commit} | "
        message += f"waiting: {self.nb_workers_waiting}\n"
        message += "-" * 51

        return message

    def is_done(self) -> bool:
        with self.lock:
            return all(metadata.is_committed or metadata.should_ignore for _, metadata in self.items)

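# --- Editor's illustrative sketch (not part of the library) -------------------
# How the adaptive commit batch size reacts to commit outcomes. It assumes
# `COMMIT_SIZE_SCALE` (defined earlier in this module) is an increasing ramp of
# batch sizes: a fast success on a full batch moves the target one step up the
# scale, a failure moves it one step down, clamped to the scale bounds.
def _demo_adaptive_chunk() -> None:
    status = LargeUploadStatus(items=[])
    first = status.target_chunk()  # starts at COMMIT_SIZE_SCALE[1]
    status.update_chunk(success=True, nb_items=first, duration=10.0)  # fast full batch
    ramped = status.target_chunk()
    assert ramped >= first  # target moved up (or stayed at the top of the scale)
    status.update_chunk(success=False, nb_items=ramped, duration=60.0)  # failed commit
    assert status.target_chunk() <= ramped  # target moved back down
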
def _worker_job(
    status: LargeUploadStatus,
    api: "HfApi",
    repo_id: str,
    repo_type: str,
    revision: str,
):
    """
    Main process for a worker. The worker will perform tasks based on the priority list until all files are uploaded
    and committed. If no tasks are available, the worker will wait for 10 seconds before checking again.

    If a task fails for any reason, the item(s) are put back in the queue for another worker to pick up.

    Read `upload_large_folder` docstring for more information on how tasks are prioritized.
    """
    while True:
        next_job: Optional[Tuple[WorkerJob, List[JOB_ITEM_T]]] = None

        # Determine next task
        next_job = _determine_next_job(status)
        if next_job is None:
            return
        job, items = next_job

        # Perform task
        if job == WorkerJob.SHA256:
            item = items[0]  # single item
            try:
                _compute_sha256(item)
                status.queue_get_upload_mode.put(item)
            except KeyboardInterrupt:
                raise
            except Exception as e:
                logger.error(f"Failed to compute sha256: {e}")
                logger.debug(traceback.format_exc())
                status.queue_sha256.put(item)

            with status.lock:
                status.nb_workers_sha256 -= 1

        elif job == WorkerJob.GET_UPLOAD_MODE:
            try:
                _get_upload_mode(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
            except KeyboardInterrupt:
                raise
            except Exception as e:
                logger.error(f"Failed to get upload mode: {e}")
                logger.debug(traceback.format_exc())

            # Items are either:
            # - dropped (if should_ignore)
            # - put in LFS queue (if LFS)
            # - put in commit queue (if regular)
            # - or put back (if error occurred).
            for item in items:
                _, metadata = item
                if metadata.should_ignore:
                    continue
                if metadata.upload_mode == "lfs":
                    status.queue_preupload_lfs.put(item)
                elif metadata.upload_mode == "regular":
                    status.queue_commit.put(item)
                else:
                    status.queue_get_upload_mode.put(item)

            with status.lock:
                status.nb_workers_get_upload_mode -= 1

        elif job == WorkerJob.PREUPLOAD_LFS:
            try:
                _preupload_lfs(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
                for item in items:
                    status.queue_commit.put(item)
            except KeyboardInterrupt:
                raise
            except Exception as e:
                logger.error(f"Failed to preupload LFS: {e}")
                logger.debug(traceback.format_exc())
                for item in items:
                    status.queue_preupload_lfs.put(item)

            with status.lock:
                status.nb_workers_preupload_lfs -= 1

        elif job == WorkerJob.COMMIT:
            start_ts = time.time()
            success = True
            try:
                _commit(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
            except KeyboardInterrupt:
                raise
            except Exception as e:
                logger.error(f"Failed to commit: {e}")
                logger.debug(traceback.format_exc())
                for item in items:
                    status.queue_commit.put(item)
                success = False
            duration = time.time() - start_ts
            status.update_chunk(success, len(items), duration)
            with status.lock:
                status.last_commit_attempt = time.time()
                status.nb_workers_commit -= 1

        elif job == WorkerJob.WAIT:
            time.sleep(WAITING_TIME_IF_NO_TASKS)
            with status.lock:
                status.nb_workers_waiting -= 1


def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob, List[JOB_ITEM_T]]]:
    with status.lock:
        # 1. Commit if more than 5 minutes since last commit attempt (and at least 1 file)
        if (
            status.nb_workers_commit == 0
            and status.queue_commit.qsize() > 0
            and status.last_commit_attempt is not None
            and time.time() - status.last_commit_attempt > 5 * 60
        ):
            status.nb_workers_commit += 1
            logger.debug("Job: commit (more than 5 minutes since last commit attempt)")
            return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))

        # 2. Commit if at least 150 files are ready to commit
        elif status.nb_workers_commit == 0 and status.queue_commit.qsize() >= 150:
            status.nb_workers_commit += 1
            logger.debug("Job: commit (>=150 files ready)")
            return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))

        # 3. Get upload mode if at least `MAX_NB_FILES_FETCH_UPLOAD_MODE` files
        elif status.queue_get_upload_mode.qsize() >= MAX_NB_FILES_FETCH_UPLOAD_MODE:
            status.nb_workers_get_upload_mode += 1
            logger.debug(f"Job: get upload mode (>={MAX_NB_FILES_FETCH_UPLOAD_MODE} files ready)")
            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE))

        # 4. Preupload LFS file if at least `status.upload_batch_size` files and no worker is preuploading LFS
        elif status.queue_preupload_lfs.qsize() >= status.upload_batch_size and status.nb_workers_preupload_lfs == 0:
            status.nb_workers_preupload_lfs += 1
            logger.debug("Job: preupload LFS (no other worker preuploading LFS)")
            return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, status.upload_batch_size))

        # 5. Compute sha256 if at least 1 file and no worker is computing sha256
        elif status.queue_sha256.qsize() > 0 and status.nb_workers_sha256 == 0:
            status.nb_workers_sha256 += 1
            logger.debug("Job: sha256 (no other worker computing sha256)")
            return (WorkerJob.SHA256, _get_one(status.queue_sha256))

        # 6. Get upload mode if at least 1 file and no worker is getting upload mode
        elif status.queue_get_upload_mode.qsize() > 0 and status.nb_workers_get_upload_mode == 0:
            status.nb_workers_get_upload_mode += 1
            logger.debug("Job: get upload mode (no other worker getting upload mode)")
            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE))

        # 7. Preupload LFS file if at least `status.upload_batch_size` files
        #    Skip if hf_transfer is enabled and there is already a worker preuploading LFS
        elif status.queue_preupload_lfs.qsize() >= status.upload_batch_size and (
            status.nb_workers_preupload_lfs == 0 or not constants.HF_HUB_ENABLE_HF_TRANSFER
        ):
            status.nb_workers_preupload_lfs += 1
            logger.debug("Job: preupload LFS")
            return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, status.upload_batch_size))

        # 8. Compute sha256 if at least 1 file
        elif status.queue_sha256.qsize() > 0:
            status.nb_workers_sha256 += 1
            logger.debug("Job: sha256")
            return (WorkerJob.SHA256, _get_one(status.queue_sha256))

        # 9. Get upload mode if at least 1 file
        elif status.queue_get_upload_mode.qsize() > 0:
            status.nb_workers_get_upload_mode += 1
            logger.debug("Job: get upload mode")
            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE))

        # 10. Preupload LFS file if at least 1 file
        elif status.queue_preupload_lfs.qsize() > 0:
            status.nb_workers_preupload_lfs += 1
            logger.debug("Job: preupload LFS")
            return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, status.upload_batch_size))

        # 11. Commit if at least 1 file and 1 min since last commit attempt
        elif (
            status.nb_workers_commit == 0
            and status.queue_commit.qsize() > 0
            and status.last_commit_attempt is not None
            and time.time() - status.last_commit_attempt > 1 * 60
        ):
            status.nb_workers_commit += 1
            logger.debug("Job: commit (1 min since last commit attempt)")
            return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))

        # 12. Commit if at least 1 file, all other queues are empty and all workers are waiting
        #     e.g. when it's the last commit
        elif (
            status.nb_workers_commit == 0
            and status.queue_commit.qsize() > 0
            and status.queue_sha256.qsize() == 0
            and status.queue_get_upload_mode.qsize() == 0
            and status.queue_preupload_lfs.qsize() == 0
            and status.nb_workers_sha256 == 0
            and status.nb_workers_get_upload_mode == 0
            and status.nb_workers_preupload_lfs == 0
        ):
            status.nb_workers_commit += 1
            logger.debug("Job: commit")
            return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))

        # 13. If all files have been processed, exit
        elif all(metadata.is_committed or metadata.should_ignore for _, metadata in status.items):
            logger.info("All files have been processed! Exiting worker.")
            return None

        # 14. If no task is available, wait
        else:
            status.nb_workers_waiting += 1
            logger.debug(f"No task available, waiting... ({WAITING_TIME_IF_NO_TASKS}s)")
            return (WorkerJob.WAIT, [])


####################
# Atomic jobs (sha256, get_upload_mode, preupload_lfs, commit)
####################


def _compute_sha256(item: JOB_ITEM_T) -> None:
    """Compute sha256 of a file and save it in metadata."""
    paths, metadata = item
    if metadata.sha256 is None:
        with paths.file_path.open("rb") as f:
            metadata.sha256 = sha_fileobj(f).hex()
    metadata.save(paths)

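# --- Editor's illustrative sketch (not part of the library) -------------------
# Assumption: `sha_fileobj` (imported earlier in this module) hashes a file
# object in chunks, roughly like the helper below, so arbitrarily large files
# are hashed with constant memory before being registered for upload.
def _sha256_of_fileobj_sketch(fileobj, chunk_size: int = 1024 * 1024) -> bytes:
    import hashlib

    sha = hashlib.sha256()
    while True:
        chunk = fileobj.read(chunk_size)
        if not chunk:
            return sha.digest()  # raw digest; the caller above applies `.hex()`
        sha.update(chunk)
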
def _get_upload_mode(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
    """Get upload mode for each file and update metadata.

    Also receives info about whether the file should be ignored.
    """
    additions = [_build_hacky_operation(item) for item in items]
    _fetch_upload_modes(
        additions=additions,
        repo_type=repo_type,
        repo_id=repo_id,
        headers=api._build_hf_headers(),
        revision=quote(revision, safe=""),
        endpoint=api.endpoint,
    )
    for item, addition in zip(items, additions):
        paths, metadata = item
        metadata.upload_mode = addition._upload_mode
        metadata.should_ignore = addition._should_ignore
        metadata.remote_oid = addition._remote_oid
        metadata.save(paths)


def _preupload_lfs(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
    """Preupload LFS files and update metadata."""
    additions = [_build_hacky_operation(item) for item in items]
    api.preupload_lfs_files(
        repo_id=repo_id,
        repo_type=repo_type,
        revision=revision,
        additions=additions,
    )

    for paths, metadata in items:
        metadata.is_uploaded = True
        metadata.save(paths)


def _commit(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
    """Commit files to the repo."""
    additions = [_build_hacky_operation(item) for item in items]
    api.create_commit(
        repo_id=repo_id,
        repo_type=repo_type,
        revision=revision,
        operations=additions,
        commit_message="Add files using upload-large-folder tool",
    )
    for paths, metadata in items:
        metadata.is_committed = True
        metadata.save(paths)


####################
# Hacks with CommitOperationAdd to bypass checks/sha256 calculation
####################


class HackyCommitOperationAdd(CommitOperationAdd):
    def __post_init__(self) -> None:
        if isinstance(self.path_or_fileobj, Path):
            self.path_or_fileobj = str(self.path_or_fileobj)


def _build_hacky_operation(item: JOB_ITEM_T) -> HackyCommitOperationAdd:
    paths, metadata = item
    operation = HackyCommitOperationAdd(path_in_repo=paths.path_in_repo, path_or_fileobj=paths.file_path)
    with paths.file_path.open("rb") as file:
        sample = file.peek(512)[:512]
    if metadata.sha256 is None:
        raise ValueError("sha256 must have been computed by now!")
    operation.upload_info = UploadInfo(sha256=bytes.fromhex(metadata.sha256), size=metadata.size, sample=sample)
    operation._upload_mode = metadata.upload_mode  # type: ignore[assignment]
    operation._should_ignore = metadata.should_ignore
    operation._remote_oid = metadata.remote_oid
    return operation

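# Editor's note on `_build_hacky_operation`: `BufferedReader.peek(512)` may
# return more or fewer bytes than requested (it performs at most one read on
# the raw stream), hence the defensive `[:512]` slice. The sample bytes feed
# `UploadInfo`, which the upload-mode resolution relies on.
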
####################
# Misc helpers
####################


def _get_one(queue: "queue.Queue[JOB_ITEM_T]") -> List[JOB_ITEM_T]:
    return [queue.get()]


def _get_n(queue: "queue.Queue[JOB_ITEM_T]", n: int) -> List[JOB_ITEM_T]:
    return [queue.get() for _ in range(min(queue.qsize(), n))]

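# Editor's note: `Queue.qsize()` is only a snapshot, but items are only ever
# *removed* from these queues via `_get_one`/`_get_n`, which are called from
# `_determine_next_job` while `status.lock` is held. Concurrent `put()` calls
# can only grow a queue after the snapshot, so the `min(queue.qsize(), n)`
# bound never blocks on an empty queue here.
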
def _print_overwrite(report: str) -> None:
    """Print a report, overwriting the previous lines.

    Since tqdm is using `sys.stderr` to (re-)write progress bars, we need to use `sys.stdout`
    to print the report.

    Note: works well only if no other process is writing to `sys.stdout`!
    """
    report += "\n"
    # Get terminal width
    terminal_width = shutil.get_terminal_size().columns

    # Count number of lines that should be cleared
    nb_lines = sum(len(line) // terminal_width + 1 for line in report.splitlines())

    # Clear previous lines based on the number of lines in the report
    for _ in range(nb_lines):
        sys.stdout.write("\r\033[K")  # Clear line
        sys.stdout.write("\033[F")  # Move cursor up one line

    # Print the new report, filling remaining space with whitespace
    sys.stdout.write(report)
    sys.stdout.write(" " * (terminal_width - len(report.splitlines()[-1])))
    sys.stdout.flush()
venv/lib/python3.13/site-packages/huggingface_hub/_webhooks_payload.py
ADDED
@@ -0,0 +1,137 @@
# coding=utf-8
# Copyright 2023-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains data structures to parse the webhooks payload."""

from typing import List, Literal, Optional

from .utils import is_pydantic_available


if is_pydantic_available():
    from pydantic import BaseModel
else:
    # Define a dummy BaseModel to avoid import errors when pydantic is not installed
    # Import error will be raised when trying to use the class

    class BaseModel:  # type: ignore [no-redef]
        def __init__(self, *args, **kwargs) -> None:
            raise ImportError(
                "You must have `pydantic` installed to use `WebhookPayload`. This is an optional dependency that"
                " should be installed separately. Please run `pip install --upgrade pydantic` and retry."
            )


# This is an adaptation of the ReportV3 interface implemented in moon-landing. V0, V1 and V2 have been ignored as they
# are not in use anymore. To keep in sync when format is updated in
# https://github.com/huggingface/moon-landing/blob/main/server/lib/HFWebhooks.ts (internal link).


WebhookEvent_T = Literal[
    "create",
    "delete",
    "move",
    "update",
]
RepoChangeEvent_T = Literal[
    "add",
    "move",
    "remove",
    "update",
]
RepoType_T = Literal[
    "dataset",
    "model",
    "space",
]
DiscussionStatus_T = Literal[
    "closed",
    "draft",
    "open",
    "merged",
]
SupportedWebhookVersion = Literal[3]


class ObjectId(BaseModel):
    id: str


class WebhookPayloadUrl(BaseModel):
    web: str
    api: Optional[str] = None


class WebhookPayloadMovedTo(BaseModel):
    name: str
    owner: ObjectId


class WebhookPayloadWebhook(ObjectId):
    version: SupportedWebhookVersion


class WebhookPayloadEvent(BaseModel):
    action: WebhookEvent_T
    scope: str


class WebhookPayloadDiscussionChanges(BaseModel):
    base: str
    mergeCommitId: Optional[str] = None


class WebhookPayloadComment(ObjectId):
    author: ObjectId
    hidden: bool
    content: Optional[str] = None
    url: WebhookPayloadUrl


class WebhookPayloadDiscussion(ObjectId):
    num: int
    author: ObjectId
    url: WebhookPayloadUrl
    title: str
    isPullRequest: bool
    status: DiscussionStatus_T
    changes: Optional[WebhookPayloadDiscussionChanges] = None
    pinned: Optional[bool] = None


class WebhookPayloadRepo(ObjectId):
    owner: ObjectId
    head_sha: Optional[str] = None
    name: str
    private: bool
    subdomain: Optional[str] = None
    tags: Optional[List[str]] = None
    type: Literal["dataset", "model", "space"]
    url: WebhookPayloadUrl


class WebhookPayloadUpdatedRef(BaseModel):
    ref: str
    oldSha: Optional[str] = None
    newSha: Optional[str] = None


class WebhookPayload(BaseModel):
    event: WebhookPayloadEvent
    repo: WebhookPayloadRepo
    discussion: Optional[WebhookPayloadDiscussion] = None
    comment: Optional[WebhookPayloadComment] = None
    webhook: WebhookPayloadWebhook
    movedTo: Optional[WebhookPayloadMovedTo] = None
    updatedRefs: Optional[List[WebhookPayloadUpdatedRef]] = None
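# --- Editor's illustrative sketch (not part of the library) -------------------
# Parsing a minimal, made-up webhook body into the models above. With pydantic
# v2 use `model_validate`; on v1, `parse_obj` is the equivalent.
def _example_parse_payload() -> WebhookPayload:
    data = {
        "event": {"action": "update", "scope": "repo.content"},  # made-up scope
        "repo": {
            "id": "abc123",  # placeholder ids
            "owner": {"id": "def456"},
            "name": "username/my-model",
            "private": False,
            "type": "model",
            "url": {"web": "https://huggingface.co/username/my-model"},
        },
        "webhook": {"id": "xyz789", "version": 3},
    }
    return WebhookPayload.model_validate(data)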
venv/lib/python3.13/site-packages/huggingface_hub/_webhooks_server.py
ADDED
@@ -0,0 +1,376 @@
# coding=utf-8
# Copyright 2023-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains `WebhooksServer` and `webhook_endpoint` to create a webhook server easily."""

import atexit
import inspect
import os
from functools import wraps
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional

from .utils import experimental, is_fastapi_available, is_gradio_available


if TYPE_CHECKING:
    import gradio as gr
    from fastapi import Request

if is_fastapi_available():
    from fastapi import FastAPI, Request
    from fastapi.responses import JSONResponse
else:
    # Will fail at runtime if FastAPI is not available
    FastAPI = Request = JSONResponse = None  # type: ignore


_global_app: Optional["WebhooksServer"] = None
_is_local = os.environ.get("SPACE_ID") is None


@experimental
class WebhooksServer:
    """
    The [`WebhooksServer`] class lets you create an instance of a Gradio app that can receive Huggingface webhooks.
    These webhooks can be registered using the [`~WebhooksServer.add_webhook`] decorator. Webhook endpoints are added
    to the app as POST endpoints on the FastAPI router. Once all the webhooks are registered, the `launch` method has
    to be called to start the app.

    It is recommended to accept [`WebhookPayload`] as the first argument of the webhook function. It is a Pydantic
    model that contains all the information about the webhook event. The data will be parsed automatically for you.

    Check out the [webhooks guide](../guides/webhooks_server) for a step-by-step tutorial on how to set up your
    WebhooksServer and deploy it on a Space.

    > [!WARNING]
    > `WebhooksServer` is experimental. Its API is subject to change in the future.

    > [!WARNING]
    > You must have `gradio` installed to use `WebhooksServer` (`pip install --upgrade gradio`).

    Args:
        ui (`gradio.Blocks`, optional):
            A Gradio UI instance to be used as the Space landing page. If `None`, a UI displaying instructions
            about the configured webhooks is created.
        webhook_secret (`str`, optional):
            A secret key to verify incoming webhook requests. You can set this value to any secret you want as long as
            you also configure it in your [webhooks settings panel](https://huggingface.co/settings/webhooks). You
            can also set this value as the `WEBHOOK_SECRET` environment variable. If no secret is provided, the
            webhook endpoints are opened without any security.

    Example:

    ```python
    import gradio as gr
    from huggingface_hub import WebhooksServer, WebhookPayload

    with gr.Blocks() as ui:
        ...

    app = WebhooksServer(ui=ui, webhook_secret="my_secret_key")

    @app.add_webhook("/say_hello")
    async def hello(payload: WebhookPayload):
        return {"message": "hello"}

    app.launch()
    ```
    """

    def __new__(cls, *args, **kwargs) -> "WebhooksServer":
        if not is_gradio_available():
            raise ImportError(
                "You must have `gradio` installed to use `WebhooksServer`. Please run `pip install --upgrade gradio`"
                " first."
            )
        if not is_fastapi_available():
            raise ImportError(
                "You must have `fastapi` installed to use `WebhooksServer`. Please run `pip install --upgrade fastapi`"
                " first."
            )
        return super().__new__(cls)

    def __init__(
        self,
        ui: Optional["gr.Blocks"] = None,
        webhook_secret: Optional[str] = None,
    ) -> None:
        self._ui = ui

        self.webhook_secret = webhook_secret or os.getenv("WEBHOOK_SECRET")
        self.registered_webhooks: Dict[str, Callable] = {}
        _warn_on_empty_secret(self.webhook_secret)

    def add_webhook(self, path: Optional[str] = None) -> Callable:
        """
        Decorator to add a webhook to the [`WebhooksServer`] server.

        Args:
            path (`str`, optional):
                The URL path to register the webhook function. If not provided, the function name will be used as the
                path. In any case, all webhooks are registered under `/webhooks`.

        Raises:
            ValueError: If the provided path is already registered as a webhook.

        Example:
        ```python
        from huggingface_hub import WebhooksServer, WebhookPayload

        app = WebhooksServer()

        @app.add_webhook
        async def trigger_training(payload: WebhookPayload):
            if payload.repo.type == "dataset" and payload.event.action == "update":
                # Trigger a training job if a dataset is updated
                ...

        app.launch()
        ```
        """
        # Usage: directly as decorator. Example: `@app.add_webhook`
        if callable(path):
            # If path is a function, it means it was used as a decorator without arguments
            return self.add_webhook()(path)

        # Usage: provide a path. Example: `@app.add_webhook(...)`
        @wraps(FastAPI.post)
        def _inner_post(*args, **kwargs):
            func = args[0]
            abs_path = f"/webhooks/{(path or func.__name__).strip('/')}"
            if abs_path in self.registered_webhooks:
                raise ValueError(f"Webhook {abs_path} already exists.")
            self.registered_webhooks[abs_path] = func
            return func  # keep the decorated function bound to its name

        return _inner_post

    def launch(self, prevent_thread_lock: bool = False, **launch_kwargs: Any) -> None:
        """Launch the Gradio app and register webhooks to the underlying FastAPI server.

        Input parameters are forwarded to Gradio when launching the app.
        """
        ui = self._ui or self._get_default_ui()

        # Start Gradio App
        #   - as non-blocking so that webhooks can be added afterwards
        #   - as shared if launched locally (to debug webhooks)
        launch_kwargs.setdefault("share", _is_local)
        self.fastapi_app, _, _ = ui.launch(prevent_thread_lock=True, **launch_kwargs)

        # Register webhooks to FastAPI app
        for path, func in self.registered_webhooks.items():
            # Add secret check if required
            if self.webhook_secret is not None:
                func = _wrap_webhook_to_check_secret(func, webhook_secret=self.webhook_secret)

            # Add route to FastAPI app
            self.fastapi_app.post(path)(func)

        # Print instructions and block main thread
        space_host = os.environ.get("SPACE_HOST")
        url = "https://" + space_host if space_host is not None else (ui.share_url or ui.local_url)
        if url is None:
            raise ValueError("Cannot find the URL of the app. Please provide a valid `ui` or update `gradio` version.")
        url = url.strip("/")
        message = "\nWebhooks are correctly set up and ready to use:"
        message += "\n" + "\n".join(f"  - POST {url}{webhook}" for webhook in self.registered_webhooks)
        message += "\nGo to https://huggingface.co/settings/webhooks to set up your webhooks."
        print(message)

        if not prevent_thread_lock:
            ui.block_thread()

    def _get_default_ui(self) -> "gr.Blocks":
        """Default UI if not provided (lists webhooks and provides basic instructions)."""
        import gradio as gr

        with gr.Blocks() as ui:
            gr.Markdown("# This is an app to process 🤗 Webhooks")
            gr.Markdown(
                "Webhooks are a foundation for MLOps-related features. They allow you to listen for new changes on"
                " specific repos or to all repos belonging to a particular set of users/organizations (not just your"
                " repos, but any repo). Check out this [guide](https://huggingface.co/docs/hub/webhooks) to get to"
                " know more about webhooks on the Huggingface Hub."
            )
            gr.Markdown(
                f"{len(self.registered_webhooks)} webhook(s) are registered:"
                + "\n\n"
                + "\n ".join(
                    f"- [{webhook_path}]({_get_webhook_doc_url(webhook.__name__, webhook_path)})"
                    for webhook_path, webhook in self.registered_webhooks.items()
                )
            )
            gr.Markdown(
                "Go to https://huggingface.co/settings/webhooks to set up your webhooks."
                + (
                    "\nYour app is running locally. Please look at the logs to check the full URL you need to set."
                    if _is_local
                    else (
                        "\nThis app is running on a Space. You can find the corresponding URL in the options menu"
                        " (top-right) > 'Embed the Space'. The URL looks like 'https://{username}-{repo_name}.hf.space'."
                    )
                )
            )
        return ui

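# --- Editor's illustrative sketch (not part of the library) -------------------
# `launch()` blocks the main thread by default; in a notebook or a test you can
# keep control of the thread with the `prevent_thread_lock` flag shown above.
# Path and secret are placeholders.
def _example_launch_without_blocking() -> "WebhooksServer":
    app = WebhooksServer(webhook_secret="my_secret_key")

    @app.add_webhook("/ping")
    async def ping():
        return {"pong": True}

    app.launch(prevent_thread_lock=True)  # returns immediately
    return app
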
@experimental
def webhook_endpoint(path: Optional[str] = None) -> Callable:
    """Decorator to start a [`WebhooksServer`] and register the decorated function as a webhook endpoint.

    This is a helper to get started quickly. If you need more flexibility (custom landing page or webhook secret),
    you can use [`WebhooksServer`] directly. You can register multiple webhook endpoints (to the same server) by using
    this decorator multiple times.

    Check out the [webhooks guide](../guides/webhooks_server) for a step-by-step tutorial on how to set up your
    server and deploy it on a Space.

    > [!WARNING]
    > `webhook_endpoint` is experimental. Its API is subject to change in the future.

    > [!WARNING]
    > You must have `gradio` installed to use `webhook_endpoint` (`pip install --upgrade gradio`).

    Args:
        path (`str`, optional):
            The URL path to register the webhook function. If not provided, the function name will be used as the path.
            In any case, all webhooks are registered under `/webhooks`.

    Examples:

    The default usage is to register a function as a webhook endpoint. The function name will be used as the path.
    The server will be started automatically at exit (i.e. at the end of the script).

    ```python
    from huggingface_hub import webhook_endpoint, WebhookPayload

    @webhook_endpoint
    async def trigger_training(payload: WebhookPayload):
        if payload.repo.type == "dataset" and payload.event.action == "update":
            # Trigger a training job if a dataset is updated
            ...

    # Server is automatically started at the end of the script.
    ```

    Advanced usage: register a function as a webhook endpoint and start the server manually. This is useful if you
    are running it in a notebook.

    ```python
    from huggingface_hub import webhook_endpoint, WebhookPayload

    @webhook_endpoint
    async def trigger_training(payload: WebhookPayload):
        if payload.repo.type == "dataset" and payload.event.action == "update":
            # Trigger a training job if a dataset is updated
            ...

    # Start the server manually
    trigger_training.launch()
    ```
    """
    if callable(path):
        # If path is a function, it means it was used as a decorator without arguments
        return webhook_endpoint()(path)

    @wraps(WebhooksServer.add_webhook)
    def _inner(func: Callable) -> Callable:
        app = _get_global_app()
        app.add_webhook(path)(func)
        if len(app.registered_webhooks) == 1:
            # Register `app.launch` to run at exit (only once)
            atexit.register(app.launch)

        @wraps(app.launch)
        def _launch_now():
            # Run the app directly (without waiting atexit)
            atexit.unregister(app.launch)
            app.launch()

        func.launch = _launch_now  # type: ignore
        return func

    return _inner


def _get_global_app() -> WebhooksServer:
    global _global_app
    if _global_app is None:
        _global_app = WebhooksServer()
    return _global_app


def _warn_on_empty_secret(webhook_secret: Optional[str]) -> None:
    if webhook_secret is None:
        print("Webhook secret is not defined. This means your webhook endpoints will be open to everyone.")
        print(
            "To add a secret, set `WEBHOOK_SECRET` as environment variable or pass it at initialization: "
            "\n\t`app = WebhooksServer(webhook_secret='my_secret', ...)`"
        )
        print(
            "For more details about webhook secrets, please refer to"
            " https://huggingface.co/docs/hub/webhooks#webhook-secret."
        )
    else:
        print("Webhook secret is correctly defined.")


def _get_webhook_doc_url(webhook_name: str, webhook_path: str) -> str:
    """Returns the anchor to a given webhook in the docs (experimental)"""
    return "/docs#/default/" + webhook_name + webhook_path.replace("/", "_") + "_post"


def _wrap_webhook_to_check_secret(func: Callable, webhook_secret: str) -> Callable:
    """Wraps a webhook function to check the webhook secret before calling the function.

    This is a hacky way to add the `request` parameter to the function signature. Since FastAPI relies on route
    parameters to inject values into the function, we need to hack the function signature to retrieve the `Request`
    object (and hence the headers). A far cleaner solution would be to use a middleware. However, since
    `fastapi==0.90.1`, a middleware cannot be added once the app has started. And since the FastAPI app is started by
    Gradio internals (and not by us), we cannot add a middleware.

    This method is called only when a secret has been defined by the user. If a request is sent without the
    "x-webhook-secret" header, the function will return a 401 error (unauthorized). If the header is sent but is
    incorrect, the function will return a 403 error (forbidden).

    Inspired by https://stackoverflow.com/a/33112180.
    """
    initial_sig = inspect.signature(func)

    @wraps(func)
    async def _protected_func(request: Request, **kwargs):
        request_secret = request.headers.get("x-webhook-secret")
        if request_secret is None:
            return JSONResponse({"error": "x-webhook-secret header not set."}, status_code=401)
        if request_secret != webhook_secret:
            return JSONResponse({"error": "Invalid webhook secret."}, status_code=403)

        # Inject `request` in kwargs if required
        if "request" in initial_sig.parameters:
            kwargs["request"] = request

        # Handle both sync and async routes
        if inspect.iscoroutinefunction(func):
            return await func(**kwargs)
        else:
            return func(**kwargs)

    # Update signature to include request
    if "request" not in initial_sig.parameters:
        _protected_func.__signature__ = initial_sig.replace(  # type: ignore
            parameters=(
                inspect.Parameter(name="request", kind=inspect.Parameter.POSITIONAL_OR_KEYWORD, annotation=Request),
            )
            + tuple(initial_sig.parameters.values())
        )

    # Return protected route
    return _protected_func
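# --- Editor's illustrative sketch (not part of the library) -------------------
# Exercising a secret-protected endpoint from the outside. URL and secret are
# placeholders; the header name must match the "x-webhook-secret" check above
# (401 if the header is missing, 403 if it does not match).
def _example_call_protected_endpoint() -> None:
    import requests

    response = requests.post(
        "https://username-my-space.hf.space/webhooks/say_hello",  # placeholder URL
        json={},
        headers={"x-webhook-secret": "my_secret_key"},
    )
    print(response.status_code, response.json())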
venv/lib/python3.13/site-packages/huggingface_hub/community.py
ADDED
@@ -0,0 +1,363 @@
"""
Data structures to interact with Discussions and Pull Requests on the Hub.

See [the Discussions and Pull Requests guide](https://huggingface.co/docs/hub/repositories-pull-requests-discussions)
for more information on Pull Requests, Discussions, and the community tab.
"""

from dataclasses import dataclass
from datetime import datetime
from typing import List, Literal, Optional, TypedDict, Union

from . import constants
from .utils import parse_datetime


DiscussionStatus = Literal["open", "closed", "merged", "draft"]


@dataclass
class Discussion:
    """
    A Discussion or Pull Request on the Hub.

    This dataclass is not intended to be instantiated directly.

    Attributes:
        title (`str`):
            The title of the Discussion / Pull Request
        status (`str`):
            The status of the Discussion / Pull Request.
            It must be one of:
                * `"open"`
                * `"closed"`
                * `"merged"` (only for Pull Requests)
                * `"draft"` (only for Pull Requests)
        num (`int`):
            The number of the Discussion / Pull Request.
        repo_id (`str`):
            The id (`"{namespace}/{repo_name}"`) of the repo on which
            the Discussion / Pull Request was opened.
        repo_type (`str`):
            The type of the repo on which the Discussion / Pull Request was opened.
            Possible values are: `"model"`, `"dataset"`, `"space"`.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        is_pull_request (`bool`):
            Whether or not this is a Pull Request.
        created_at (`datetime`):
            The `datetime` of creation of the Discussion / Pull Request.
        endpoint (`str`):
            Endpoint of the Hub. Default is https://huggingface.co.
        git_reference (`str`, *optional*):
            (property) Git reference to which changes can be pushed if this is a Pull Request, `None` otherwise.
        url (`str`):
            (property) URL of the discussion on the Hub.
    """

    title: str
    status: DiscussionStatus
    num: int
    repo_id: str
    repo_type: str
    author: str
    is_pull_request: bool
    created_at: datetime
    endpoint: str

    @property
    def git_reference(self) -> Optional[str]:
        """
        If this is a Pull Request, returns the git reference to which changes can be pushed.
        Returns `None` otherwise.
        """
        if self.is_pull_request:
            return f"refs/pr/{self.num}"
        return None

    @property
    def url(self) -> str:
        """Returns the URL of the discussion on the Hub."""
        if self.repo_type is None or self.repo_type == constants.REPO_TYPE_MODEL:
            return f"{self.endpoint}/{self.repo_id}/discussions/{self.num}"
        return f"{self.endpoint}/{self.repo_type}s/{self.repo_id}/discussions/{self.num}"

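# --- Editor's illustrative sketch (not part of the library) -------------------
# How `git_reference` and `url` behave for a pull request opened on a dataset
# repo. All field values are placeholders.
def _example_discussion_properties() -> None:
    from datetime import timezone

    pr = Discussion(
        title="Fix typos",
        status="open",
        num=42,
        repo_id="username/my-dataset",
        repo_type="dataset",
        author="username",
        is_pull_request=True,
        created_at=datetime.now(timezone.utc),
        endpoint="https://huggingface.co",
    )
    assert pr.git_reference == "refs/pr/42"
    assert pr.url == "https://huggingface.co/datasets/username/my-dataset/discussions/42"
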
@dataclass
|
| 88 |
+
class DiscussionWithDetails(Discussion):
|
| 89 |
+
"""
|
| 90 |
+
Subclass of [`Discussion`].
|
| 91 |
+
|
| 92 |
+
Attributes:
|
| 93 |
+
title (`str`):
|
| 94 |
+
The title of the Discussion / Pull Request
|
| 95 |
+
status (`str`):
|
| 96 |
+
The status of the Discussion / Pull Request.
|
| 97 |
+
It can be one of:
|
| 98 |
+
* `"open"`
|
| 99 |
+
* `"closed"`
|
| 100 |
+
* `"merged"` (only for Pull Requests )
|
| 101 |
+
* `"draft"` (only for Pull Requests )
|
| 102 |
+
num (`int`):
|
| 103 |
+
The number of the Discussion / Pull Request.
|
| 104 |
+
repo_id (`str`):
|
| 105 |
+
The id (`"{namespace}/{repo_name}"`) of the repo on which
|
| 106 |
+
the Discussion / Pull Request was open.
|
| 107 |
+
repo_type (`str`):
|
| 108 |
+
The type of the repo on which the Discussion / Pull Request was open.
|
| 109 |
+
Possible values are: `"model"`, `"dataset"`, `"space"`.
|
| 110 |
+
author (`str`):
|
| 111 |
+
The username of the Discussion / Pull Request author.
|
| 112 |
+
Can be `"deleted"` if the user has been deleted since.
|
| 113 |
+
is_pull_request (`bool`):
|
| 114 |
+
Whether or not this is a Pull Request.
|
| 115 |
+
created_at (`datetime`):
|
| 116 |
+
The `datetime` of creation of the Discussion / Pull Request.
|
| 117 |
+
events (`list` of [`DiscussionEvent`])
|
| 118 |
+
The list of [`DiscussionEvents`] in this Discussion or Pull Request.
|
| 119 |
+
conflicting_files (`Union[List[str], bool, None]`, *optional*):
|
| 120 |
+
A list of conflicting files if this is a Pull Request.
|
| 121 |
+
`None` if `self.is_pull_request` is `False`.
|
| 122 |
+
`True` if there are conflicting files but the list can't be retrieved.
|
| 123 |
+
target_branch (`str`, *optional*):
|
| 124 |
+
The branch into which changes are to be merged if this is a
|
| 125 |
+
Pull Request . `None` if `self.is_pull_request` is `False`.
|
| 126 |
+
merge_commit_oid (`str`, *optional*):
|
| 127 |
+
If this is a merged Pull Request , this is set to the OID / SHA of
|
| 128 |
+
the merge commit, `None` otherwise.
|
| 129 |
+
diff (`str`, *optional*):
|
| 130 |
+
The git diff if this is a Pull Request , `None` otherwise.
|
| 131 |
+
endpoint (`str`):
|
| 132 |
+
Endpoint of the Hub. Default is https://huggingface.co.
|
| 133 |
+
git_reference (`str`, *optional*):
|
| 134 |
+
(property) Git reference to which changes can be pushed if this is a Pull Request, `None` otherwise.
|
| 135 |
+
url (`str`):
|
| 136 |
+
(property) URL of the discussion on the Hub.
|
| 137 |
+
"""
|
| 138 |
+
|
| 139 |
+
events: List["DiscussionEvent"]
|
| 140 |
+
conflicting_files: Union[List[str], bool, None]
|
| 141 |
+
target_branch: Optional[str]
|
| 142 |
+
merge_commit_oid: Optional[str]
|
| 143 |
+
diff: Optional[str]
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class DiscussionEventArgs(TypedDict):
|
| 147 |
+
id: str
|
| 148 |
+
type: str
|
| 149 |
+
created_at: datetime
|
| 150 |
+
author: str
|
| 151 |
+
_event: dict
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
@dataclass
|
| 155 |
+
class DiscussionEvent:
|
| 156 |
+
"""
|
| 157 |
+
An event in a Discussion or Pull Request.
|
| 158 |
+
|
| 159 |
+
Use concrete classes:
|
| 160 |
+
* [`DiscussionComment`]
|
| 161 |
+
* [`DiscussionStatusChange`]
|
| 162 |
+
* [`DiscussionCommit`]
|
| 163 |
+
* [`DiscussionTitleChange`]
|
| 164 |
+
|
| 165 |
+
Attributes:
|
| 166 |
+
id (`str`):
|
| 167 |
+
The ID of the event. An hexadecimal string.
|
| 168 |
+
type (`str`):
|
| 169 |
+
The type of the event.
|
| 170 |
+
created_at (`datetime`):
|
| 171 |
+
A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
|
| 172 |
+
object holding the creation timestamp for the event.
|
| 173 |
+
author (`str`):
|
| 174 |
+
The username of the Discussion / Pull Request author.
|
| 175 |
+
Can be `"deleted"` if the user has been deleted since.
|
| 176 |
+
"""
|
| 177 |
+
|
| 178 |
+
id: str
|
| 179 |
+
type: str
|
| 180 |
+
created_at: datetime
|
| 181 |
+
author: str
|
| 182 |
+
|
| 183 |
+
_event: dict
|
| 184 |
+
"""Stores the original event data, in case we need to access it later."""
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
@dataclass
class DiscussionComment(DiscussionEvent):
    """A comment in a Discussion / Pull Request.

    Subclass of [`DiscussionEvent`].

    Attributes:
        id (`str`):
            The ID of the event. A hexadecimal string.
        type (`str`):
            The type of the event.
        created_at (`datetime`):
            A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
            object holding the creation timestamp for the event.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        content (`str`):
            The raw markdown content of the comment. Mentions, links and images are not rendered.
        edited (`bool`):
            Whether or not this comment has been edited.
        hidden (`bool`):
            Whether or not this comment has been hidden.
    """

    content: str
    edited: bool
    hidden: bool

    @property
    def rendered(self) -> str:
        """The rendered comment, as an HTML string."""
        return self._event["data"]["latest"]["html"]

    @property
    def last_edited_at(self) -> datetime:
        """The last edit time, as a `datetime` object."""
        return parse_datetime(self._event["data"]["latest"]["updatedAt"])

    @property
    def last_edited_by(self) -> str:
        """The username of the last editor. Can be `"deleted"` if the user has been deleted since."""
        return self._event["data"]["latest"].get("author", {}).get("name", "deleted")

    @property
    def edit_history(self) -> List[dict]:
        """The edit history of the comment."""
        return self._event["data"]["history"]

    @property
    def number_of_edits(self) -> int:
        return len(self.edit_history)


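A minimal sketch of how the properties above read the raw `_event` payload. The payload shape below is a hypothetical illustration built only from the keys the property implementations actually access, not a documented Hub API response:

```py
from datetime import datetime, timezone

comment = DiscussionComment(
    id="0123abcd",  # hypothetical hexadecimal ID
    type="comment",
    created_at=datetime(2024, 1, 1, tzinfo=timezone.utc),
    author="alice",
    _event={
        "data": {
            "edited": True,
            "hidden": False,
            "latest": {"raw": "Hi!", "html": "<p>Hi!</p>", "author": {"name": "bob"}},
            "history": [{}, {}],
        }
    },
    content="Hi!",
    edited=True,
    hidden=False,
)
comment.rendered         # "<p>Hi!</p>"
comment.last_edited_by   # "bob"
comment.number_of_edits  # 2
```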
@dataclass
class DiscussionStatusChange(DiscussionEvent):
    """A change of status in a Discussion / Pull Request.

    Subclass of [`DiscussionEvent`].

    Attributes:
        id (`str`):
            The ID of the event. A hexadecimal string.
        type (`str`):
            The type of the event.
        created_at (`datetime`):
            A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
            object holding the creation timestamp for the event.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        new_status (`str`):
            The status of the Discussion / Pull Request after the change.
            It can be one of:
                * `"open"`
                * `"closed"`
                * `"merged"` (only for Pull Requests)
    """

    new_status: str


@dataclass
class DiscussionCommit(DiscussionEvent):
    """A commit in a Pull Request.

    Subclass of [`DiscussionEvent`].

    Attributes:
        id (`str`):
            The ID of the event. A hexadecimal string.
        type (`str`):
            The type of the event.
        created_at (`datetime`):
            A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
            object holding the creation timestamp for the event.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        summary (`str`):
            The summary of the commit.
        oid (`str`):
            The OID / SHA of the commit, as a hexadecimal string.
    """

    summary: str
    oid: str


@dataclass
class DiscussionTitleChange(DiscussionEvent):
    """A rename event in a Discussion / Pull Request.

    Subclass of [`DiscussionEvent`].

    Attributes:
        id (`str`):
            The ID of the event. A hexadecimal string.
        type (`str`):
            The type of the event.
        created_at (`datetime`):
            A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
            object holding the creation timestamp for the event.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        old_title (`str`):
            The previous title for the Discussion / Pull Request.
        new_title (`str`):
            The new title.
    """

    old_title: str
    new_title: str


def deserialize_event(event: dict) -> DiscussionEvent:
    """Instantiates a [`DiscussionEvent`] from a dict."""
    event_id: str = event["id"]
    event_type: str = event["type"]
    created_at = parse_datetime(event["createdAt"])

    common_args: DiscussionEventArgs = {
        "id": event_id,
        "type": event_type,
        "created_at": created_at,
        "author": event.get("author", {}).get("name", "deleted"),
        "_event": event,
    }

    if event_type == "comment":
        return DiscussionComment(
            **common_args,
            edited=event["data"]["edited"],
            hidden=event["data"]["hidden"],
            content=event["data"]["latest"]["raw"],
        )
    if event_type == "status-change":
        return DiscussionStatusChange(
            **common_args,
            new_status=event["data"]["status"],
        )
    if event_type == "commit":
        return DiscussionCommit(
            **common_args,
            summary=event["data"]["subject"],
            oid=event["data"]["oid"],
        )
    if event_type == "title-change":
        return DiscussionTitleChange(
            **common_args,
            old_title=event["data"]["from"],
            new_title=event["data"]["to"],
        )

    return DiscussionEvent(**common_args)
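A quick sketch of the dispatch above, fed a hypothetical `"status-change"` payload. Only the keys that `deserialize_event` actually reads are shown; the `createdAt` value assumes the ISO-8601 format that `parse_datetime` accepts:

```py
event = {
    "id": "0123abcd",  # hypothetical hexadecimal ID
    "type": "status-change",
    "createdAt": "2024-01-01T00:00:00.000Z",
    "author": {"name": "alice"},
    "data": {"status": "merged"},
}
evt = deserialize_event(event)
assert isinstance(evt, DiscussionStatusChange)
assert evt.new_status == "merged"

# An unknown "type" falls through all branches and yields a plain DiscussionEvent.
```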
venv/lib/python3.13/site-packages/huggingface_hub/constants.py
ADDED
@@ -0,0 +1,294 @@
import os
import re
import typing
from typing import Literal, Optional, Tuple


# Possible values for env variables


ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})


def _is_true(value: Optional[str]) -> bool:
    if value is None:
        return False
    return value.upper() in ENV_VARS_TRUE_VALUES


def _as_int(value: Optional[str]) -> Optional[int]:
    if value is None:
        return None
    return int(value)


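A small illustration of the truthiness rules encoded in `ENV_VARS_TRUE_VALUES` (matching is case-insensitive via `.upper()`):

```py
_is_true("yes")  # True
_is_true("ON")   # True
_is_true("0")    # False
_is_true(None)   # False (unset env variable)
_as_int("10")    # 10; _as_int(None) -> None
```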
# Constants for file downloads

PYTORCH_WEIGHTS_NAME = "pytorch_model.bin"
TF2_WEIGHTS_NAME = "tf_model.h5"
TF_WEIGHTS_NAME = "model.ckpt"
FLAX_WEIGHTS_NAME = "flax_model.msgpack"
CONFIG_NAME = "config.json"
REPOCARD_NAME = "README.md"
DEFAULT_ETAG_TIMEOUT = 10
DEFAULT_DOWNLOAD_TIMEOUT = 10
DEFAULT_REQUEST_TIMEOUT = 10
DOWNLOAD_CHUNK_SIZE = 10 * 1024 * 1024
HF_TRANSFER_CONCURRENCY = 100
MAX_HTTP_DOWNLOAD_SIZE = 50 * 1000 * 1000 * 1000  # 50 GB

# Constants for serialization

PYTORCH_WEIGHTS_FILE_PATTERN = "pytorch_model{suffix}.bin"  # Unsafe pickle: use safetensors instead
SAFETENSORS_WEIGHTS_FILE_PATTERN = "model{suffix}.safetensors"
TF2_WEIGHTS_FILE_PATTERN = "tf_model{suffix}.h5"

# Constants for safetensors repos

SAFETENSORS_SINGLE_FILE = "model.safetensors"
SAFETENSORS_INDEX_FILE = "model.safetensors.index.json"
SAFETENSORS_MAX_HEADER_LENGTH = 25_000_000

# Timeout for acquiring the file lock, and how often to log the attempt
FILELOCK_LOG_EVERY_SECONDS = 10

# Git-related constants

DEFAULT_REVISION = "main"
REGEX_COMMIT_OID = re.compile(r"[A-Fa-f0-9]{5,40}")

HUGGINGFACE_CO_URL_HOME = "https://huggingface.co/"

_staging_mode = _is_true(os.environ.get("HUGGINGFACE_CO_STAGING"))

_HF_DEFAULT_ENDPOINT = "https://huggingface.co"
_HF_DEFAULT_STAGING_ENDPOINT = "https://hub-ci.huggingface.co"
ENDPOINT = os.getenv("HF_ENDPOINT", _HF_DEFAULT_ENDPOINT).rstrip("/")
HUGGINGFACE_CO_URL_TEMPLATE = ENDPOINT + "/{repo_id}/resolve/{revision}/{filename}"

if _staging_mode:
    ENDPOINT = _HF_DEFAULT_STAGING_ENDPOINT
    HUGGINGFACE_CO_URL_TEMPLATE = _HF_DEFAULT_STAGING_ENDPOINT + "/{repo_id}/resolve/{revision}/{filename}"

HUGGINGFACE_HEADER_X_REPO_COMMIT = "X-Repo-Commit"
HUGGINGFACE_HEADER_X_LINKED_ETAG = "X-Linked-Etag"
HUGGINGFACE_HEADER_X_LINKED_SIZE = "X-Linked-Size"
HUGGINGFACE_HEADER_X_BILL_TO = "X-HF-Bill-To"

INFERENCE_ENDPOINT = os.environ.get("HF_INFERENCE_ENDPOINT", "https://api-inference.huggingface.co")

# See https://huggingface.co/docs/inference-endpoints/index
INFERENCE_ENDPOINTS_ENDPOINT = "https://api.endpoints.huggingface.cloud/v2"
INFERENCE_CATALOG_ENDPOINT = "https://endpoints.huggingface.co/api/catalog"

# See https://api.endpoints.huggingface.cloud/#post-/v2/endpoint/-namespace-
INFERENCE_ENDPOINT_IMAGE_KEYS = [
    "custom",
    "huggingface",
    "huggingfaceNeuron",
    "llamacpp",
    "tei",
    "tgi",
    "tgiNeuron",
]

# Proxy for third-party providers
INFERENCE_PROXY_TEMPLATE = "https://router.huggingface.co/{provider}"

REPO_ID_SEPARATOR = "--"
# ^ this substring is not allowed in repo_ids on hf.co
# and is the canonical one we use for serialization of repo ids elsewhere.


REPO_TYPE_DATASET = "dataset"
REPO_TYPE_SPACE = "space"
REPO_TYPE_MODEL = "model"
REPO_TYPES = [None, REPO_TYPE_MODEL, REPO_TYPE_DATASET, REPO_TYPE_SPACE]
SPACES_SDK_TYPES = ["gradio", "streamlit", "docker", "static"]

REPO_TYPES_URL_PREFIXES = {
    REPO_TYPE_DATASET: "datasets/",
    REPO_TYPE_SPACE: "spaces/",
}
REPO_TYPES_MAPPING = {
    "datasets": REPO_TYPE_DATASET,
    "spaces": REPO_TYPE_SPACE,
    "models": REPO_TYPE_MODEL,
}

DiscussionTypeFilter = Literal["all", "discussion", "pull_request"]
DISCUSSION_TYPES: Tuple[DiscussionTypeFilter, ...] = typing.get_args(DiscussionTypeFilter)
DiscussionStatusFilter = Literal["all", "open", "closed"]
DISCUSSION_STATUS: Tuple[DiscussionStatusFilter, ...] = typing.get_args(DiscussionStatusFilter)

# Webhook subscription types
WEBHOOK_DOMAIN_T = Literal["repo", "discussions"]

# default cache
default_home = os.path.join(os.path.expanduser("~"), ".cache")
HF_HOME = os.path.expandvars(
    os.path.expanduser(
        os.getenv(
            "HF_HOME",
            os.path.join(os.getenv("XDG_CACHE_HOME", default_home), "huggingface"),
        )
    )
)
hf_cache_home = HF_HOME  # for backward compatibility. TODO: remove this in 1.0.0

default_cache_path = os.path.join(HF_HOME, "hub")
default_assets_cache_path = os.path.join(HF_HOME, "assets")

# Legacy env variables
HUGGINGFACE_HUB_CACHE = os.getenv("HUGGINGFACE_HUB_CACHE", default_cache_path)
HUGGINGFACE_ASSETS_CACHE = os.getenv("HUGGINGFACE_ASSETS_CACHE", default_assets_cache_path)

# New env variables
HF_HUB_CACHE = os.path.expandvars(
    os.path.expanduser(
        os.getenv(
            "HF_HUB_CACHE",
            HUGGINGFACE_HUB_CACHE,
        )
    )
)
HF_ASSETS_CACHE = os.path.expandvars(
    os.path.expanduser(
        os.getenv(
            "HF_ASSETS_CACHE",
            HUGGINGFACE_ASSETS_CACHE,
        )
    )
)

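To make the layered cache resolution above concrete, here is a hedged re-implementation of the fallback chain (it omits the `expandvars`/`expanduser` steps and is not the library API, just an illustration; `resolve_hub_cache` is a hypothetical helper name):

```py
import os

def resolve_hub_cache(env: dict) -> str:
    """Reproduce the fallback chain: HF_HUB_CACHE > HUGGINGFACE_HUB_CACHE > <HF_HOME>/hub."""
    default_home = os.path.join(os.path.expanduser("~"), ".cache")
    hf_home = env.get("HF_HOME", os.path.join(env.get("XDG_CACHE_HOME", default_home), "huggingface"))
    legacy = env.get("HUGGINGFACE_HUB_CACHE", os.path.join(hf_home, "hub"))
    return env.get("HF_HUB_CACHE", legacy)

resolve_hub_cache({})                            # typically ~/.cache/huggingface/hub
resolve_hub_cache({"HF_HOME": "/data/hf"})       # /data/hf/hub
resolve_hub_cache({"HF_HUB_CACHE": "/mnt/hub"})  # /mnt/hub
```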
HF_HUB_OFFLINE = _is_true(os.environ.get("HF_HUB_OFFLINE") or os.environ.get("TRANSFORMERS_OFFLINE"))

# If set, log level will be set to DEBUG and all requests made to the Hub will be logged
# as curl commands for reproducibility.
HF_DEBUG = _is_true(os.environ.get("HF_DEBUG"))

# Opt-out from telemetry requests
HF_HUB_DISABLE_TELEMETRY = (
    _is_true(os.environ.get("HF_HUB_DISABLE_TELEMETRY"))  # HF-specific env variable
    or _is_true(os.environ.get("DISABLE_TELEMETRY"))
    or _is_true(os.environ.get("DO_NOT_TRACK"))  # https://consoledonottrack.com/
)

HF_TOKEN_PATH = os.path.expandvars(
    os.path.expanduser(
        os.getenv(
            "HF_TOKEN_PATH",
            os.path.join(HF_HOME, "token"),
        )
    )
)
HF_STORED_TOKENS_PATH = os.path.join(os.path.dirname(HF_TOKEN_PATH), "stored_tokens")

if _staging_mode:
    # In staging mode, we use a different cache to ensure we don't mix up production and staging data or tokens.
    # In practice in `huggingface_hub` tests, we monkeypatch these values with temporary directories. The following
    # lines are only used in third-party libraries tests (e.g. `transformers`, `diffusers`, etc.).
    _staging_home = os.path.join(os.path.expanduser("~"), ".cache", "huggingface_staging")
    HUGGINGFACE_HUB_CACHE = os.path.join(_staging_home, "hub")
    HF_TOKEN_PATH = os.path.join(_staging_home, "token")

# Here, `True` will disable progress bars globally without possibility of enabling them
# programmatically. `False` will enable them without possibility of disabling them.
# If the environment variable is not set (None), then the user is free to enable/disable
# them programmatically.
# TL;DR: env variable has priority over code.
__HF_HUB_DISABLE_PROGRESS_BARS = os.environ.get("HF_HUB_DISABLE_PROGRESS_BARS")
HF_HUB_DISABLE_PROGRESS_BARS: Optional[bool] = (
    _is_true(__HF_HUB_DISABLE_PROGRESS_BARS) if __HF_HUB_DISABLE_PROGRESS_BARS is not None else None
)

# Disable warning on machines that do not support symlinks (e.g. Windows non-developer)
HF_HUB_DISABLE_SYMLINKS_WARNING: bool = _is_true(os.environ.get("HF_HUB_DISABLE_SYMLINKS_WARNING"))

# Disable warning when using experimental features
HF_HUB_DISABLE_EXPERIMENTAL_WARNING: bool = _is_true(os.environ.get("HF_HUB_DISABLE_EXPERIMENTAL_WARNING"))

# Disable sending the cached token by default in all HTTP requests to the Hub
HF_HUB_DISABLE_IMPLICIT_TOKEN: bool = _is_true(os.environ.get("HF_HUB_DISABLE_IMPLICIT_TOKEN"))

# Enable fast-download using the external dependency "hf_transfer"
# See:
# - https://pypi.org/project/hf-transfer/
# - https://github.com/huggingface/hf_transfer (private)
HF_HUB_ENABLE_HF_TRANSFER: bool = _is_true(os.environ.get("HF_HUB_ENABLE_HF_TRANSFER"))


# UNUSED
# We don't use symlinks in local dir anymore.
HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD: int = (
    _as_int(os.environ.get("HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD")) or 5 * 1024 * 1024
)

# Used to override the etag timeout on a system level
HF_HUB_ETAG_TIMEOUT: int = _as_int(os.environ.get("HF_HUB_ETAG_TIMEOUT")) or DEFAULT_ETAG_TIMEOUT

# Used to override the get request timeout on a system level
HF_HUB_DOWNLOAD_TIMEOUT: int = _as_int(os.environ.get("HF_HUB_DOWNLOAD_TIMEOUT")) or DEFAULT_DOWNLOAD_TIMEOUT

# Allows adding information about the requester in the user-agent (e.g. partner name)
HF_HUB_USER_AGENT_ORIGIN: Optional[str] = os.environ.get("HF_HUB_USER_AGENT_ORIGIN")

# List frameworks that are handled by the InferenceAPI service. Useful to scan endpoints and check which models are
# deployed and running. Since 95% of the models are using the top 4 frameworks listed below, we scan only those by
# default. We still keep the full list of supported frameworks in case we want to scan all of them.
MAIN_INFERENCE_API_FRAMEWORKS = [
    "diffusers",
    "sentence-transformers",
    "text-generation-inference",
    "transformers",
]

ALL_INFERENCE_API_FRAMEWORKS = MAIN_INFERENCE_API_FRAMEWORKS + [
    "adapter-transformers",
    "allennlp",
    "asteroid",
    "bertopic",
    "doctr",
    "espnet",
    "fairseq",
    "fastai",
    "fasttext",
    "flair",
    "k2",
    "keras",
    "mindspore",
    "nemo",
    "open_clip",
    "paddlenlp",
    "peft",
    "pyannote-audio",
    "sklearn",
    "spacy",
    "span-marker",
    "speechbrain",
    "stanza",
    "timm",
]

# If OAuth didn't work after 2 redirects, there's likely a third-party cookie issue in the Space iframe view.
# In this case, we redirect the user to the non-iframe view.
OAUTH_MAX_REDIRECTS = 2

# OAuth-related environment variables injected by the Space
OAUTH_CLIENT_ID = os.environ.get("OAUTH_CLIENT_ID")
OAUTH_CLIENT_SECRET = os.environ.get("OAUTH_CLIENT_SECRET")
OAUTH_SCOPES = os.environ.get("OAUTH_SCOPES")
OPENID_PROVIDER_URL = os.environ.get("OPENID_PROVIDER_URL")

# Xet constants
HUGGINGFACE_HEADER_X_XET_ENDPOINT = "X-Xet-Cas-Url"
HUGGINGFACE_HEADER_X_XET_ACCESS_TOKEN = "X-Xet-Access-Token"
HUGGINGFACE_HEADER_X_XET_EXPIRATION = "X-Xet-Token-Expiration"
HUGGINGFACE_HEADER_X_XET_HASH = "X-Xet-Hash"
HUGGINGFACE_HEADER_X_XET_REFRESH_ROUTE = "X-Xet-Refresh-Route"
HUGGINGFACE_HEADER_LINK_XET_AUTH_KEY = "xet-auth"

default_xet_cache_path = os.path.join(HF_HOME, "xet")
HF_XET_CACHE = os.getenv("HF_XET_CACHE", default_xet_cache_path)
HF_HUB_DISABLE_XET: bool = _is_true(os.environ.get("HF_HUB_DISABLE_XET"))
venv/lib/python3.13/site-packages/huggingface_hub/dataclasses.py
ADDED
@@ -0,0 +1,484 @@
import inspect
from dataclasses import _MISSING_TYPE, MISSING, Field, field, fields
from functools import wraps
from typing import (
    Any,
    Callable,
    Dict,
    ForwardRef,
    List,
    Literal,
    Optional,
    Tuple,
    Type,
    TypeVar,
    Union,
    get_args,
    get_origin,
    overload,
)

from .errors import (
    StrictDataclassClassValidationError,
    StrictDataclassDefinitionError,
    StrictDataclassFieldValidationError,
)


Validator_T = Callable[[Any], None]
T = TypeVar("T")


# The overload decorator helps type checkers understand the different return types
@overload
def strict(cls: Type[T]) -> Type[T]: ...


@overload
def strict(*, accept_kwargs: bool = False) -> Callable[[Type[T]], Type[T]]: ...


def strict(
    cls: Optional[Type[T]] = None, *, accept_kwargs: bool = False
) -> Union[Type[T], Callable[[Type[T]], Type[T]]]:
    """
    Decorator to add strict validation to a dataclass.

    This decorator must be used on top of `@dataclass` to ensure IDEs and static typing tools
    recognize the class as a dataclass.

    Can be used with or without arguments:
    - `@strict`
    - `@strict(accept_kwargs=True)`

    Args:
        cls:
            The class to convert to a strict dataclass.
        accept_kwargs (`bool`, *optional*):
            If True, allows arbitrary keyword arguments in `__init__`. Defaults to False.

    Returns:
        The enhanced dataclass with strict validation on field assignment.

    Example:
    ```py
    >>> from dataclasses import dataclass
    >>> from huggingface_hub.dataclasses import as_validated_field, strict, validated_field

    >>> @as_validated_field
    ... def positive_int(value: int):
    ...     if not value >= 0:
    ...         raise ValueError(f"Value must be positive, got {value}")

    >>> @strict(accept_kwargs=True)
    ... @dataclass
    ... class User:
    ...     name: str
    ...     age: int = positive_int(default=10)

    # Initialize
    >>> User(name="John")
    User(name='John', age=10)

    # Extra kwargs are accepted
    >>> User(name="John", age=30, lastname="Doe")
    User(name='John', age=30, *lastname='Doe')

    # Invalid type => raises
    >>> User(name="John", age="30")
    huggingface_hub.errors.StrictDataclassFieldValidationError: Validation error for field 'age':
    TypeError: Field 'age' expected int, got str (value: '30')

    # Invalid value => raises
    >>> User(name="John", age=-1)
    huggingface_hub.errors.StrictDataclassFieldValidationError: Validation error for field 'age':
    ValueError: Value must be positive, got -1
    ```
    """

    def wrap(cls: Type[T]) -> Type[T]:
        if not hasattr(cls, "__dataclass_fields__"):
            raise StrictDataclassDefinitionError(
                f"Class '{cls.__name__}' must be a dataclass before applying @strict."
            )

        # List and store validators
        field_validators: Dict[str, List[Validator_T]] = {}
        for f in fields(cls):  # type: ignore [arg-type]
            validators = []
            validators.append(_create_type_validator(f))
            custom_validator = f.metadata.get("validator")
            if custom_validator is not None:
                if not isinstance(custom_validator, list):
                    custom_validator = [custom_validator]
                for validator in custom_validator:
                    if not _is_validator(validator):
                        raise StrictDataclassDefinitionError(
                            f"Invalid validator for field '{f.name}': {validator}. Must be a callable taking a single argument."
                        )
                validators.extend(custom_validator)
            field_validators[f.name] = validators
        cls.__validators__ = field_validators  # type: ignore

        # Override __setattr__ to validate fields on assignment
        original_setattr = cls.__setattr__

        def __strict_setattr__(self: Any, name: str, value: Any) -> None:
            """Custom __setattr__ method for strict dataclasses."""
            # Run all validators
            for validator in self.__validators__.get(name, []):
                try:
                    validator(value)
                except (ValueError, TypeError) as e:
                    raise StrictDataclassFieldValidationError(field=name, cause=e) from e

            # If validation passed, set the attribute
            original_setattr(self, name, value)

        cls.__setattr__ = __strict_setattr__  # type: ignore[method-assign]

        if accept_kwargs:
            # (optional) Override __init__ to accept arbitrary keyword arguments
            original_init = cls.__init__

            @wraps(original_init)
            def __init__(self, **kwargs: Any) -> None:
                # Extract only the fields that are part of the dataclass
                dataclass_fields = {f.name for f in fields(cls)}  # type: ignore [arg-type]
                standard_kwargs = {k: v for k, v in kwargs.items() if k in dataclass_fields}

                # Call the original __init__ with standard fields
                original_init(self, **standard_kwargs)

                # Add any additional kwargs as attributes
                for name, value in kwargs.items():
                    if name not in dataclass_fields:
                        self.__setattr__(name, value)

            cls.__init__ = __init__  # type: ignore[method-assign]

            # (optional) Override __repr__ to include additional kwargs
            original_repr = cls.__repr__

            @wraps(original_repr)
            def __repr__(self) -> str:
                # Call the original __repr__ to get the standard fields
                standard_repr = original_repr(self)

                # Get additional kwargs
                additional_kwargs = [
                    # add a '*' in front of additional kwargs to let the user know they are not part of the dataclass
                    f"*{k}={v!r}"
                    for k, v in self.__dict__.items()
                    if k not in cls.__dataclass_fields__  # type: ignore [attr-defined]
                ]
                additional_repr = ", ".join(additional_kwargs)

                # Combine both representations
                return f"{standard_repr[:-1]}, {additional_repr})" if additional_kwargs else standard_repr

            cls.__repr__ = __repr__  # type: ignore [method-assign]

        # List all public methods starting with `validate_` => class validators.
        class_validators = []

        for name in dir(cls):
            if not name.startswith("validate_"):
                continue
            method = getattr(cls, name)
            if not callable(method):
                continue
            if len(inspect.signature(method).parameters) != 1:
                raise StrictDataclassDefinitionError(
                    f"Class '{cls.__name__}' has a class validator '{name}' that takes more than one argument."
                    " Class validators must take only 'self' as an argument. Methods starting with 'validate_'"
                    " are considered to be class validators."
                )
            class_validators.append(method)

        cls.__class_validators__ = class_validators  # type: ignore [attr-defined]

        # Add `validate` method to the class, but first check if it already exists
        def validate(self: T) -> None:
            """Run class validators on the instance."""
            for validator in cls.__class_validators__:  # type: ignore [attr-defined]
                try:
                    validator(self)
                except (ValueError, TypeError) as e:
                    raise StrictDataclassClassValidationError(validator=validator.__name__, cause=e) from e

        # Hack to be able to raise if `.validate()` already exists, except if it was created by this decorator
        # on a parent class (in which case we just override it).
        validate.__is_defined_by_strict_decorator__ = True  # type: ignore [attr-defined]

        if hasattr(cls, "validate"):
            if not getattr(cls.validate, "__is_defined_by_strict_decorator__", False):  # type: ignore [attr-defined]
                raise StrictDataclassDefinitionError(
                    f"Class '{cls.__name__}' already implements a method called 'validate'."
                    " This method name is reserved when using the @strict decorator on a dataclass."
                    " If you want to keep your own method, please rename it."
                )

        cls.validate = validate  # type: ignore

        # Run class validators after initialization
        initial_init = cls.__init__

        @wraps(initial_init)
        def init_with_validate(self, *args, **kwargs) -> None:
            """Run class validators after initialization."""
            initial_init(self, *args, **kwargs)  # type: ignore [call-arg]
            cls.validate(self)  # type: ignore [attr-defined]

        setattr(cls, "__init__", init_with_validate)

        return cls

    # Return wrapped class or the decorator itself
    return wrap(cls) if cls is not None else wrap


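The `validate_` scan above also enables cross-field checks. A minimal sketch (the `Range` class and its validator are illustrative, not part of the library):

```py
from dataclasses import dataclass
from huggingface_hub.dataclasses import strict

@strict
@dataclass
class Range:
    low: int = 0
    high: int = 10

    def validate_order(self):
        # Name starts with `validate_` and takes only `self`, so @strict
        # registers it as a class validator and runs it after __init__.
        if self.low > self.high:
            raise ValueError(f"low ({self.low}) must be <= high ({self.high})")

Range(low=1, high=5)  # ok
Range(low=5, high=1)  # raises StrictDataclassClassValidationError
```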
def validated_field(
    validator: Union[List[Validator_T], Validator_T],
    default: Union[Any, _MISSING_TYPE] = MISSING,
    default_factory: Union[Callable[[], Any], _MISSING_TYPE] = MISSING,
    init: bool = True,
    repr: bool = True,
    hash: Optional[bool] = None,
    compare: bool = True,
    metadata: Optional[Dict] = None,
    **kwargs: Any,
) -> Any:
    """
    Create a dataclass field with a custom validator.

    Useful to apply several checks to a field. If only applying one rule, check out the [`as_validated_field`] decorator.

    Args:
        validator (`Callable` or `List[Callable]`):
            A method that takes a value as input and raises ValueError/TypeError if the value is invalid.
            Can be a list of validators to apply multiple checks.
        **kwargs:
            Additional arguments to pass to `dataclasses.field()`.

    Returns:
        A field with the validator(s) attached in metadata.
    """
    if not isinstance(validator, list):
        validator = [validator]
    if metadata is None:
        metadata = {}
    metadata["validator"] = validator
    return field(  # type: ignore
        default=default,  # type: ignore [arg-type]
        default_factory=default_factory,  # type: ignore [arg-type]
        init=init,
        repr=repr,
        hash=hash,
        compare=compare,
        metadata=metadata,
        **kwargs,
    )


def as_validated_field(validator: Validator_T):
    """
    Decorates a validator function as a [`validated_field`] (i.e. a dataclass field with a custom validator).

    Args:
        validator (`Callable`):
            A method that takes a value as input and raises ValueError/TypeError if the value is invalid.
    """

    def _inner(
        default: Union[Any, _MISSING_TYPE] = MISSING,
        default_factory: Union[Callable[[], Any], _MISSING_TYPE] = MISSING,
        init: bool = True,
        repr: bool = True,
        hash: Optional[bool] = None,
        compare: bool = True,
        metadata: Optional[Dict] = None,
        **kwargs: Any,
    ):
        return validated_field(
            validator,
            default=default,
            default_factory=default_factory,
            init=init,
            repr=repr,
            hash=hash,
            compare=compare,
            metadata=metadata,
            **kwargs,
        )

    return _inner


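A sketch of the multi-validator path mentioned in the `validated_field` docstring, using two hypothetical check functions:

```py
from dataclasses import dataclass
from huggingface_hub.dataclasses import strict, validated_field

def non_empty(value: str):
    if not value:
        raise ValueError("must not be empty")

def lowercase(value: str):
    if value != value.lower():
        raise ValueError("must be lowercase")

@strict
@dataclass
class Repo:
    # Both checks run (after the implicit type check) on __init__ and on every assignment.
    name: str = validated_field([non_empty, lowercase], default="model")

Repo(name="bert")  # ok
Repo(name="BERT")  # raises StrictDataclassFieldValidationError ("must be lowercase")
```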
def type_validator(name: str, value: Any, expected_type: Any) -> None:
    """Validate that `value` matches `expected_type`."""
    origin = get_origin(expected_type)
    args = get_args(expected_type)

    if expected_type is Any:
        return
    elif validator := _BASIC_TYPE_VALIDATORS.get(origin):
        validator(name, value, args)
    elif isinstance(expected_type, type):  # simple types
        _validate_simple_type(name, value, expected_type)
    elif isinstance(expected_type, ForwardRef) or isinstance(expected_type, str):
        return
    else:
        raise TypeError(f"Unsupported type for field '{name}': {expected_type}")


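A few direct calls illustrating the dispatch (`type_validator` is module-level in this file, so importing it should work even though it is not in `__all__`; that import path is an assumption):

```py
from typing import List, Optional
from huggingface_hub.dataclasses import type_validator  # assumed importable

type_validator("count", 3, int)                # passes silently
type_validator("tags", ["a", "b"], List[str])  # origin is list -> recurses into items
type_validator("name", None, Optional[str])    # Optional is Union[str, None] -> union branch
type_validator("count", "3", int)              # raises TypeError
```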
def _validate_union(name: str, value: Any, args: Tuple[Any, ...]) -> None:
    """Validate that value matches one of the types in a Union."""
    errors = []
    for t in args:
        try:
            type_validator(name, value, t)
            return  # Valid if any type matches
        except TypeError as e:
            errors.append(str(e))

    raise TypeError(
        f"Field '{name}' with value {repr(value)} doesn't match any type in {args}. Errors: {'; '.join(errors)}"
    )


def _validate_literal(name: str, value: Any, args: Tuple[Any, ...]) -> None:
    """Validate Literal type."""
    if value not in args:
        raise TypeError(f"Field '{name}' expected one of {args}, got {value}")


def _validate_list(name: str, value: Any, args: Tuple[Any, ...]) -> None:
    """Validate List[T] type."""
    if not isinstance(value, list):
        raise TypeError(f"Field '{name}' expected a list, got {type(value).__name__}")

    # Validate each item in the list
    item_type = args[0]
    for i, item in enumerate(value):
        try:
            type_validator(f"{name}[{i}]", item, item_type)
        except TypeError as e:
            raise TypeError(f"Invalid item at index {i} in list '{name}'") from e


def _validate_dict(name: str, value: Any, args: Tuple[Any, ...]) -> None:
    """Validate Dict[K, V] type."""
    if not isinstance(value, dict):
        raise TypeError(f"Field '{name}' expected a dict, got {type(value).__name__}")

    # Validate keys and values
    key_type, value_type = args
    for k, v in value.items():
        try:
            type_validator(f"{name}.key", k, key_type)
            type_validator(f"{name}[{k!r}]", v, value_type)
        except TypeError as e:
            raise TypeError(f"Invalid key or value in dict '{name}'") from e


def _validate_tuple(name: str, value: Any, args: Tuple[Any, ...]) -> None:
    """Validate Tuple type."""
    if not isinstance(value, tuple):
        raise TypeError(f"Field '{name}' expected a tuple, got {type(value).__name__}")

    # Handle variable-length tuples: Tuple[T, ...]
    if len(args) == 2 and args[1] is Ellipsis:
        for i, item in enumerate(value):
            try:
                type_validator(f"{name}[{i}]", item, args[0])
            except TypeError as e:
                raise TypeError(f"Invalid item at index {i} in tuple '{name}'") from e
    # Handle fixed-length tuples: Tuple[T1, T2, ...]
    elif len(args) != len(value):
        raise TypeError(f"Field '{name}' expected a tuple of length {len(args)}, got {len(value)}")
    else:
        for i, (item, expected) in enumerate(zip(value, args)):
            try:
                type_validator(f"{name}[{i}]", item, expected)
            except TypeError as e:
                raise TypeError(f"Invalid item at index {i} in tuple '{name}'") from e


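The two tuple branches above in action (same assumed import as before):

```py
from typing import Tuple
from huggingface_hub.dataclasses import type_validator  # assumed importable

type_validator("point", (1.0, 2.0), Tuple[float, float])  # fixed-length: per-position check
type_validator("ids", (1, 2, 3), Tuple[int, ...])         # variable-length: every item is int
type_validator("point", (1.0,), Tuple[float, float])      # raises TypeError (length mismatch)
```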
def _validate_set(name: str, value: Any, args: Tuple[Any, ...]) -> None:
    """Validate Set[T] type."""
    if not isinstance(value, set):
        raise TypeError(f"Field '{name}' expected a set, got {type(value).__name__}")

    # Validate each item in the set
    item_type = args[0]
    for item in value:
        try:
            type_validator(f"{name} item", item, item_type)
        except TypeError as e:
            raise TypeError(f"Invalid item in set '{name}'") from e


def _validate_simple_type(name: str, value: Any, expected_type: type) -> None:
    """Validate simple type (int, str, etc.)."""
    if not isinstance(value, expected_type):
        raise TypeError(
            f"Field '{name}' expected {expected_type.__name__}, got {type(value).__name__} (value: {repr(value)})"
        )


def _create_type_validator(field: Field) -> Validator_T:
    """Create a type validator function for a field."""
    # Hacky: we cannot use a lambda here because of reference issues

    def validator(value: Any) -> None:
        type_validator(field.name, value, field.type)

    return validator


def _is_validator(validator: Any) -> bool:
    """Check if a function is a validator.

    A validator is a Callable that can be called with a single positional argument.
    The validator can have more arguments with default values.

    Basically, returns True if `validator(value)` is possible.
    """
    if not callable(validator):
        return False

    signature = inspect.signature(validator)
    parameters = list(signature.parameters.values())
    if len(parameters) == 0:
        return False
    if parameters[0].kind not in (
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
        inspect.Parameter.POSITIONAL_ONLY,
        inspect.Parameter.VAR_POSITIONAL,
    ):
        return False
    for parameter in parameters[1:]:
        if parameter.default == inspect.Parameter.empty:
            return False
    return True


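A few probes of `_is_validator` semantics (builtin signatures like `len`'s are introspectable in recent CPython; that assumption aside, these follow directly from the checks above):

```py
_is_validator(len)                          # True: one positional argument
_is_validator(lambda v, strict=True: None)  # True: extra parameters all have defaults
_is_validator(lambda: None)                 # False: cannot be called with one argument
_is_validator("not callable")               # False
```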
_BASIC_TYPE_VALIDATORS = {
    Union: _validate_union,
    Literal: _validate_literal,
    list: _validate_list,
    dict: _validate_dict,
    tuple: _validate_tuple,
    set: _validate_set,
}


__all__ = [
    "strict",
    "validated_field",
    "Validator_T",
    "StrictDataclassClassValidationError",
    "StrictDataclassDefinitionError",
    "StrictDataclassFieldValidationError",
]
venv/lib/python3.13/site-packages/huggingface_hub/errors.py
ADDED
@@ -0,0 +1,377 @@
"""Contains all custom errors."""

from pathlib import Path
from typing import Optional, Union

from requests import HTTPError, Response


# CACHE ERRORS


class CacheNotFound(Exception):
    """Exception thrown when the Huggingface cache is not found."""

    cache_dir: Union[str, Path]

    def __init__(self, msg: str, cache_dir: Union[str, Path], *args, **kwargs):
        super().__init__(msg, *args, **kwargs)
        self.cache_dir = cache_dir


class CorruptedCacheException(Exception):
    """Exception for any unexpected structure in the Huggingface cache-system."""


# HEADERS ERRORS


class LocalTokenNotFoundError(EnvironmentError):
    """Raised if a local token is required but not found."""


# HTTP ERRORS


class OfflineModeIsEnabled(ConnectionError):
    """Raised when a request is made but `HF_HUB_OFFLINE=1` is set as environment variable."""


class HfHubHTTPError(HTTPError):
    """
    HTTPError to inherit from for any custom HTTP Error raised in HF Hub.

    Any HTTPError is converted at least into a `HfHubHTTPError`. If some information is
    sent back by the server, it will be added to the error message.

    Added details:
    - Request id from the "X-Request-Id" header if it exists. If not, fallback to the "X-Amzn-Trace-Id" header if it exists.
    - Server error message from the header "X-Error-Message".
    - Server error message if we can find one in the response body.

    Example:
    ```py
    import requests
    from huggingface_hub.utils import get_session, hf_raise_for_status, HfHubHTTPError

    response = get_session().post(...)
    try:
        hf_raise_for_status(response)
    except HfHubHTTPError as e:
        print(str(e))  # formatted message
        e.request_id, e.server_message  # details returned by server

        # Complete the error message with additional information once it's raised
        e.append_to_message("\n`create_commit` expects the repository to exist.")
        raise
    ```
    """

    def __init__(self, message: str, response: Optional[Response] = None, *, server_message: Optional[str] = None):
        self.request_id = (
            response.headers.get("x-request-id") or response.headers.get("X-Amzn-Trace-Id")
            if response is not None
            else None
        )
        self.server_message = server_message

        super().__init__(
            message,
            response=response,  # type: ignore [arg-type]
            request=response.request if response is not None else None,  # type: ignore [arg-type]
        )

    def append_to_message(self, additional_message: str) -> None:
        """Append additional information to the `HfHubHTTPError` initial message."""
        self.args = (self.args[0] + additional_message,) + self.args[1:]


# INFERENCE CLIENT ERRORS


class InferenceTimeoutError(HTTPError, TimeoutError):
    """Error raised when a model is unavailable or the request times out."""


# INFERENCE ENDPOINT ERRORS


class InferenceEndpointError(Exception):
    """Generic exception when dealing with Inference Endpoints."""


class InferenceEndpointTimeoutError(InferenceEndpointError, TimeoutError):
    """Exception for timeouts while waiting for an Inference Endpoint."""


# SAFETENSORS ERRORS


class SafetensorsParsingError(Exception):
    """Raised when failing to parse a safetensors file metadata.

    This can be the case if the file is not a safetensors file or does not respect the specification.
    """


class NotASafetensorsRepoError(Exception):
    """Raised when a repo is not a Safetensors repo, i.e. it has neither a `model.safetensors` nor a
    `model.safetensors.index.json` file.
    """


# TEXT GENERATION ERRORS


class TextGenerationError(HTTPError):
    """Generic error raised if text-generation went wrong."""


# Text Generation Inference Errors
class ValidationError(TextGenerationError):
    """Server-side validation error."""


class GenerationError(TextGenerationError):
    pass


class OverloadedError(TextGenerationError):
    pass


class IncompleteGenerationError(TextGenerationError):
    pass


class UnknownError(TextGenerationError):
    pass


# VALIDATION ERRORS


class HFValidationError(ValueError):
    """Generic exception thrown by `huggingface_hub` validators.

    Inherits from [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError).
    """


# FILE METADATA ERRORS


class FileMetadataError(OSError):
    """Error triggered when the metadata of a file on the Hub cannot be retrieved (missing ETag or commit_hash).

    Inherits from `OSError` for backward compatibility.
    """


# REPOSITORY ERRORS


class RepositoryNotFoundError(HfHubHTTPError):
    """
    Raised when trying to access a hf.co URL with an invalid repository name, or
    with a private repo name the user does not have access to.

    Example:

    ```py
    >>> from huggingface_hub import model_info
    >>> model_info("<non_existent_repository>")
    (...)
    huggingface_hub.utils._errors.RepositoryNotFoundError: 401 Client Error. (Request ID: PvMw_VjBMjVdMz53WKIzP)

    Repository Not Found for url: https://huggingface.co/api/models/%3Cnon_existent_repository%3E.
    Please make sure you specified the correct `repo_id` and `repo_type`.
    If the repo is private, make sure you are authenticated.
    Invalid username or password.
    ```
    """


class GatedRepoError(RepositoryNotFoundError):
    """
    Raised when trying to access a gated repository for which the user is not on the
    authorized list.

    Note: derives from `RepositoryNotFoundError` to ensure backward compatibility.

    Example:

    ```py
    >>> from huggingface_hub import model_info
    >>> model_info("<gated_repository>")
    (...)
    huggingface_hub.utils._errors.GatedRepoError: 403 Client Error. (Request ID: ViT1Bf7O_026LGSQuVqfa)

    Cannot access gated repo for url https://huggingface.co/api/models/ardent-figment/gated-model.
    Access to model ardent-figment/gated-model is restricted and you are not in the authorized list.
    Visit https://huggingface.co/ardent-figment/gated-model to ask for access.
    ```
    """


class DisabledRepoError(HfHubHTTPError):
    """
    Raised when trying to access a repository that has been disabled by its author.

    Example:

    ```py
    >>> from huggingface_hub import dataset_info
    >>> dataset_info("laion/laion-art")
    (...)
    huggingface_hub.utils._errors.DisabledRepoError: 403 Client Error. (Request ID: Root=1-659fc3fa-3031673e0f92c71a2260dbe2;bc6f4dfb-b30a-4862-af0a-5cfe827610d8)

    Cannot access repository for url https://huggingface.co/api/datasets/laion/laion-art.
    Access to this resource is disabled.
    ```
    """


# REVISION ERROR


class RevisionNotFoundError(HfHubHTTPError):
    """
    Raised when trying to access a hf.co URL with a valid repository but an invalid
    revision.

    Example:

    ```py
    >>> from huggingface_hub import hf_hub_download
    >>> hf_hub_download('bert-base-cased', 'config.json', revision='<non-existent-revision>')
    (...)
    huggingface_hub.utils._errors.RevisionNotFoundError: 404 Client Error. (Request ID: Mwhe_c3Kt650GcdKEFomX)

    Revision Not Found for url: https://huggingface.co/bert-base-cased/resolve/%3Cnon-existent-revision%3E/config.json.
    ```
    """


# ENTRY ERRORS
class EntryNotFoundError(HfHubHTTPError):
    """
    Raised when trying to access a hf.co URL with a valid repository and revision
    but an invalid filename.

    Example:

    ```py
    >>> from huggingface_hub import hf_hub_download
    >>> hf_hub_download('bert-base-cased', '<non-existent-file>')
    (...)
    huggingface_hub.utils._errors.EntryNotFoundError: 404 Client Error. (Request ID: 53pNl6M0MxsnG5Sw8JA6x)

    Entry Not Found for url: https://huggingface.co/bert-base-cased/resolve/main/%3Cnon-existent-file%3E.
    ```
    """


class LocalEntryNotFoundError(EntryNotFoundError, FileNotFoundError, ValueError):
    """
    Raised when trying to access a file or snapshot that is not on the disk when network is
    disabled or unavailable (connection issue). The entry may exist on the Hub.

    Note: `ValueError` type is to ensure backward compatibility.
    Note: `LocalEntryNotFoundError` derives from `HTTPError` because of `EntryNotFoundError`,
    even when it is not a network issue.

    Example:

    ```py
    >>> from huggingface_hub import hf_hub_download
    >>> hf_hub_download('bert-base-cased', '<non-cached-file>', local_files_only=True)
    (...)
    huggingface_hub.utils._errors.LocalEntryNotFoundError: Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable hf.co look-ups and downloads online, set 'local_files_only' to False.
    ```
    """

    def __init__(self, message: str):
        super().__init__(message, response=None)


# REQUEST ERROR
class BadRequestError(HfHubHTTPError, ValueError):
    """
    Raised by `hf_raise_for_status` when the server returns an HTTP 400 error.

    Example:

    ```py
    >>> resp = requests.post("hf.co/api/check", ...)
    >>> hf_raise_for_status(resp, endpoint_name="check")
    huggingface_hub.utils._errors.BadRequestError: Bad request for check endpoint: {details} (Request ID: XXX)
    ```
    """


# DDUF file format ERROR


class DDUFError(Exception):
    """Base exception for errors related to the DDUF format."""


class DDUFCorruptedFileError(DDUFError):
    """Exception thrown when the DDUF file is corrupted."""


class DDUFExportError(DDUFError):
    """Base exception for errors during DDUF export."""


class DDUFInvalidEntryNameError(DDUFExportError):
    """Exception thrown when the entry name is invalid."""


# STRICT DATACLASSES ERRORS


class StrictDataclassError(Exception):
    """Base exception for strict dataclasses."""


class StrictDataclassDefinitionError(StrictDataclassError):
    """Exception thrown when a strict dataclass is defined incorrectly."""


class StrictDataclassFieldValidationError(StrictDataclassError):
    """Exception thrown when a strict dataclass fails validation for a given field."""

    def __init__(self, field: str, cause: Exception):
        error_message = f"Validation error for field '{field}':"
        error_message += f"\n  {cause.__class__.__name__}: {cause}"
        super().__init__(error_message)


class StrictDataclassClassValidationError(StrictDataclassError):
    """Exception thrown when a strict dataclass fails validation on a class validator."""

    def __init__(self, validator: str, cause: Exception):
        error_message = f"Class validation error for validator '{validator}':"
        error_message += f"\n  {cause.__class__.__name__}: {cause}"
        super().__init__(error_message)


# XET ERRORS
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
class XetError(Exception):
|
| 365 |
+
"""Base exception for errors related to Xet Storage."""
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
class XetAuthorizationError(XetError):
|
| 369 |
+
"""Exception thrown when the user does not have the right authorization to use Xet Storage."""
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
class XetRefreshTokenError(XetError):
|
| 373 |
+
"""Exception thrown when the refresh token is invalid."""
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
class XetDownloadError(Exception):
|
| 377 |
+
"""Exception thrown when the download from Xet Storage fails."""
|
venv/lib/python3.13/site-packages/huggingface_hub/fastai_utils.py
ADDED
@@ -0,0 +1,415 @@
import json
import os
from pathlib import Path
from pickle import DEFAULT_PROTOCOL, PicklingError
from typing import Any, Dict, List, Optional, Union

from packaging import version

from huggingface_hub import constants, snapshot_download
from huggingface_hub.hf_api import HfApi
from huggingface_hub.utils import (
    SoftTemporaryDirectory,
    get_fastai_version,
    get_fastcore_version,
    get_python_version,
)

from .utils import logging, validate_hf_hub_args
from .utils._runtime import _PY_VERSION  # noqa: F401 # for backward compatibility...


logger = logging.get_logger(__name__)


def _check_fastai_fastcore_versions(
    fastai_min_version: str = "2.4",
    fastcore_min_version: str = "1.3.27",
):
    """
    Checks that the installed fastai and fastcore versions are compatible for pickle serialization.

    Args:
        fastai_min_version (`str`, *optional*):
            The minimum fastai version supported.
        fastcore_min_version (`str`, *optional*):
            The minimum fastcore version supported.

    > [!TIP]
    > Raises the following error:
    >
    > - [`ImportError`](https://docs.python.org/3/library/exceptions.html#ImportError)
    > if the fastai or fastcore libraries are not available or are of an invalid version.
    """

    if (get_fastcore_version() or get_fastai_version()) == "N/A":
        raise ImportError(
            f"fastai>={fastai_min_version} and fastcore>={fastcore_min_version} are"
            f" required. Currently using fastai=={get_fastai_version()} and"
            f" fastcore=={get_fastcore_version()}."
        )

    current_fastai_version = version.Version(get_fastai_version())
    current_fastcore_version = version.Version(get_fastcore_version())

    if current_fastai_version < version.Version(fastai_min_version):
        raise ImportError(
            "`push_to_hub_fastai` and `from_pretrained_fastai` require a"
            f" fastai>={fastai_min_version} version, but you are using fastai version"
            f" {get_fastai_version()} which is incompatible. Upgrade with `pip install"
            " fastai==2.5.6`."
        )

    if current_fastcore_version < version.Version(fastcore_min_version):
        raise ImportError(
            "`push_to_hub_fastai` and `from_pretrained_fastai` require a"
            f" fastcore>={fastcore_min_version} version, but you are using fastcore"
            f" version {get_fastcore_version()} which is incompatible. Upgrade with"
            " `pip install fastcore==1.3.27`."
        )


def _check_fastai_fastcore_pyproject_versions(
    storage_folder: str,
    fastai_min_version: str = "2.4",
    fastcore_min_version: str = "1.3.27",
):
    """
    Checks that the `pyproject.toml` file in the directory `storage_folder` has fastai and fastcore versions
    that are compatible with `from_pretrained_fastai` and `push_to_hub_fastai`. If `pyproject.toml` does not exist
    or does not contain versions for fastai and fastcore, then it logs a warning.

    Args:
        storage_folder (`str`):
            Folder to look for the `pyproject.toml` file.
        fastai_min_version (`str`, *optional*):
            The minimum fastai version supported.
        fastcore_min_version (`str`, *optional*):
            The minimum fastcore version supported.

    > [!TIP]
    > Raises the following errors:
    >
    > - [`ImportError`](https://docs.python.org/3/library/exceptions.html#ImportError)
    > if the `toml` module is not installed.
    > - [`ImportError`](https://docs.python.org/3/library/exceptions.html#ImportError)
    > if the `pyproject.toml` indicates a lower than minimum supported version of fastai or fastcore.
    """

    try:
        import toml
    except ModuleNotFoundError:
        raise ImportError(
            "`push_to_hub_fastai` and `from_pretrained_fastai` require the toml module."
            " Install it with `pip install toml`."
        )

    # Checks that a `pyproject.toml`, with `build-system` and `requires` sections, exists in the repository. If so, get a list of required packages.
    if not os.path.isfile(f"{storage_folder}/pyproject.toml"):
        logger.warning(
            "There is no `pyproject.toml` in the repository that contains the fastai"
            " `Learner`. The `pyproject.toml` would allow us to verify that your fastai"
            " and fastcore versions are compatible with those of the model you want to"
            " load."
        )
        return
    pyproject_toml = toml.load(f"{storage_folder}/pyproject.toml")

    if "build-system" not in pyproject_toml.keys():
        logger.warning(
            "There is no `build-system` section in the pyproject.toml of the repository"
            " that contains the fastai `Learner`. The `build-system` would allow us to"
            " verify that your fastai and fastcore versions are compatible with those"
            " of the model you want to load."
        )
        return
    build_system_toml = pyproject_toml["build-system"]

    if "requires" not in build_system_toml.keys():
        logger.warning(
            "There is no `requires` section in the pyproject.toml of the repository"
            " that contains the fastai `Learner`. The `requires` would allow us to"
            " verify that your fastai and fastcore versions are compatible with those"
            " of the model you want to load."
        )
        return
    package_versions = build_system_toml["requires"]

    # Extracts the fastai and fastcore versions from `pyproject.toml` if available.
    # If the package is specified but not the version (e.g. "fastai" instead of "fastai=2.4"), the default versions are the highest.
    fastai_packages = [pck for pck in package_versions if pck.startswith("fastai")]
    if len(fastai_packages) == 0:
        logger.warning("The repository does not have a fastai version specified in the `pyproject.toml`.")
    # fastai_version is an empty string if not specified
    else:
        fastai_version = str(fastai_packages[0]).partition("=")[2]
        if fastai_version != "" and version.Version(fastai_version) < version.Version(fastai_min_version):
            raise ImportError(
                "`from_pretrained_fastai` requires"
                f" fastai>={fastai_min_version} version but the model to load uses"
                f" {fastai_version} which is incompatible."
            )

    fastcore_packages = [pck for pck in package_versions if pck.startswith("fastcore")]
    if len(fastcore_packages) == 0:
        logger.warning("The repository does not have a fastcore version specified in the `pyproject.toml`.")
    # fastcore_version is an empty string if not specified
    else:
        fastcore_version = str(fastcore_packages[0]).partition("=")[2]
        if fastcore_version != "" and version.Version(fastcore_version) < version.Version(fastcore_min_version):
            raise ImportError(
                "`from_pretrained_fastai` requires"
                f" fastcore>={fastcore_min_version} version, but you are using fastcore"
                f" version {fastcore_version} which is incompatible."
            )

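Worth noting how the version pins are parsed above: `str.partition("=")` splits at the *first* `=`, so both `fastai=2.4` and `fastai>=2.4` yield the bare version string, while an unpinned `fastai` yields an empty string and the check is skipped. A quick illustration (plain Python string semantics, not library API):

```py
>>> "fastai=2.4".partition("=")[2]
'2.4'
>>> "fastai>=2.4".partition("=")[2]   # the first "=" is the one inside ">="
'2.4'
>>> "fastai".partition("=")[2]        # no pin -> empty string, so the version check is skipped
''
```
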
README_TEMPLATE = """---
|
| 168 |
+
tags:
|
| 169 |
+
- fastai
|
| 170 |
+
---
|
| 171 |
+
|
| 172 |
+
# Amazing!
|
| 173 |
+
|
| 174 |
+
🥳 Congratulations on hosting your fastai model on the Hugging Face Hub!
|
| 175 |
+
|
| 176 |
+
# Some next steps
|
| 177 |
+
1. Fill out this model card with more information (see the template below and the [documentation here](https://huggingface.co/docs/hub/model-repos))!
|
| 178 |
+
|
| 179 |
+
2. Create a demo in Gradio or Streamlit using 🤗 Spaces ([documentation here](https://huggingface.co/docs/hub/spaces)).
|
| 180 |
+
|
| 181 |
+
3. Join the fastai community on the [Fastai Discord](https://discord.com/invite/YKrxeNn)!
|
| 182 |
+
|
| 183 |
+
Greetings fellow fastlearner 🤝! Don't forget to delete this content from your model card.
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
---
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
# Model card
|
| 190 |
+
|
| 191 |
+
## Model description
|
| 192 |
+
More information needed
|
| 193 |
+
|
| 194 |
+
## Intended uses & limitations
|
| 195 |
+
More information needed
|
| 196 |
+
|
| 197 |
+
## Training and evaluation data
|
| 198 |
+
More information needed
|
| 199 |
+
"""
|
| 200 |
+
|
| 201 |
+
PYPROJECT_TEMPLATE = f"""[build-system]
|
| 202 |
+
requires = ["setuptools>=40.8.0", "wheel", "python={get_python_version()}", "fastai={get_fastai_version()}", "fastcore={get_fastcore_version()}"]
|
| 203 |
+
build-backend = "setuptools.build_meta:__legacy__"
|
| 204 |
+
"""
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def _create_model_card(repo_dir: Path):
|
| 208 |
+
"""
|
| 209 |
+
Creates a model card for the repository.
|
| 210 |
+
|
| 211 |
+
Args:
|
| 212 |
+
repo_dir (`Path`):
|
| 213 |
+
Directory where model card is created.
|
| 214 |
+
"""
|
| 215 |
+
readme_path = repo_dir / "README.md"
|
| 216 |
+
|
| 217 |
+
if not readme_path.exists():
|
| 218 |
+
with readme_path.open("w", encoding="utf-8") as f:
|
| 219 |
+
f.write(README_TEMPLATE)
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def _create_model_pyproject(repo_dir: Path):
|
| 223 |
+
"""
|
| 224 |
+
Creates a `pyproject.toml` for the repository.
|
| 225 |
+
|
| 226 |
+
Args:
|
| 227 |
+
repo_dir (`Path`):
|
| 228 |
+
Directory where `pyproject.toml` is created.
|
| 229 |
+
"""
|
| 230 |
+
pyproject_path = repo_dir / "pyproject.toml"
|
| 231 |
+
|
| 232 |
+
if not pyproject_path.exists():
|
| 233 |
+
with pyproject_path.open("w", encoding="utf-8") as f:
|
| 234 |
+
f.write(PYPROJECT_TEMPLATE)
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def _save_pretrained_fastai(
|
| 238 |
+
learner,
|
| 239 |
+
save_directory: Union[str, Path],
|
| 240 |
+
config: Optional[Dict[str, Any]] = None,
|
| 241 |
+
):
|
| 242 |
+
"""
|
| 243 |
+
Saves a fastai learner to `save_directory` in pickle format using the default pickle protocol for the version of python used.
|
| 244 |
+
|
| 245 |
+
Args:
|
| 246 |
+
learner (`Learner`):
|
| 247 |
+
The `fastai.Learner` you'd like to save.
|
| 248 |
+
save_directory (`str` or `Path`):
|
| 249 |
+
Specific directory in which you want to save the fastai learner.
|
| 250 |
+
config (`dict`, *optional*):
|
| 251 |
+
Configuration object. Will be uploaded as a .json file. Example: 'https://huggingface.co/espejelomar/fastai-pet-breeds-classification/blob/main/config.json'.
|
| 252 |
+
|
| 253 |
+
> [!TIP]
|
| 254 |
+
> Raises the following error:
|
| 255 |
+
>
|
| 256 |
+
> - [`RuntimeError`](https://docs.python.org/3/library/exceptions.html#RuntimeError)
|
| 257 |
+
> if the config file provided is not a dictionary.
|
| 258 |
+
"""
|
| 259 |
+
_check_fastai_fastcore_versions()
|
| 260 |
+
|
| 261 |
+
os.makedirs(save_directory, exist_ok=True)
|
| 262 |
+
|
| 263 |
+
# if the user provides config then we update it with the fastai and fastcore versions in CONFIG_TEMPLATE.
|
| 264 |
+
if config is not None:
|
| 265 |
+
if not isinstance(config, dict):
|
| 266 |
+
raise RuntimeError(f"Provided config should be a dict. Got: '{type(config)}'")
|
| 267 |
+
path = os.path.join(save_directory, constants.CONFIG_NAME)
|
| 268 |
+
with open(path, "w") as f:
|
| 269 |
+
json.dump(config, f)
|
| 270 |
+
|
| 271 |
+
_create_model_card(Path(save_directory))
|
| 272 |
+
_create_model_pyproject(Path(save_directory))
|
| 273 |
+
|
| 274 |
+
# learner.export saves the model in `self.path`.
|
| 275 |
+
learner.path = Path(save_directory)
|
| 276 |
+
os.makedirs(save_directory, exist_ok=True)
|
| 277 |
+
try:
|
| 278 |
+
learner.export(
|
| 279 |
+
fname="model.pkl",
|
| 280 |
+
pickle_protocol=DEFAULT_PROTOCOL,
|
| 281 |
+
)
|
| 282 |
+
except PicklingError:
|
| 283 |
+
raise PicklingError(
|
| 284 |
+
"You are using a lambda function, i.e., an anonymous function. `pickle`"
|
| 285 |
+
" cannot pickle function objects and requires that all functions have"
|
| 286 |
+
" names. One possible solution is to name the function."
|
| 287 |
+
)
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
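For orientation, a sketch of the directory `_save_pretrained_fastai` leaves behind (the directory name is a hypothetical placeholder; `config.json` appears only when a config dict is passed):

```py
# _save_pretrained_fastai(learner, "my_model/")   # "my_model/" is a placeholder
# my_model/
# ├── README.md        # written from README_TEMPLATE, only if not already present
# ├── pyproject.toml   # written from PYPROJECT_TEMPLATE, only if not already present
# ├── config.json      # only if a `config` dict was passed (constants.CONFIG_NAME)
# └── model.pkl        # produced by learner.export(fname="model.pkl")
```
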
@validate_hf_hub_args
def from_pretrained_fastai(
    repo_id: str,
    revision: Optional[str] = None,
):
    """
    Load pretrained fastai model from the Hub or from a local directory.

    Args:
        repo_id (`str`):
            The location where the pickled fastai.Learner is. It can be either of the two:
                - Hosted on the Hugging Face Hub. E.g.: 'espejelomar/fastai-pet-breeds-classification' or 'distilgpt2'.
                  You can add a `revision` by appending `@` at the end of `repo_id`. E.g.: `dbmdz/bert-base-german-cased@main`.
                  Revision is the specific model version to use. Since we use a git-based system for storing models and other
                  artifacts on the Hugging Face Hub, it can be a branch name, a tag name, or a commit id.
                - Hosted locally. `repo_id` would be a directory containing the pickle and a pyproject.toml
                  indicating the fastai and fastcore versions used to build the `fastai.Learner`. E.g.: `./my_model_directory/`.
        revision (`str`, *optional*):
            Revision at which the repo's files are downloaded. See documentation of `snapshot_download`.

    Returns:
        The `fastai.Learner` model in the `repo_id` repo.
    """
    _check_fastai_fastcore_versions()

    # Load the `repo_id` repo.
    # `snapshot_download` returns the folder where the model was stored.
    # `cache_dir` will be the default '/root/.cache/huggingface/hub'
    if not os.path.isdir(repo_id):
        storage_folder = snapshot_download(
            repo_id=repo_id,
            revision=revision,
            library_name="fastai",
            library_version=get_fastai_version(),
        )
    else:
        storage_folder = repo_id

    _check_fastai_fastcore_pyproject_versions(storage_folder)

    from fastai.learner import load_learner  # type: ignore

    return load_learner(os.path.join(storage_folder, "model.pkl"))

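A minimal usage sketch (the repo id is the example used in this file's own docstrings, and the import assumes the package-root re-export `huggingface_hub` provides for these helpers):

```py
from huggingface_hub import from_pretrained_fastai

learner = from_pretrained_fastai("espejelomar/fastai-pet-breeds-classification")
# `learner` is a regular fastai.Learner, so inference works as usual, e.g.:
# prediction = learner.predict("path/to/image.jpg")
```
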
@validate_hf_hub_args
def push_to_hub_fastai(
    learner,
    *,
    repo_id: str,
    commit_message: str = "Push FastAI model using huggingface_hub.",
    private: Optional[bool] = None,
    token: Optional[str] = None,
    config: Optional[dict] = None,
    branch: Optional[str] = None,
    create_pr: Optional[bool] = None,
    allow_patterns: Optional[Union[List[str], str]] = None,
    ignore_patterns: Optional[Union[List[str], str]] = None,
    delete_patterns: Optional[Union[List[str], str]] = None,
    api_endpoint: Optional[str] = None,
):
    """
    Upload learner checkpoint files to the Hub.

    Use `allow_patterns` and `ignore_patterns` to precisely filter which files should be pushed to the hub. Use
    `delete_patterns` to delete existing remote files in the same commit. See [`upload_folder`] reference for more
    details.

    Args:
        learner (`Learner`):
            The `fastai.Learner` you'd like to push to the Hub.
        repo_id (`str`):
            The repository id for your model in Hub in the format of "namespace/repo_name". The namespace can be your individual account or an organization to which you have write access (for example, 'stanfordnlp/stanza-de').
        commit_message (`str`, *optional*):
            Message to commit while pushing. Defaults to `"Push FastAI model using huggingface_hub."`.
        private (`bool`, *optional*):
            Whether or not the repository created should be private.
            If `None` (default), the repo will be public unless the organization's default is private.
        token (`str`, *optional*):
            The Hugging Face account token to use as HTTP bearer authorization for remote files. If `None`, the token will be requested via a prompt.
        config (`dict`, *optional*):
            Configuration object to be saved alongside the model weights.
        branch (`str`, *optional*):
            The git branch on which to push the model. This defaults to
            the default branch as specified in your repository, which
            defaults to `"main"`.
        create_pr (`boolean`, *optional*):
            Whether or not to create a Pull Request from `branch` with that commit.
            Defaults to `False`.
        api_endpoint (`str`, *optional*):
            The API endpoint to use when pushing the model to the hub.
        allow_patterns (`List[str]` or `str`, *optional*):
            If provided, only files matching at least one pattern are pushed.
        ignore_patterns (`List[str]` or `str`, *optional*):
            If provided, files matching any of the patterns are not pushed.
        delete_patterns (`List[str]` or `str`, *optional*):
            If provided, remote files matching any of the patterns will be deleted from the repo.

    Returns:
        The url of the commit of your model in the given repository.

    > [!TIP]
    > Raises the following error:
    >
    > - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
    > if the user is not logged in to the Hugging Face Hub.
    """
    _check_fastai_fastcore_versions()
    api = HfApi(endpoint=api_endpoint)
    repo_id = api.create_repo(repo_id=repo_id, token=token, private=private, exist_ok=True).repo_id

    # Push the files to the repo in a single commit
    with SoftTemporaryDirectory() as tmp:
        saved_path = Path(tmp) / repo_id
        _save_pretrained_fastai(learner, saved_path, config=config)
        return api.upload_folder(
            repo_id=repo_id,
            token=token,
            folder_path=saved_path,
            commit_message=commit_message,
            revision=branch,
            create_pr=create_pr,
            allow_patterns=allow_patterns,
            ignore_patterns=ignore_patterns,
            delete_patterns=delete_patterns,
        )
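
End to end, pushing a learner looks roughly like this (repo id, learner name, and config are placeholders; `push_to_hub_fastai` creates the repo if needed and uploads everything in a single commit):

```py
from huggingface_hub import push_to_hub_fastai

# `learn` is an already-trained fastai.Learner (placeholder name)
commit_url = push_to_hub_fastai(
    learner=learn,
    repo_id="my-username/my-fastai-model",  # placeholder repo id
    config={"labels": ["cat", "dog"]},      # optional, saved as config.json
)
print(commit_url)
```
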
venv/lib/python3.13/site-packages/huggingface_hub/file_download.py
ADDED
@@ -0,0 +1,1813 @@
import copy
import errno
import inspect
import os
import re
import shutil
import stat
import time
import uuid
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Any, BinaryIO, Dict, Literal, NoReturn, Optional, Tuple, Union
from urllib.parse import quote, urlparse

import requests

from . import (
    __version__,  # noqa: F401 # for backward compatibility
    constants,
)
from ._local_folder import get_local_download_paths, read_download_metadata, write_download_metadata
from .constants import (
    HUGGINGFACE_CO_URL_TEMPLATE,  # noqa: F401 # for backward compatibility
    HUGGINGFACE_HUB_CACHE,  # noqa: F401 # for backward compatibility
)
from .errors import (
    EntryNotFoundError,
    FileMetadataError,
    GatedRepoError,
    HfHubHTTPError,
    LocalEntryNotFoundError,
    RepositoryNotFoundError,
    RevisionNotFoundError,
)
from .utils import (
    OfflineModeIsEnabled,
    SoftTemporaryDirectory,
    WeakFileLock,
    XetFileData,
    build_hf_headers,
    get_fastai_version,  # noqa: F401 # for backward compatibility
    get_fastcore_version,  # noqa: F401 # for backward compatibility
    get_graphviz_version,  # noqa: F401 # for backward compatibility
    get_jinja_version,  # noqa: F401 # for backward compatibility
    get_pydot_version,  # noqa: F401 # for backward compatibility
    get_tf_version,  # noqa: F401 # for backward compatibility
    get_torch_version,  # noqa: F401 # for backward compatibility
    hf_raise_for_status,
    is_fastai_available,  # noqa: F401 # for backward compatibility
    is_fastcore_available,  # noqa: F401 # for backward compatibility
    is_graphviz_available,  # noqa: F401 # for backward compatibility
    is_jinja_available,  # noqa: F401 # for backward compatibility
    is_pydot_available,  # noqa: F401 # for backward compatibility
    is_tf_available,  # noqa: F401 # for backward compatibility
    is_torch_available,  # noqa: F401 # for backward compatibility
    logging,
    parse_xet_file_data_from_response,
    refresh_xet_connection_info,
    reset_sessions,
    tqdm,
    validate_hf_hub_args,
)
from .utils._http import _adjust_range_header, http_backoff
from .utils._runtime import _PY_VERSION, is_xet_available  # noqa: F401 # for backward compatibility
from .utils._typing import HTTP_METHOD_T
from .utils.sha import sha_fileobj
from .utils.tqdm import _get_progress_bar_context


logger = logging.get_logger(__name__)

# Return value when trying to load a file from cache but the file does not exist in the distant repo.
_CACHED_NO_EXIST = object()
_CACHED_NO_EXIST_T = Any

# Regex to get filename from a "Content-Disposition" header for CDN-served files
HEADER_FILENAME_PATTERN = re.compile(r'filename="(?P<filename>.*?)";')

# Regex to check if the revision IS directly a commit_hash
REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")

# Regex to check if the file etag IS a valid sha256
REGEX_SHA256 = re.compile(r"^[0-9a-f]{64}$")

_are_symlinks_supported_in_dir: Dict[str, bool] = {}


def are_symlinks_supported(cache_dir: Union[str, Path, None] = None) -> bool:
    """Return whether symlinks are supported on the machine.

    Since symlink support can change depending on the mounted disk, we need to check
    on the precise cache folder. By default, the default HF cache directory is checked.

    Args:
        cache_dir (`str`, `Path`, *optional*):
            Path to the folder where cached files are stored.

    Returns: [bool] Whether symlinks are supported in the directory.
    """
    # Defaults to HF cache
    if cache_dir is None:
        cache_dir = constants.HF_HUB_CACHE
    cache_dir = str(Path(cache_dir).expanduser().resolve())  # make it unique

    # Check symlink compatibility only once (per cache directory) at first time use
    if cache_dir not in _are_symlinks_supported_in_dir:
        _are_symlinks_supported_in_dir[cache_dir] = True

        os.makedirs(cache_dir, exist_ok=True)
        with SoftTemporaryDirectory(dir=cache_dir) as tmpdir:
            src_path = Path(tmpdir) / "dummy_file_src"
            src_path.touch()
            dst_path = Path(tmpdir) / "dummy_file_dst"

            # Relative source path as in `_create_symlink`
            relative_src = os.path.relpath(src_path, start=os.path.dirname(dst_path))
            try:
                os.symlink(relative_src, dst_path)
            except OSError:
                # Likely running on Windows
                _are_symlinks_supported_in_dir[cache_dir] = False

                if not constants.HF_HUB_DISABLE_SYMLINKS_WARNING:
                    message = (
                        "`huggingface_hub` cache-system uses symlinks by default to"
                        " efficiently store duplicated files but your machine does not"
                        f" support them in {cache_dir}. Caching files will still work"
                        " but in a degraded version that might require more space on"
                        " your disk. This warning can be disabled by setting the"
                        " `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For"
                        " more details, see"
                        " https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations."
                    )
                    if os.name == "nt":
                        message += (
                            "\nTo support symlinks on Windows, you either need to"
                            " activate Developer Mode or to run Python as an"
                            " administrator. In order to activate developer mode,"
                            " see this article:"
                            " https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development"
                        )
                    warnings.warn(message)

    return _are_symlinks_supported_in_dir[cache_dir]

@dataclass(frozen=True)
class HfFileMetadata:
    """Data structure containing information about a file versioned on the Hub.

    Returned by [`get_hf_file_metadata`] based on a URL.

    Args:
        commit_hash (`str`, *optional*):
            The commit_hash related to the file.
        etag (`str`, *optional*):
            Etag of the file on the server.
        location (`str`):
            Location where to download the file. Can be a Hub url or not (CDN).
        size (`int`, *optional*):
            Size of the file. In case of an LFS file, contains the size of the actual
            LFS file, not the pointer.
        xet_file_data (`XetFileData`, *optional*):
            Xet information for the file. This is only set if the file is stored using Xet storage.
    """

    commit_hash: Optional[str]
    etag: Optional[str]
    location: str
    size: Optional[int]
    xet_file_data: Optional[XetFileData]


@validate_hf_hub_args
def hf_hub_url(
    repo_id: str,
    filename: str,
    *,
    subfolder: Optional[str] = None,
    repo_type: Optional[str] = None,
    revision: Optional[str] = None,
    endpoint: Optional[str] = None,
) -> str:
    """Construct the URL of a file from the given information.

    The resolved address can either be a huggingface.co-hosted url, or a link to
    Cloudfront (a Content Delivery Network, or CDN) for large files which are
    more than a few MBs.

    Args:
        repo_id (`str`):
            A namespace (user or an organization) name and a repo name separated
            by a `/`.
        filename (`str`):
            The name of the file in the repo.
        subfolder (`str`, *optional*):
            An optional value corresponding to a folder inside the repo.
        repo_type (`str`, *optional*):
            Set to `"dataset"` or `"space"` if downloading from a dataset or space,
            `None` or `"model"` if downloading from a model. Default is `None`.
        revision (`str`, *optional*):
            An optional Git revision id which can be a branch name, a tag, or a
            commit hash.

    Example:

    ```python
    >>> from huggingface_hub import hf_hub_url

    >>> hf_hub_url(
    ...     repo_id="julien-c/EsperBERTo-small", filename="pytorch_model.bin"
    ... )
    'https://huggingface.co/julien-c/EsperBERTo-small/resolve/main/pytorch_model.bin'
    ```

    > [!TIP]
    > Notes:
    >
    > Cloudfront is replicated over the globe so downloads are way faster for
    > the end user (and it also lowers our bandwidth costs).
    >
    > Cloudfront aggressively caches files by default (default TTL is 24
    > hours), however this is not an issue here because we implement a
    > git-based versioning system on huggingface.co, which means that we store
    > the files on S3/Cloudfront in a content-addressable way (i.e., the file
    > name is its hash). Using content-addressable filenames means cache can't
    > ever be stale.
    >
    > In terms of client-side caching from this library, we base our caching
    > on the objects' entity tag (`ETag`), which is an identifier of a
    > specific version of a resource [1]_. An object's ETag is: its git-sha1
    > if stored in git, or its sha256 if stored in git-lfs.

    References:

    - [1] https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/ETag
    """
    if subfolder == "":
        subfolder = None
    if subfolder is not None:
        filename = f"{subfolder}/{filename}"

    if repo_type not in constants.REPO_TYPES:
        raise ValueError("Invalid repo type")

    if repo_type in constants.REPO_TYPES_URL_PREFIXES:
        repo_id = constants.REPO_TYPES_URL_PREFIXES[repo_type] + repo_id

    if revision is None:
        revision = constants.DEFAULT_REVISION
    url = HUGGINGFACE_CO_URL_TEMPLATE.format(
        repo_id=repo_id, revision=quote(revision, safe=""), filename=quote(filename)
    )
    # Update endpoint if provided
    if endpoint is not None and url.startswith(constants.ENDPOINT):
        url = endpoint + url[len(constants.ENDPOINT) :]
    return url


def _request_wrapper(
    method: HTTP_METHOD_T, url: str, *, follow_relative_redirects: bool = False, **params
) -> requests.Response:
    """Wrapper around requests methods to follow relative redirects if `follow_relative_redirects=True` even when
    `allow_redirection=False`.

    A backoff mechanism retries the HTTP call on 5xx errors and network errors.

    Args:
        method (`str`):
            HTTP method, such as 'GET' or 'HEAD'.
        url (`str`):
            The URL of the resource to fetch.
        follow_relative_redirects (`bool`, *optional*, defaults to `False`):
            If True, relative redirection (redirection to the same site) will be resolved even when `allow_redirection`
            kwarg is set to False. Useful when we want to follow a redirection to a renamed repository without
            following redirection to a CDN.
        **params (`dict`, *optional*):
            Params to pass to `requests.request`.
    """
    # Recursively follow relative redirects
    if follow_relative_redirects:
        response = _request_wrapper(
            method=method,
            url=url,
            follow_relative_redirects=False,
            **params,
        )

        # If redirection, we redirect only relative paths.
        # This is useful in case of a renamed repository.
        if 300 <= response.status_code <= 399:
            parsed_target = urlparse(response.headers["Location"])
            if parsed_target.netloc == "":
                # This means it is a relative 'Location' header, as allowed by RFC 7231.
                # (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource')
                # We want to follow this relative redirect!
                #
                # Highly inspired by `resolve_redirects` from requests library.
                # See https://github.com/psf/requests/blob/main/requests/sessions.py#L159
                next_url = urlparse(url)._replace(path=parsed_target.path).geturl()
                return _request_wrapper(method=method, url=next_url, follow_relative_redirects=True, **params)
        return response

    # Perform request and return if status_code is not in the retry list.
    response = http_backoff(method=method, url=url, **params)
    hf_raise_for_status(response)
    return response

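The relative-redirect handling above boils down to swapping the path component of the original URL. The `urlparse` mechanics are plain stdlib and easy to check in isolation (the URLs are placeholders):

```py
>>> from urllib.parse import urlparse
>>> url = "https://huggingface.co/old-name/repo/resolve/main/model.bin"
>>> # A relative 'Location' header only carries a path, e.g. after a repo rename:
>>> target = urlparse("/new-name/repo/resolve/main/model.bin")
>>> target.netloc == ""   # no host -> treated as a relative redirect
True
>>> urlparse(url)._replace(path=target.path).geturl()
'https://huggingface.co/new-name/repo/resolve/main/model.bin'
```
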
def _get_file_length_from_http_response(response: requests.Response) -> Optional[int]:
    """
    Get the length of the file from the HTTP response headers.

    This function extracts the file size from the HTTP response headers, either from the
    `Content-Range` or `Content-Length` header, if available (in that order).

    Args:
        response (`requests.Response`):
            The HTTP response object.

    Returns:
        `int` or `None`: The length of the file in bytes, or None if not available.
    """

    # If the HTTP response contains a compressed body (e.g. gzip), the `Content-Length` header will
    # contain the length of the compressed body, not the uncompressed file size.
    # And at the start of transmission there's no way to know the uncompressed file size for gzip,
    # thus we return None in that case.
    content_encoding = response.headers.get("Content-Encoding", "identity").lower()
    if content_encoding != "identity":
        # gzip/br/deflate/zstd etc
        return None

    content_range = response.headers.get("Content-Range")
    if content_range is not None:
        return int(content_range.rsplit("/")[-1])

    content_length = response.headers.get("Content-Length")
    if content_length is not None:
        return int(content_length)

    return None

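Concretely, for a ranged response the total size sits after the slash in `Content-Range`, which is what the `rsplit("/")` above picks out (header value is an illustrative example):

```py
>>> "bytes 0-0/52428800".rsplit("/")[-1]
'52428800'
>>> int("bytes 0-0/52428800".rsplit("/")[-1])
52428800
```
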
def http_get(
    url: str,
    temp_file: BinaryIO,
    *,
    proxies: Optional[Dict] = None,
    resume_size: int = 0,
    headers: Optional[Dict[str, Any]] = None,
    expected_size: Optional[int] = None,
    displayed_filename: Optional[str] = None,
    _nb_retries: int = 5,
    _tqdm_bar: Optional[tqdm] = None,
) -> None:
    """
    Download a remote file. Does not gobble up errors, and will return errors tailored to the Hugging Face Hub.

    If a ConnectionError (SSLError) or ReadTimeout happens while streaming data from the server, it is most likely a
    transient error (network outage?). We log a warning message and try to resume the download a few times before
    giving up. The method gives up after 5 attempts if no new data has been received from the server.

    Args:
        url (`str`):
            The URL of the file to download.
        temp_file (`BinaryIO`):
            The file-like object where to save the file.
        proxies (`dict`, *optional*):
            Dictionary mapping protocol to the URL of the proxy passed to `requests.request`.
        resume_size (`int`, *optional*):
            The number of bytes already downloaded. If set to 0 (default), the whole file is downloaded. If set to a
            positive number, the download will resume at the given position.
        headers (`dict`, *optional*):
            Dictionary of HTTP Headers to send with the request.
        expected_size (`int`, *optional*):
            The expected size of the file to download. If set, the download will raise an error if the size of the
            received content is different from the expected one.
        displayed_filename (`str`, *optional*):
            The filename of the file that is being downloaded. Value is used only to display a nice progress bar. If
            not set, the filename is guessed from the URL or the `Content-Disposition` header.
    """
    if expected_size is not None and resume_size == expected_size:
        # If the file is already fully downloaded, we don't need to download it again.
        return

    has_custom_range_header = headers is not None and any(h.lower() == "range" for h in headers)
    hf_transfer = None
    if constants.HF_HUB_ENABLE_HF_TRANSFER:
        if resume_size != 0:
            warnings.warn("'hf_transfer' does not support `resume_size`: falling back to regular download method")
        elif proxies is not None:
            warnings.warn("'hf_transfer' does not support `proxies`: falling back to regular download method")
        elif has_custom_range_header:
            warnings.warn("'hf_transfer' ignores custom 'Range' headers; falling back to regular download method")
        else:
            try:
                import hf_transfer  # type: ignore[no-redef]
            except ImportError:
                raise ValueError(
                    "Fast download using 'hf_transfer' is enabled"
                    " (HF_HUB_ENABLE_HF_TRANSFER=1) but 'hf_transfer' package is not"
                    " available in your environment. Try `pip install hf_transfer`."
                )

    initial_headers = headers
    headers = copy.deepcopy(headers) or {}
    if resume_size > 0:
        headers["Range"] = _adjust_range_header(headers.get("Range"), resume_size)
    elif expected_size and expected_size > constants.MAX_HTTP_DOWNLOAD_SIZE:
        # Any files over 50GB will not be available through basic http request.
        # Setting the range header to 0-0 will force the server to return the file size in the Content-Range header.
        # Since hf_transfer splits the download into chunks, the process will succeed afterwards.
        if hf_transfer:
            headers["Range"] = "bytes=0-0"
        else:
            raise ValueError(
                "The file is too large to be downloaded using the regular download method. Use `hf_transfer` or `hf_xet` instead."
                " Try `pip install hf_transfer` or `pip install hf_xet`."
            )

    r = _request_wrapper(
        method="GET", url=url, stream=True, proxies=proxies, headers=headers, timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT
    )

    hf_raise_for_status(r)
    total: Optional[int] = _get_file_length_from_http_response(r)

    if displayed_filename is None:
        displayed_filename = url
        content_disposition = r.headers.get("Content-Disposition")
        if content_disposition is not None:
            match = HEADER_FILENAME_PATTERN.search(content_disposition)
            if match is not None:
                # Means file is on CDN
                displayed_filename = match.groupdict()["filename"]

    # Truncate filename if too long to display
    if len(displayed_filename) > 40:
        displayed_filename = f"(…){displayed_filename[-40:]}"

    consistency_error_message = (
        f"Consistency check failed: file should be of size {expected_size} but has size"
        f" {{actual_size}} ({displayed_filename}).\nThis is usually due to network issues while downloading the file."
        " Please retry with `force_download=True`."
    )
    progress_cm = _get_progress_bar_context(
        desc=displayed_filename,
        log_level=logger.getEffectiveLevel(),
        total=total,
        initial=resume_size,
        name="huggingface_hub.http_get",
        _tqdm_bar=_tqdm_bar,
    )

    with progress_cm as progress:
        if hf_transfer and total is not None and total > 5 * constants.DOWNLOAD_CHUNK_SIZE:
            supports_callback = "callback" in inspect.signature(hf_transfer.download).parameters
            if not supports_callback:
                warnings.warn(
                    "You are using an outdated version of `hf_transfer`. "
                    "Consider upgrading to latest version to enable progress bars "
                    "using `pip install -U hf_transfer`."
                )
            try:
                hf_transfer.download(
                    url=url,
                    filename=temp_file.name,
                    max_files=constants.HF_TRANSFER_CONCURRENCY,
                    chunk_size=constants.DOWNLOAD_CHUNK_SIZE,
                    headers=initial_headers,
                    parallel_failures=3,
                    max_retries=5,
                    **({"callback": progress.update} if supports_callback else {}),
                )
            except Exception as e:
                raise RuntimeError(
                    "An error occurred while downloading using `hf_transfer`. Consider"
                    " disabling HF_HUB_ENABLE_HF_TRANSFER for better error handling."
                ) from e
            if not supports_callback:
                progress.update(total)
            if expected_size is not None and expected_size != os.path.getsize(temp_file.name):
                raise EnvironmentError(
                    consistency_error_message.format(
                        actual_size=os.path.getsize(temp_file.name),
                    )
                )
            return
        new_resume_size = resume_size
        try:
            for chunk in r.iter_content(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):
                if chunk:  # filter out keep-alive new chunks
                    progress.update(len(chunk))
                    temp_file.write(chunk)
                    new_resume_size += len(chunk)
                    # Some data has been downloaded from the server so we reset the number of retries.
                    _nb_retries = 5
        except (requests.ConnectionError, requests.ReadTimeout) as e:
            # If a ConnectionError (SSLError) or ReadTimeout happens while streaming data from the server, it is most
            # likely a transient error (network outage?). We log a warning message and try to resume the download a
            # few times before giving up. The retry mechanism is basic but should be enough in most cases.
            if _nb_retries <= 0:
                logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e))
                raise
            logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e))
            time.sleep(1)
            reset_sessions()  # In case of SSLError it's best to reset the shared requests.Session objects
            return http_get(
                url=url,
                temp_file=temp_file,
                proxies=proxies,
                resume_size=new_resume_size,
                headers=initial_headers,
                expected_size=expected_size,
                _nb_retries=_nb_retries - 1,
                _tqdm_bar=_tqdm_bar,
            )

    if expected_size is not None and expected_size != temp_file.tell():
        raise EnvironmentError(
            consistency_error_message.format(
                actual_size=temp_file.tell(),
            )
        )

def xet_get(
|
| 530 |
+
*,
|
| 531 |
+
incomplete_path: Path,
|
| 532 |
+
xet_file_data: XetFileData,
|
| 533 |
+
headers: Dict[str, str],
|
| 534 |
+
expected_size: Optional[int] = None,
|
| 535 |
+
displayed_filename: Optional[str] = None,
|
| 536 |
+
_tqdm_bar: Optional[tqdm] = None,
|
| 537 |
+
) -> None:
|
| 538 |
+
"""
|
| 539 |
+
Download a file using Xet storage service.
|
| 540 |
+
|
| 541 |
+
Args:
|
| 542 |
+
incomplete_path (`Path`):
|
| 543 |
+
The path to the file to download.
|
| 544 |
+
xet_file_data (`XetFileData`):
|
| 545 |
+
The file metadata needed to make the request to the xet storage service.
|
| 546 |
+
headers (`Dict[str, str]`):
|
| 547 |
+
The headers to send to the xet storage service.
|
| 548 |
+
expected_size (`int`, *optional*):
|
| 549 |
+
The expected size of the file to download. If set, the download will raise an error if the size of the
|
| 550 |
+
received content is different from the expected one.
|
| 551 |
+
displayed_filename (`str`, *optional*):
|
| 552 |
+
The filename of the file that is being downloaded. Value is used only to display a nice progress bar. If
|
| 553 |
+
not set, the filename is guessed from the URL or the `Content-Disposition` header.
|
| 554 |
+
|
| 555 |
+
**How it works:**
|
| 556 |
+
The file download system uses Xet storage, which is a content-addressable storage system that breaks files into chunks
|
| 557 |
+
for efficient storage and transfer.
|
| 558 |
+
|
| 559 |
+
`hf_xet.download_files` manages downloading files by:
|
| 560 |
+
- Taking a list of files to download (each with its unique content hash)
|
| 561 |
+
- Connecting to a storage server (CAS server) that knows how files are chunked
|
| 562 |
+
- Using authentication to ensure secure access
|
| 563 |
+
- Providing progress updates during download
|
| 564 |
+
|
| 565 |
+
Authentication works by regularly refreshing access tokens through `refresh_xet_connection_info` to maintain a valid
|
| 566 |
+
connection to the storage server.
|
| 567 |
+
|
| 568 |
+
The download process works like this:
|
| 569 |
+
1. Create a local cache folder at `~/.cache/huggingface/xet/chunk-cache` to store reusable file chunks
|
| 570 |
+
2. Download files in parallel:
|
| 571 |
+
2.1. Prepare to write the file to disk
|
| 572 |
+
2.2. Ask the server "how is this file split into chunks?" using the file's unique hash
|
| 573 |
+
The server responds with:
|
| 574 |
+
- Which chunks make up the complete file
|
| 575 |
+
- Where each chunk can be downloaded from
|
| 576 |
+
2.3. For each needed chunk:
|
| 577 |
+
- Checks if we already have it in our local cache
|
| 578 |
+
- If not, download it from cloud storage (S3)
|
| 579 |
+
- Save it to cache for future use
|
| 580 |
+
- Assemble the chunks in order to recreate the original file
|
| 581 |
+
|
| 582 |
+
"""
|
| 583 |
+
try:
|
| 584 |
+
from hf_xet import PyXetDownloadInfo, download_files # type: ignore[no-redef]
|
| 585 |
+
except ImportError:
|
| 586 |
+
raise ValueError(
|
| 587 |
+
"To use optimized download using Xet storage, you need to install the hf_xet package. "
|
| 588 |
+
'Try `pip install "huggingface_hub[hf_xet]"` or `pip install hf_xet`.'
|
| 589 |
+
)
|
| 590 |
+
|
| 591 |
+
connection_info = refresh_xet_connection_info(file_data=xet_file_data, headers=headers)
|
| 592 |
+
|
| 593 |
+
def token_refresher() -> Tuple[str, int]:
|
| 594 |
+
connection_info = refresh_xet_connection_info(file_data=xet_file_data, headers=headers)
|
| 595 |
+
if connection_info is None:
|
| 596 |
+
raise ValueError("Failed to refresh token using xet metadata.")
|
| 597 |
+
return connection_info.access_token, connection_info.expiration_unix_epoch
|
| 598 |
+
|
| 599 |
+
xet_download_info = [
|
| 600 |
+
PyXetDownloadInfo(
|
| 601 |
+
destination_path=str(incomplete_path.absolute()), hash=xet_file_data.file_hash, file_size=expected_size
|
| 602 |
+
)
|
| 603 |
+
]
|
| 604 |
+
|
| 605 |
+
if not displayed_filename:
|
| 606 |
+
displayed_filename = incomplete_path.name
|
| 607 |
+
|
| 608 |
+
# Truncate filename if too long to display
|
| 609 |
+
if len(displayed_filename) > 40:
|
| 610 |
+
displayed_filename = f"{displayed_filename[:40]}(…)"
|
| 611 |
+
|
| 612 |
+
progress_cm = _get_progress_bar_context(
|
| 613 |
+
desc=displayed_filename,
|
| 614 |
+
log_level=logger.getEffectiveLevel(),
|
| 615 |
+
total=expected_size,
|
| 616 |
+
initial=0,
|
| 617 |
+
name="huggingface_hub.xet_get",
|
| 618 |
+
_tqdm_bar=_tqdm_bar,
|
| 619 |
+
)
|
| 620 |
+
|
| 621 |
+
with progress_cm as progress:
|
| 622 |
+
|
| 623 |
+
def progress_updater(progress_bytes: float):
|
| 624 |
+
progress.update(progress_bytes)
|
| 625 |
+
|
| 626 |
+
download_files(
|
| 627 |
+
xet_download_info,
|
| 628 |
+
endpoint=connection_info.endpoint,
|
| 629 |
+
token_info=(connection_info.access_token, connection_info.expiration_unix_epoch),
|
| 630 |
+
token_refresher=token_refresher,
|
| 631 |
+
progress_updater=[progress_updater],
|
| 632 |
+
)
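
# Illustrative sketch (not part of the library): how a caller is expected to wire up `xet_get`.
# The values below are hypothetical; in practice `xet_file_data` comes from
# `get_hf_file_metadata(...).xet_file_data` and `headers` from `build_hf_headers(...)`.
#
#     metadata = get_hf_file_metadata(hf_hub_url("user/repo", "model.safetensors"))
#     if metadata.xet_file_data is not None:
#         xet_get(
#             incomplete_path=Path("model.safetensors.incomplete"),
#             xet_file_data=metadata.xet_file_data,
#             headers=build_hf_headers(),
#             expected_size=metadata.size,
#         )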


def _normalize_etag(etag: Optional[str]) -> Optional[str]:
    """Normalize ETag HTTP header, so it can be used to create nice filepaths.

    The HTTP spec allows two forms of ETag:
      ETag: W/"<etag_value>"
      ETag: "<etag_value>"

    For now, we only expect the second form from the server, but we want to be future-proof so we support both. For
    more context, see `TestNormalizeEtag` tests and https://github.com/huggingface/huggingface_hub/pull/1428.

    Args:
        etag (`str`, *optional*): HTTP header

    Returns:
        `str` or `None`: string that can be used as a nice directory name.
        Returns `None` if input is None.
    """
    if etag is None:
        return None
    return etag.lstrip("W/").strip('"')
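
# Doctest-style examples (illustrative, based on the two ETag forms named in the docstring):
#     _normalize_etag('"27dd68xyz"')    -> "27dd68xyz"
#     _normalize_etag('W/"27dd68xyz"')  -> "27dd68xyz"
#     _normalize_etag(None)             -> None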


def _create_relative_symlink(src: str, dst: str, new_blob: bool = False) -> None:
    """Alias method used in the `transformers` conversion script."""
    return _create_symlink(src=src, dst=dst, new_blob=new_blob)


def _create_symlink(src: str, dst: str, new_blob: bool = False) -> None:
    """Create a symbolic link named dst pointing to src.

    By default, it will try to create a symlink using a relative path. Relative paths have 2 advantages:
    - If the cache_folder is moved (example: back-up on a shared drive), relative paths within the cache folder will
      not break.
    - Relative paths seem to be better handled on Windows. The issue was reported 3 times in less than a week when
      changing from relative to absolute paths. See https://github.com/huggingface/huggingface_hub/issues/1398,
      https://github.com/huggingface/diffusers/issues/2729 and https://github.com/huggingface/transformers/pull/22228.
      NOTE: The issue with absolute paths doesn't happen in admin mode.
    When creating a symlink from the cache to a local folder, it is possible that a relative path cannot be created.
    This happens when paths are not on the same volume. In that case, we use absolute paths.

    The resulting layout looks something like
        └── [ 128]  snapshots
            ├── [ 128]  2439f60ef33a0d46d85da5001d52aeda5b00ce9f
            │   ├── [  52]  README.md -> ../../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812
            │   └── [  76]  pytorch_model.bin -> ../../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd

    If symlinks cannot be created on this platform (most likely to be Windows), the workaround is to avoid symlinks by
    having the actual file in `dst`. If it is a new file (`new_blob=True`), we move it to `dst`. If it is not a new
    file (`new_blob=False`), we don't know if the blob file is already referenced elsewhere. To avoid breaking the
    existing cache, the file is duplicated on the disk.

    In case symlinks are not supported, a warning message is displayed to the user once when loading
    `huggingface_hub`. The warning message can be disabled with the `DISABLE_SYMLINKS_WARNING` environment variable.
    """
    try:
        os.remove(dst)
    except OSError:
        pass

    abs_src = os.path.abspath(os.path.expanduser(src))
    abs_dst = os.path.abspath(os.path.expanduser(dst))
    abs_dst_folder = os.path.dirname(abs_dst)

    # Use relative_src in priority
    try:
        relative_src = os.path.relpath(abs_src, abs_dst_folder)
    except ValueError:
        # Raised on Windows if src and dst are not on the same volume. This is the case when creating a symlink to a
        # local_dir instead of within the cache directory.
        # See https://docs.python.org/3/library/os.path.html#os.path.relpath
        relative_src = None

    try:
        commonpath = os.path.commonpath([abs_src, abs_dst])
        _support_symlinks = are_symlinks_supported(commonpath)
    except ValueError:
        # Raised if src and dst are not on the same volume. Symlinks will still work on Linux/MacOS.
        # See https://docs.python.org/3/library/os.path.html#os.path.commonpath
        _support_symlinks = os.name != "nt"
    except PermissionError:
        # Permission error means src and dst are not in the same volume (e.g. destination path has been provided
        # by the user via `local_dir`. Let's test symlink support there)
        _support_symlinks = are_symlinks_supported(abs_dst_folder)
    except OSError as e:
        # OS error (errno=30) means that the commonpath is read-only on Linux/MacOS.
        if e.errno == errno.EROFS:
            _support_symlinks = are_symlinks_supported(abs_dst_folder)
        else:
            raise

    # Symlinks are supported => let's create a symlink.
    if _support_symlinks:
        src_rel_or_abs = relative_src or abs_src
        logger.debug(f"Creating pointer from {src_rel_or_abs} to {abs_dst}")
        try:
            os.symlink(src_rel_or_abs, abs_dst)
            return
        except FileExistsError:
            if os.path.islink(abs_dst) and os.path.realpath(abs_dst) == os.path.realpath(abs_src):
                # `abs_dst` already exists and is a symlink to the `abs_src` blob. It is most likely that the file has
                # been cached twice concurrently (exactly between `os.remove` and `os.symlink`). Do nothing.
                return
            else:
                # Very unlikely to happen. Means a file `dst` has been created exactly between `os.remove` and
                # `os.symlink` and is not a symlink to the `abs_src` blob file. Raise exception.
                raise
        except PermissionError:
            # Permission error means src and dst are not in the same volume (e.g. download to local dir) and symlink
            # is supported on both volumes but not between them. Let's just make a hard copy in that case.
            pass

    # Symlinks are not supported => let's move or copy the file.
    if new_blob:
        logger.info(f"Symlink not supported. Moving file from {abs_src} to {abs_dst}")
        shutil.move(abs_src, abs_dst, copy_function=_copy_no_matter_what)
    else:
        logger.info(f"Symlink not supported. Copying file from {abs_src} to {abs_dst}")
        shutil.copyfile(abs_src, abs_dst)
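
# Illustrative note (hypothetical paths): the relative pointer is just `os.path.relpath`, e.g.
#     os.path.relpath("/cache/blobs/d7edf6bd", "/cache/snapshots/2439f60e")
#     -> "../../blobs/d7edf6bd"
# which is the kind of target shown in the snapshot layout in the docstring above.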


def _cache_commit_hash_for_specific_revision(storage_folder: str, revision: str, commit_hash: str) -> None:
    """Cache reference between a revision (tag, branch or truncated commit hash) and the corresponding commit hash.

    Does nothing if `revision` is already a proper `commit_hash` or if the reference is already cached.
    """
    if revision != commit_hash:
        ref_path = Path(storage_folder) / "refs" / revision
        ref_path.parent.mkdir(parents=True, exist_ok=True)
        if not ref_path.exists() or commit_hash != ref_path.read_text():
            # Update the ref only if it has changed. Writing unconditionally could cause a useless error in case the
            # repo is already cached and the user doesn't have write access to the cache folder.
            # See https://github.com/huggingface/huggingface_hub/issues/1216.
            ref_path.write_text(commit_hash)
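
# Illustrative sketch (hypothetical paths): after downloading revision "main" that resolves to
# commit 2439f60e..., the cache contains a plain-text ref file:
#
#     models--julien-c--EsperBERTo-small/refs/main
#       -> contains "2439f60ef33a0d46d85da5001d52aeda5b00ce9f"
#
# so that later lookups can resolve "main" without a network call.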


@validate_hf_hub_args
def repo_folder_name(*, repo_id: str, repo_type: str) -> str:
    """Return a serialized version of a hf.co repo name and type, safe for disk storage
    as a single non-nested folder.

    Example: models--julien-c--EsperBERTo-small
    """
    # remove all `/` occurrences to correctly convert repo to directory name
    parts = [f"{repo_type}s", *repo_id.split("/")]
    return constants.REPO_ID_SEPARATOR.join(parts)
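
# Doctest-style example (taken from the docstring above; assumes REPO_ID_SEPARATOR is "--"):
#     repo_folder_name(repo_id="julien-c/EsperBERTo-small", repo_type="model")
#     -> "models--julien-c--EsperBERTo-small"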


def _check_disk_space(expected_size: int, target_dir: Union[str, Path]) -> None:
    """Check disk usage and log a warning if there is not enough disk space to download the file.

    Args:
        expected_size (`int`):
            The expected size of the file in bytes.
        target_dir (`str`):
            The directory where the file will be stored after downloading.
    """

    target_dir = Path(target_dir)  # format as `Path`
    for path in [target_dir] + list(target_dir.parents):  # first check target_dir, then each parent one by one
        try:
            target_dir_free = shutil.disk_usage(path).free
            if target_dir_free < expected_size:
                warnings.warn(
                    "Not enough free disk space to download the file. "
                    f"The expected file size is: {expected_size / 1e6:.2f} MB. "
                    f"The target location {target_dir} only has {target_dir_free / 1e6:.2f} MB free disk space."
                )
            return
        except OSError:  # skip on any error: the path may not exist yet or disk usage cannot be checked
            pass
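
# Illustrative usage (hypothetical values): warn if ~5 GB would not fit at the target location.
#     _check_disk_space(expected_size=5_000_000_000, target_dir="/data/hf-cache")
# Walking up `target_dir.parents` matters because the target folder itself may not exist yet.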


@validate_hf_hub_args
def hf_hub_download(
    repo_id: str,
    filename: str,
    *,
    subfolder: Optional[str] = None,
    repo_type: Optional[str] = None,
    revision: Optional[str] = None,
    library_name: Optional[str] = None,
    library_version: Optional[str] = None,
    cache_dir: Union[str, Path, None] = None,
    local_dir: Union[str, Path, None] = None,
    user_agent: Union[Dict, str, None] = None,
    force_download: bool = False,
    proxies: Optional[Dict] = None,
    etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
    token: Union[bool, str, None] = None,
    local_files_only: bool = False,
    headers: Optional[Dict[str, str]] = None,
    endpoint: Optional[str] = None,
    resume_download: Optional[bool] = None,
    force_filename: Optional[str] = None,
    local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
) -> str:
    """Download a given file if it's not already present in the local cache.

    The new cache file layout looks like this:
    - The cache directory contains one subfolder per repo_id (namespaced by repo type)
    - inside each repo folder:
        - refs is a list of the latest known revision => commit_hash pairs
        - blobs contains the actual file blobs (identified by their git-sha or sha256, depending on
          whether they're LFS files or not)
        - snapshots contains one subfolder per commit, each "commit" contains the subset of the files
          that have been resolved at that particular commit. Each filename is a symlink to the blob
          at that particular commit.

    ```
    [  96]  .
    └── [ 160]  models--julien-c--EsperBERTo-small
        ├── [ 160]  blobs
        │   ├── [321M]  403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
        │   ├── [ 398]  7cb18dc9bafbfcf74629a4b760af1b160957a83e
        │   └── [1.4K]  d7edf6bd2a681fb0175f7735299831ee1b22b812
        ├── [  96]  refs
        │   └── [  40]  main
        └── [ 128]  snapshots
            ├── [ 128]  2439f60ef33a0d46d85da5001d52aeda5b00ce9f
            │   ├── [  52]  README.md -> ../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812
            │   └── [  76]  pytorch_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
            └── [ 128]  bbc77c8132af1cc5cf678da3f1ddf2de43606d48
                ├── [  52]  README.md -> ../../blobs/7cb18dc9bafbfcf74629a4b760af1b160957a83e
                └── [  76]  pytorch_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
    ```

    If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this
    option, the `cache_dir` will not be used and a `.cache/huggingface/` folder will be created at the root of
    `local_dir` to store some metadata related to the downloaded files. While this mechanism is not as robust as the
    main cache-system, it's optimized for regularly pulling the latest version of a repository.

    Args:
        repo_id (`str`):
            A user or an organization name and a repo name separated by a `/`.
        filename (`str`):
            The name of the file in the repo.
        subfolder (`str`, *optional*):
            An optional value corresponding to a folder inside the model repo.
        repo_type (`str`, *optional*):
            Set to `"dataset"` or `"space"` if downloading from a dataset or space,
            `None` or `"model"` if downloading from a model. Default is `None`.
        revision (`str`, *optional*):
            An optional Git revision id which can be a branch name, a tag, or a
            commit hash.
        library_name (`str`, *optional*):
            The name of the library to which the object corresponds.
        library_version (`str`, *optional*):
            The version of the library.
        cache_dir (`str`, `Path`, *optional*):
            Path to the folder where cached files are stored.
        local_dir (`str` or `Path`, *optional*):
            If provided, the downloaded file will be placed under this directory.
        user_agent (`dict`, `str`, *optional*):
            The user-agent info in the form of a dictionary or a string.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether the file should be downloaded even if it already exists in
            the local cache.
        proxies (`dict`, *optional*):
            Dictionary mapping protocol to the URL of the proxy passed to
            `requests.request`.
        etag_timeout (`float`, *optional*, defaults to `10`):
            When fetching ETag, how many seconds to wait for the server to send
            data before giving up, which is passed to `requests.request`.
        token (`str`, `bool`, *optional*):
            A token to be used for the download.
            - If `True`, the token is read from the HuggingFace config
              folder.
            - If a string, it's used as the authentication token.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, avoid downloading the file and return the path to the
            local cached file if it exists.
        headers (`dict`, *optional*):
            Additional headers to be sent with the request.

    Returns:
        `str`: Local path of file or if networking is off, last version of file cached on disk.

    Raises:
        [`~utils.RepositoryNotFoundError`]
            If the repository to download from cannot be found. This may be because it doesn't exist,
            or because it is set to `private` and you do not have access.
        [`~utils.RevisionNotFoundError`]
            If the revision to download from cannot be found.
        [`~utils.EntryNotFoundError`]
            If the file to download cannot be found.
        [`~utils.LocalEntryNotFoundError`]
            If network is disabled or unavailable and file is not found in cache.
        [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
            If `token=True` but the token cannot be found.
        [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError)
            If ETag cannot be determined.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If some parameter value is invalid.
    """
    if constants.HF_HUB_ETAG_TIMEOUT != constants.DEFAULT_ETAG_TIMEOUT:
        # Respect environment variable above user value
        etag_timeout = constants.HF_HUB_ETAG_TIMEOUT

    if force_filename is not None:
        warnings.warn(
            "The `force_filename` parameter is deprecated as a new caching system, "
            "which keeps the filenames as they are on the Hub, is now in place.",
            FutureWarning,
        )
    if resume_download is not None:
        warnings.warn(
            "`resume_download` is deprecated and will be removed in version 1.0.0. "
            "Downloads always resume when possible. "
            "If you want to force a new download, use `force_download=True`.",
            FutureWarning,
        )

    if cache_dir is None:
        cache_dir = constants.HF_HUB_CACHE
    if revision is None:
        revision = constants.DEFAULT_REVISION
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)
    if isinstance(local_dir, Path):
        local_dir = str(local_dir)

    if subfolder == "":
        subfolder = None
    if subfolder is not None:
        # This is used to create a URL, and not a local path, hence the forward slash.
        filename = f"{subfolder}/{filename}"

    if repo_type is None:
        repo_type = "model"
    if repo_type not in constants.REPO_TYPES:
        raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(constants.REPO_TYPES)}")

    hf_headers = build_hf_headers(
        token=token,
        library_name=library_name,
        library_version=library_version,
        user_agent=user_agent,
        headers=headers,
    )

    if local_dir is not None:
        if local_dir_use_symlinks != "auto":
            warnings.warn(
                "`local_dir_use_symlinks` parameter is deprecated and will be ignored. "
                "The process to download files to a local folder has been updated and does "
                "not rely on symlinks anymore. You only need to pass a destination folder "
                "as `local_dir`.\n"
                "For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder."
            )

        return _hf_hub_download_to_local_dir(
            # Destination
            local_dir=local_dir,
            # File info
            repo_id=repo_id,
            repo_type=repo_type,
            filename=filename,
            revision=revision,
            # HTTP info
            endpoint=endpoint,
            etag_timeout=etag_timeout,
            headers=hf_headers,
            proxies=proxies,
            token=token,
            # Additional options
            cache_dir=cache_dir,
            force_download=force_download,
            local_files_only=local_files_only,
        )
    else:
        return _hf_hub_download_to_cache_dir(
            # Destination
            cache_dir=cache_dir,
            # File info
            repo_id=repo_id,
            filename=filename,
            repo_type=repo_type,
            revision=revision,
            # HTTP info
            endpoint=endpoint,
            etag_timeout=etag_timeout,
            headers=hf_headers,
            proxies=proxies,
            token=token,
            # Additional options
            local_files_only=local_files_only,
            force_download=force_download,
        )
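
# Illustrative usage (repo taken from the docstring example above):
#
#     from huggingface_hub import hf_hub_download
#
#     # Cached download: returns a path inside the snapshots/ layout shown above.
#     path = hf_hub_download(repo_id="julien-c/EsperBERTo-small", filename="pytorch_model.bin")
#
#     # Download to a plain local folder instead of the cache (no symlinks involved).
#     path = hf_hub_download(
#         repo_id="julien-c/EsperBERTo-small", filename="pytorch_model.bin", local_dir="./model"
#     )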


def _hf_hub_download_to_cache_dir(
    *,
    # Destination
    cache_dir: str,
    # File info
    repo_id: str,
    filename: str,
    repo_type: str,
    revision: str,
    # HTTP info
    endpoint: Optional[str],
    etag_timeout: float,
    headers: Dict[str, str],
    proxies: Optional[Dict],
    token: Optional[Union[bool, str]],
    # Additional options
    local_files_only: bool,
    force_download: bool,
) -> str:
    """Download a given file to a cache folder, if not already present.

    Method should not be called directly. Please use `hf_hub_download` instead.
    """
    locks_dir = os.path.join(cache_dir, ".locks")
    storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))

    # cross platform transcription of filename, to be used as a local file path.
    relative_filename = os.path.join(*filename.split("/"))
    if os.name == "nt":
        if relative_filename.startswith("..\\") or "\\..\\" in relative_filename:
            raise ValueError(
                f"Invalid filename: cannot handle filename '{relative_filename}' on Windows. Please ask the repository"
                " owner to rename this file."
            )

    # if user provides a commit_hash and they already have the file on disk, shortcut everything.
    if REGEX_COMMIT_HASH.match(revision):
        pointer_path = _get_pointer_path(storage_folder, revision, relative_filename)
        if os.path.exists(pointer_path) and not force_download:
            return pointer_path

    # Try to get metadata (etag, commit_hash, url, size) from the server.
    # If we can't, the HEAD call error is returned.
    (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_error(
        repo_id=repo_id,
        filename=filename,
        repo_type=repo_type,
        revision=revision,
        endpoint=endpoint,
        proxies=proxies,
        etag_timeout=etag_timeout,
        headers=headers,
        token=token,
        local_files_only=local_files_only,
        storage_folder=storage_folder,
        relative_filename=relative_filename,
    )

    # etag can be None for several reasons:
    # 1. we passed local_files_only.
    # 2. we don't have a connection
    # 3. Hub is down (HTTP 500, 503, 504)
    # 4. repo is not found (for example private or gated) and an invalid/missing token was sent
    # 5. Hub is blocked by a firewall or proxy is not set correctly.
    # => Try to get the last downloaded one from the specified revision.
    #
    # If the specified revision is a commit hash, look inside "snapshots".
    # If the specified revision is a branch or tag, look inside "refs".
    if head_call_error is not None:
        # Couldn't make a HEAD call => let's try to find a local file
        if not force_download:
            commit_hash = None
            if REGEX_COMMIT_HASH.match(revision):
                commit_hash = revision
            else:
                ref_path = os.path.join(storage_folder, "refs", revision)
                if os.path.isfile(ref_path):
                    with open(ref_path) as f:
                        commit_hash = f.read()

            # Return pointer file if exists
            if commit_hash is not None:
                pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
                if os.path.exists(pointer_path) and not force_download:
                    return pointer_path

        # Otherwise, raise appropriate error
        _raise_on_head_call_error(head_call_error, force_download, local_files_only)

    # From now on, etag, commit_hash, url and size are not None.
    assert etag is not None, "etag must have been retrieved from server"
    assert commit_hash is not None, "commit_hash must have been retrieved from server"
    assert url_to_download is not None, "file location must have been retrieved from server"
    assert expected_size is not None, "expected_size must have been retrieved from server"
    blob_path = os.path.join(storage_folder, "blobs", etag)
    pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)

    os.makedirs(os.path.dirname(blob_path), exist_ok=True)
    os.makedirs(os.path.dirname(pointer_path), exist_ok=True)

    # if passed revision is not identical to commit_hash
    # then revision has to be a branch name or tag name.
    # In that case store a ref.
    _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)

    # Prevent parallel downloads of the same file with a lock.
    # etag could be duplicated across repos,
    lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock")

    # Some Windows versions do not allow for paths longer than 255 characters.
    # In this case, we must specify it as an extended path by using the "\\?\" prefix.
    if (
        os.name == "nt"
        and len(os.path.abspath(lock_path)) > 255
        and not os.path.abspath(lock_path).startswith("\\\\?\\")
    ):
        lock_path = "\\\\?\\" + os.path.abspath(lock_path)

    if (
        os.name == "nt"
        and len(os.path.abspath(blob_path)) > 255
        and not os.path.abspath(blob_path).startswith("\\\\?\\")
    ):
        blob_path = "\\\\?\\" + os.path.abspath(blob_path)

    Path(lock_path).parent.mkdir(parents=True, exist_ok=True)

    # pointer already exists -> immediate return
    if not force_download and os.path.exists(pointer_path):
        return pointer_path

    # Blob exists but pointer must be (safely) created -> take the lock
    if not force_download and os.path.exists(blob_path):
        with WeakFileLock(lock_path):
            if not os.path.exists(pointer_path):
                _create_symlink(blob_path, pointer_path, new_blob=False)
            return pointer_path

    # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)

    with WeakFileLock(lock_path):
        _download_to_tmp_and_move(
            incomplete_path=Path(blob_path + ".incomplete"),
            destination_path=Path(blob_path),
            url_to_download=url_to_download,
            proxies=proxies,
            headers=headers,
            expected_size=expected_size,
            filename=filename,
            force_download=force_download,
            etag=etag,
            xet_file_data=xet_file_data,
        )
        if not os.path.exists(pointer_path):
            _create_symlink(blob_path, pointer_path, new_blob=True)

    return pointer_path


def _hf_hub_download_to_local_dir(
    *,
    # Destination
    local_dir: Union[str, Path],
    # File info
    repo_id: str,
    repo_type: str,
    filename: str,
    revision: str,
    # HTTP info
    endpoint: Optional[str],
    etag_timeout: float,
    headers: Dict[str, str],
    proxies: Optional[Dict],
    token: Union[bool, str, None],
    # Additional options
    cache_dir: str,
    force_download: bool,
    local_files_only: bool,
) -> str:
    """Download a given file to a local folder, if not already present.

    Method should not be called directly. Please use `hf_hub_download` instead.
    """
    # Some Windows versions do not allow for paths longer than 255 characters.
    # In this case, we must specify it as an extended path by using the "\\?\" prefix.
    if os.name == "nt" and len(os.path.abspath(local_dir)) > 255:
        local_dir = "\\\\?\\" + os.path.abspath(local_dir)
    local_dir = Path(local_dir)
    paths = get_local_download_paths(local_dir=local_dir, filename=filename)
    local_metadata = read_download_metadata(local_dir=local_dir, filename=filename)

    # Local file exists + metadata exists + commit_hash matches => return file
    if (
        not force_download
        and REGEX_COMMIT_HASH.match(revision)
        and paths.file_path.is_file()
        and local_metadata is not None
        and local_metadata.commit_hash == revision
    ):
        return str(paths.file_path)

    # Local file doesn't exist or commit_hash doesn't match => we need the etag
    (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_error(
        repo_id=repo_id,
        filename=filename,
        repo_type=repo_type,
        revision=revision,
        endpoint=endpoint,
        proxies=proxies,
        etag_timeout=etag_timeout,
        headers=headers,
        token=token,
        local_files_only=local_files_only,
    )

    if head_call_error is not None:
        # No HEAD call but local file exists => default to local file
        if not force_download and paths.file_path.is_file():
            logger.warning(
                f"Couldn't access the Hub to check for update but local file already exists. Defaulting to existing file. (error: {head_call_error})"
            )
            return str(paths.file_path)
        # Otherwise => raise
        _raise_on_head_call_error(head_call_error, force_download, local_files_only)

    # From now on, etag, commit_hash, url and size are not None.
    assert etag is not None, "etag must have been retrieved from server"
    assert commit_hash is not None, "commit_hash must have been retrieved from server"
    assert url_to_download is not None, "file location must have been retrieved from server"
    assert expected_size is not None, "expected_size must have been retrieved from server"

    # Local file exists => check if it's up-to-date
    if not force_download and paths.file_path.is_file():
        # etag matches => update metadata and return file
        if local_metadata is not None and local_metadata.etag == etag:
            write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
            return str(paths.file_path)

        # metadata is outdated + etag is a sha256
        # => means it's an LFS file (large)
        # => let's compute local hash and compare
        # => if match, update metadata and return file
        if local_metadata is None and REGEX_SHA256.match(etag) is not None:
            with open(paths.file_path, "rb") as f:
                file_hash = sha_fileobj(f).hex()
            if file_hash == etag:
                write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
                return str(paths.file_path)

    # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)

    # If we are lucky enough, the file is already in the cache => copy it
    if not force_download:
        cached_path = try_to_load_from_cache(
            repo_id=repo_id,
            filename=filename,
            cache_dir=cache_dir,
            revision=commit_hash,
            repo_type=repo_type,
        )
        if isinstance(cached_path, str):
            with WeakFileLock(paths.lock_path):
                paths.file_path.parent.mkdir(parents=True, exist_ok=True)
                shutil.copyfile(cached_path, paths.file_path)
            write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
            return str(paths.file_path)

    # Otherwise, let's download the file!
    with WeakFileLock(paths.lock_path):
        paths.file_path.unlink(missing_ok=True)  # delete outdated file first
        _download_to_tmp_and_move(
            incomplete_path=paths.incomplete_path(etag),
            destination_path=paths.file_path,
            url_to_download=url_to_download,
            proxies=proxies,
            headers=headers,
            expected_size=expected_size,
            filename=filename,
            force_download=force_download,
            etag=etag,
            xet_file_data=xet_file_data,
        )

    write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
    return str(paths.file_path)


@validate_hf_hub_args
def try_to_load_from_cache(
    repo_id: str,
    filename: str,
    cache_dir: Union[str, Path, None] = None,
    revision: Optional[str] = None,
    repo_type: Optional[str] = None,
) -> Union[str, _CACHED_NO_EXIST_T, None]:
    """
    Explores the cache to return the latest cached file for a given revision if found.

    This function will not raise any exception if the file is not cached.

    Args:
        cache_dir (`str` or `os.PathLike`):
            The folder where the cached files lie.
        repo_id (`str`):
            The ID of the repo on huggingface.co.
        filename (`str`):
            The filename to look for inside `repo_id`.
        revision (`str`, *optional*):
            The specific model version to use. Will default to `"main"` if it's not provided and no `commit_hash` is
            provided either.
        repo_type (`str`, *optional*):
            The type of the repository. Will default to `"model"`.

    Returns:
        `Optional[str]` or `_CACHED_NO_EXIST`:
            Will return `None` if the file was not cached. Otherwise:
            - The exact path to the cached file if it's found in the cache
            - A special value `_CACHED_NO_EXIST` if the file does not exist at the given commit hash and this fact was
              cached.

    Example:

    ```python
    from huggingface_hub import try_to_load_from_cache, _CACHED_NO_EXIST

    filepath = try_to_load_from_cache()
    if isinstance(filepath, str):
        # file exists and is cached
        ...
    elif filepath is _CACHED_NO_EXIST:
        # non-existence of file is cached
        ...
    else:
        # file is not cached
        ...
    ```
    """
    if revision is None:
        revision = "main"
    if repo_type is None:
        repo_type = "model"
    if repo_type not in constants.REPO_TYPES:
        raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(constants.REPO_TYPES)}")
    if cache_dir is None:
        cache_dir = constants.HF_HUB_CACHE

    object_id = repo_id.replace("/", "--")
    repo_cache = os.path.join(cache_dir, f"{repo_type}s--{object_id}")
    if not os.path.isdir(repo_cache):
        # No cache for this model
        return None

    refs_dir = os.path.join(repo_cache, "refs")
    snapshots_dir = os.path.join(repo_cache, "snapshots")
    no_exist_dir = os.path.join(repo_cache, ".no_exist")

    # Resolve refs (for instance to convert main to the associated commit sha)
    if os.path.isdir(refs_dir):
        revision_file = os.path.join(refs_dir, revision)
        if os.path.isfile(revision_file):
            with open(revision_file) as f:
                revision = f.read()

    # Check if file is cached as "no_exist"
    if os.path.isfile(os.path.join(no_exist_dir, revision, filename)):
        return _CACHED_NO_EXIST

    # Check if revision folder exists
    if not os.path.exists(snapshots_dir):
        return None
    cached_shas = os.listdir(snapshots_dir)
    if revision not in cached_shas:
        # No cache for this revision and we won't try to return a random revision
        return None

    # Check if file exists in cache
    cached_file = os.path.join(snapshots_dir, revision, filename)
    return cached_file if os.path.isfile(cached_file) else None


@validate_hf_hub_args
def get_hf_file_metadata(
    url: str,
    token: Union[bool, str, None] = None,
    proxies: Optional[Dict] = None,
    timeout: Optional[float] = constants.DEFAULT_REQUEST_TIMEOUT,
    library_name: Optional[str] = None,
    library_version: Optional[str] = None,
    user_agent: Union[Dict, str, None] = None,
    headers: Optional[Dict[str, str]] = None,
    endpoint: Optional[str] = None,
) -> HfFileMetadata:
    """Fetch metadata of a file versioned on the Hub for a given url.

    Args:
        url (`str`):
            File url, for example returned by [`hf_hub_url`].
        token (`str` or `bool`, *optional*):
            A token to be used for the download.
            - If `True`, the token is read from the HuggingFace config
              folder.
            - If `False` or `None`, no token is provided.
            - If a string, it's used as the authentication token.
        proxies (`dict`, *optional*):
            Dictionary mapping protocol to the URL of the proxy passed to
            `requests.request`.
        timeout (`float`, *optional*, defaults to 10):
            How many seconds to wait for the server to send metadata before giving up.
        library_name (`str`, *optional*):
            The name of the library to which the object corresponds.
        library_version (`str`, *optional*):
            The version of the library.
        user_agent (`dict`, `str`, *optional*):
            The user-agent info in the form of a dictionary or a string.
        headers (`dict`, *optional*):
            Additional headers to be sent with the request.
        endpoint (`str`, *optional*):
            Endpoint of the Hub. Defaults to <https://huggingface.co>.

    Returns:
        A [`HfFileMetadata`] object containing metadata such as location, etag, size and
        commit_hash.
    """
    hf_headers = build_hf_headers(
        token=token,
        library_name=library_name,
        library_version=library_version,
        user_agent=user_agent,
        headers=headers,
    )
    hf_headers["Accept-Encoding"] = "identity"  # prevent any compression => we want to know the real size of the file

    # Retrieve metadata
    r = _request_wrapper(
        method="HEAD",
        url=url,
        headers=hf_headers,
        allow_redirects=False,
        follow_relative_redirects=True,
        proxies=proxies,
        timeout=timeout,
    )
    hf_raise_for_status(r)

    # Return
    return HfFileMetadata(
        commit_hash=r.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT),
        # We favor a custom header indicating the etag of the linked resource, and
        # we fallback to the regular etag header.
        etag=_normalize_etag(r.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_ETAG) or r.headers.get("ETag")),
        # Either from response headers (if redirected) or defaults to request url.
        # Do not use `url` directly, as `_request_wrapper` might have followed relative
        # redirects.
        location=r.headers.get("Location") or r.request.url,  # type: ignore
        size=_int_or_none(
            r.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_SIZE) or r.headers.get("Content-Length")
        ),
        xet_file_data=parse_xet_file_data_from_response(r, endpoint=endpoint),  # type: ignore
    )
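
# Illustrative usage (the url comes from `hf_hub_url`, as the docstring suggests; the repo is
# the one used in the cache-layout example above):
#
#     from huggingface_hub import get_hf_file_metadata, hf_hub_url
#
#     metadata = get_hf_file_metadata(hf_hub_url(repo_id="julien-c/EsperBERTo-small", filename="README.md"))
#     print(metadata.commit_hash, metadata.etag, metadata.size)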


def _get_metadata_or_catch_error(
    *,
    repo_id: str,
    filename: str,
    repo_type: str,
    revision: str,
    endpoint: Optional[str],
    proxies: Optional[Dict],
    etag_timeout: Optional[float],
    headers: Dict[str, str],  # mutated inplace!
    token: Union[bool, str, None],
    local_files_only: bool,
    relative_filename: Optional[str] = None,  # only used to store `.no_exist` in cache
    storage_folder: Optional[str] = None,  # only used to store `.no_exist` in cache
) -> Union[
    # Either an exception is caught and returned
    Tuple[None, None, None, None, None, Exception],
    # Or the metadata is returned as
    # `(url_to_download, etag, commit_hash, expected_size, xet_file_data, None)`
    Tuple[str, str, str, int, Optional[XetFileData], None],
]:
    """Get metadata for a file on the Hub, safely handling network issues.

    Returns either the etag, commit_hash and expected size of the file, or the error
    raised while fetching the metadata.

    NOTE: This function mutates `headers` inplace! It removes the `authorization` header
    if the file is a LFS blob and the domain of the url is different from the
    domain of the location (typically an S3 bucket).
    """
    if local_files_only:
        return (
            None,
            None,
            None,
            None,
            None,
            OfflineModeIsEnabled(
                f"Cannot access file since 'local_files_only=True' has been set. (repo_id: {repo_id}, repo_type: {repo_type}, revision: {revision}, filename: {filename})"
            ),
        )

    url = hf_hub_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint)
    url_to_download: str = url
    etag: Optional[str] = None
    commit_hash: Optional[str] = None
    expected_size: Optional[int] = None
    head_error_call: Optional[Exception] = None
    xet_file_data: Optional[XetFileData] = None

    # Try to get metadata from the server.
    # Do not raise yet if the file is not found or not accessible.
    if not local_files_only:
        try:
            try:
                metadata = get_hf_file_metadata(
                    url=url, proxies=proxies, timeout=etag_timeout, headers=headers, token=token, endpoint=endpoint
                )
            except EntryNotFoundError as http_error:
                if storage_folder is not None and relative_filename is not None:
                    # Cache the non-existence of the file
                    commit_hash = http_error.response.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT)
                    if commit_hash is not None:
                        no_exist_file_path = Path(storage_folder) / ".no_exist" / commit_hash / relative_filename
                        try:
                            no_exist_file_path.parent.mkdir(parents=True, exist_ok=True)
                            no_exist_file_path.touch()
                        except OSError as e:
                            logger.error(
                                f"Could not cache non-existence of file. Will ignore error and continue. Error: {e}"
                            )
                        _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
                raise

            # Commit hash must exist
            commit_hash = metadata.commit_hash
            if commit_hash is None:
                raise FileMetadataError(
                    "Distant resource does not seem to be on huggingface.co. It is possible that a configuration issue"
                    " prevents you from downloading resources from https://huggingface.co. Please check your firewall"
                    " and proxy settings and make sure your SSL certificates are updated."
                )

            # Etag must exist
            # If we don't have any of those, raise an error.
            etag = metadata.etag
            if etag is None:
                raise FileMetadataError(
                    "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
                )

            # Size must exist
            expected_size = metadata.size
            if expected_size is None:
                raise FileMetadataError("Distant resource does not have a Content-Length.")

            xet_file_data = metadata.xet_file_data

            # In case of a redirect, save an extra redirect on the request.get call,
            # and ensure we download the exact atomic version even if it changed
            # between the HEAD and the GET (unlikely, but hey).
            #
            # If url domain is different => we are downloading from a CDN => url is signed => don't send auth
            # If url domain is the same => redirect due to repo rename AND downloading a regular file => keep auth
            if xet_file_data is None and url != metadata.location:
                url_to_download = metadata.location
                if urlparse(url).netloc != urlparse(metadata.location).netloc:
                    # Remove authorization header when downloading a LFS blob
                    headers.pop("authorization", None)
        except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
            # Actually raise for those subclasses of ConnectionError
            raise
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
            OfflineModeIsEnabled,
        ) as error:
            # Otherwise, our Internet connection is down.
            # etag is None
            head_error_call = error
        except (RevisionNotFoundError, EntryNotFoundError):
            # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted)
            raise
        except requests.HTTPError as error:
            # Multiple reasons for an http error:
            # - Repository is private and invalid/missing token sent
            # - Repository is gated and invalid/missing token sent
            # - Hub is down (error 500 or 504)
            # => let's switch to 'local_files_only=True' to check if the files are already cached.
            #    (if it's not the case, the error will be re-raised)
            head_error_call = error
        except FileMetadataError as error:
            # Multiple reasons for a FileMetadataError:
            # - Wrong network configuration (proxy, firewall, SSL certificates)
            # - Inconsistency on the Hub
            # => let's switch to 'local_files_only=True' to check if the files are already cached.
            #    (if it's not the case, the error will be re-raised)
            head_error_call = error

    if not (local_files_only or etag is not None or head_error_call is not None):
        raise RuntimeError("etag is empty due to uncovered problems")

    return (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_error_call)  # type: ignore [return-value]
|
| 1631 |
+
|
| 1632 |
+
|
| 1633 |
+
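
# --- Editor's sketch (not part of the library) ---
# A minimal, self-contained illustration of the redirect rule above: the
# "authorization" header is kept when a redirect stays on the same host (e.g.
# after a repo rename) and dropped when it points to a different host (a signed
# CDN URL, where forwarding the token would leak it). `_should_keep_auth` and
# the hostnames are hypothetical, introduced only for this sketch.
from urllib.parse import urlparse


def _should_keep_auth(original_url: str, redirect_url: str) -> bool:
    # Same netloc => same host => safe to keep credentials.
    return urlparse(original_url).netloc == urlparse(redirect_url).netloc


assert _should_keep_auth("https://huggingface.co/a/b", "https://huggingface.co/a2/b")
assert not _should_keep_auth("https://huggingface.co/a/b", "https://cdn-lfs.example.com/xyz")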


def _raise_on_head_call_error(head_call_error: Exception, force_download: bool, local_files_only: bool) -> NoReturn:
    """Raise an appropriate error when the HEAD call failed and we cannot locate a local file."""
    # No head call => we cannot force download.
    if force_download:
        if local_files_only:
            raise ValueError("Cannot pass 'force_download=True' and 'local_files_only=True' at the same time.")
        elif isinstance(head_call_error, OfflineModeIsEnabled):
            raise ValueError("Cannot pass 'force_download=True' when offline mode is enabled.") from head_call_error
        else:
            raise ValueError("Force download failed due to the above error.") from head_call_error

    # No head call + couldn't find an appropriate file on disk => raise an error.
    if local_files_only:
        raise LocalEntryNotFoundError(
            "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable"
            " hf.co look-ups and downloads online, set 'local_files_only' to False."
        )
    elif isinstance(head_call_error, (RepositoryNotFoundError, GatedRepoError)) or (
        isinstance(head_call_error, HfHubHTTPError) and head_call_error.response.status_code == 401
    ):
        # Repo not found or gated => let's raise the actual error
        # Unauthorized => likely a token issue => let's raise the actual error
        raise head_call_error
    else:
        # Otherwise: most likely a connection issue or Hub downtime => let's warn the user
        raise LocalEntryNotFoundError(
            "An error happened while trying to locate the file on the Hub and we cannot find the requested files"
            " in the local cache. Please check your connection and try again or make sure your Internet connection"
            " is on."
        ) from head_call_error
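
# Editor's note: a hedged restatement of the decision matrix implemented above.
# `_raise_on_head_call_error` never returns (hence `NoReturn`); which exception
# it raises depends only on the two flags and the type of the HEAD error:
#
#   force_download and local_files_only     -> ValueError (contradictory flags)
#   force_download, offline mode enabled    -> ValueError (chained to the cause)
#   force_download, any other HEAD error    -> ValueError (chained to the cause)
#   local_files_only                        -> LocalEntryNotFoundError
#   repo missing / gated / HTTP 401         -> the original HEAD error, re-raised
#   anything else (connectivity, downtime)  -> LocalEntryNotFoundError (chained)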


def _download_to_tmp_and_move(
    incomplete_path: Path,
    destination_path: Path,
    url_to_download: str,
    proxies: Optional[Dict],
    headers: Dict[str, str],
    expected_size: Optional[int],
    filename: str,
    force_download: bool,
    etag: Optional[str],
    xet_file_data: Optional[XetFileData],
) -> None:
    """Download content from a URL to a destination path.

    Internal logic:
    - return early if file is already downloaded
    - resume download if possible (from incomplete file)
    - do not resume download if `force_download=True` or `HF_HUB_ENABLE_HF_TRANSFER=True`
    - check disk space before downloading
    - download content to a temporary file
    - set correct permissions on temporary file
    - move the temporary file to the destination path

    Both `incomplete_path` and `destination_path` must be on the same volume to avoid a local copy.
    """
    if destination_path.exists() and not force_download:
        # Do nothing if already exists (except if force_download=True)
        return

    if incomplete_path.exists() and (force_download or (constants.HF_HUB_ENABLE_HF_TRANSFER and not proxies)):
        # By default, we will try to resume the download if possible.
        # However, if the user has set `force_download=True` or if `hf_transfer` is enabled, then we should
        # not resume the download => delete the incomplete file.
        message = f"Removing incomplete file '{incomplete_path}'"
        if force_download:
            message += " (force_download=True)"
        elif constants.HF_HUB_ENABLE_HF_TRANSFER and not proxies:
            message += " (hf_transfer=True)"
        logger.info(message)
        incomplete_path.unlink(missing_ok=True)

    with incomplete_path.open("ab") as f:
        resume_size = f.tell()
        message = f"Downloading '{filename}' to '{incomplete_path}'"
        if resume_size > 0 and expected_size is not None:
            message += f" (resume from {resume_size}/{expected_size})"
        logger.info(message)

        if expected_size is not None:  # might be None if HTTP header not set correctly
            # Check disk space in both tmp and destination path
            _check_disk_space(expected_size, incomplete_path.parent)
            _check_disk_space(expected_size, destination_path.parent)

        if xet_file_data is not None and is_xet_available():
            logger.debug("Xet Storage is enabled for this repo. Downloading file from Xet Storage..")
            xet_get(
                incomplete_path=incomplete_path,
                xet_file_data=xet_file_data,
                headers=headers,
                expected_size=expected_size,
                displayed_filename=filename,
            )
        else:
            if xet_file_data is not None and not constants.HF_HUB_DISABLE_XET:
                logger.warning(
                    "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. "
                    "Falling back to regular HTTP download. "
                    "For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`"
                )

            http_get(
                url_to_download,
                f,
                proxies=proxies,
                resume_size=resume_size,
                headers=headers,
                expected_size=expected_size,
            )

    logger.info(f"Download complete. Moving file to {destination_path}")
    _chmod_and_move(incomplete_path, destination_path)
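
# Editor's note: a hedged caller-side sketch of the atomic-download pattern
# implemented above. The download appends to a separate incomplete file and the
# blob is only moved into place on success, so readers never observe a partial
# file. `cache_blob_path` and the ".incomplete" naming are hypothetical here.
#
#     cache_blob_path = Path(storage_folder) / "blobs" / etag
#     _download_to_tmp_and_move(
#         incomplete_path=cache_blob_path.with_name(cache_blob_path.name + ".incomplete"),
#         destination_path=cache_blob_path,
#         url_to_download=url_to_download,
#         proxies=proxies,
#         headers=headers,
#         expected_size=expected_size,
#         filename=filename,
#         force_download=force_download,
#         etag=etag,
#         xet_file_data=xet_file_data,
#     )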


def _int_or_none(value: Optional[str]) -> Optional[int]:
    try:
        return int(value)  # type: ignore
    except (TypeError, ValueError):
        return None
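
# Editor's note: `_int_or_none` in action. Its typical job is parsing optional
# HTTP headers such as Content-Length, which may be absent (None) or malformed;
# these asserts hold for the implementation above.
assert _int_or_none("1024") == 1024
assert _int_or_none(None) is None
assert _int_or_none("not-a-number") is None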


def _chmod_and_move(src: Path, dst: Path) -> None:
    """Set correct permission before moving a blob from tmp directory to cache dir.

    Do not take into account the `umask` from the process as there is no convenient way
    to get it that is thread-safe.

    See:
    - About umask: https://docs.python.org/3/library/os.html#os.umask
    - Thread-safety: https://stackoverflow.com/a/70343066
    - About solution: https://github.com/huggingface/huggingface_hub/pull/1220#issuecomment-1326211591
    - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1141
    - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1215
    """
    # Get umask by creating a temporary file in the cached repo folder.
    tmp_file = dst.parent.parent / f"tmp_{uuid.uuid4()}"
    try:
        tmp_file.touch()
        cache_dir_mode = Path(tmp_file).stat().st_mode
        os.chmod(str(src), stat.S_IMODE(cache_dir_mode))
    except OSError as e:
        logger.warning(
            f"Could not set the permissions on the file '{src}'. Error: {e}.\nContinuing without setting permissions."
        )
    finally:
        try:
            tmp_file.unlink()
        except OSError:
            # fails if `tmp_file.touch()` failed => do nothing
            # See https://github.com/huggingface/huggingface_hub/issues/2359
            pass

    shutil.move(str(src), str(dst), copy_function=_copy_no_matter_what)
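
# Editor's note: a self-contained sketch of the umask-capture trick used above,
# with a hypothetical `probe_dir` argument. Creating a throwaway file and
# reading back its mode reveals the effective permissions (umask applied)
# without calling `os.umask()`, which cannot be queried in a thread-safe way.
import stat
import uuid
from pathlib import Path


def _effective_file_mode(probe_dir: Path) -> int:
    probe = probe_dir / f"tmp_{uuid.uuid4()}"
    try:
        probe.touch()
        return stat.S_IMODE(probe.stat().st_mode)
    finally:
        probe.unlink(missing_ok=True)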


def _copy_no_matter_what(src: str, dst: str) -> None:
    """Copy file from src to dst.

    If `shutil.copy2` fails, fallback to `shutil.copyfile`.
    """
    try:
        # Copy file with metadata and permission
        # Can fail e.g. if dst is an S3 mount
        shutil.copy2(src, dst)
    except OSError:
        # Copy only file content
        shutil.copyfile(src, dst)
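
# Editor's note: a small, runnable sketch of the fallback-copy pattern above.
# `shutil.copy2` also copies metadata (mtimes, permissions) and may raise
# OSError on filesystems that reject chmod/utime (e.g. some FUSE or
# object-store mounts); `shutil.copyfile` transfers the content only.
import shutil
import tempfile


def _copy_with_fallback_demo() -> None:
    with tempfile.TemporaryDirectory() as tmp:
        src, dst = f"{tmp}/src.bin", f"{tmp}/dst.bin"
        with open(src, "wb") as fh:
            fh.write(b"payload")
        try:
            shutil.copy2(src, dst)  # metadata-preserving copy
        except OSError:
            shutil.copyfile(src, dst)  # content-only fallback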


def _get_pointer_path(storage_folder: str, revision: str, relative_filename: str) -> str:
    # Using `os.path.abspath` instead of `Path.resolve()` to avoid resolving symlinks
    snapshot_path = os.path.join(storage_folder, "snapshots")
    pointer_path = os.path.join(snapshot_path, revision, relative_filename)
    if Path(os.path.abspath(snapshot_path)) not in Path(os.path.abspath(pointer_path)).parents:
        raise ValueError(
            "Invalid pointer path: cannot create pointer path in snapshot folder if"
            f" `storage_folder='{storage_folder}'`, `revision='{revision}'` and"
            f" `relative_filename='{relative_filename}'`."
        )
    return pointer_path
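
# Editor's note: a hedged illustration of the traversal guard above, using
# hypothetical paths. A well-formed request resolves inside the snapshots
# folder, while a crafted `relative_filename` that escapes it is rejected
# because the snapshots folder is no longer among the resolved path's parents:
#
#     _get_pointer_path("/cache/model", "abc123", "config.json")
#     # -> "/cache/model/snapshots/abc123/config.json"
#
#     _get_pointer_path("/cache/model", "abc123", "../../../etc/passwd")
#     # -> raises ValueError ("Invalid pointer path: ...")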