diff --git a/env/lib/python3.13/site-packages/anyio-4.12.0.dist-info/INSTALLER b/env/lib/python3.13/site-packages/anyio-4.12.0.dist-info/INSTALLER new file mode 100644 index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68 --- /dev/null +++ b/env/lib/python3.13/site-packages/anyio-4.12.0.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git a/env/lib/python3.13/site-packages/anyio-4.12.0.dist-info/RECORD b/env/lib/python3.13/site-packages/anyio-4.12.0.dist-info/RECORD new file mode 100644 index 0000000000000000000000000000000000000000..1d1ee3ba29fad4ce6631c1c0b61023dd6be8e2b8 --- /dev/null +++ b/env/lib/python3.13/site-packages/anyio-4.12.0.dist-info/RECORD @@ -0,0 +1,92 @@ +anyio-4.12.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 +anyio-4.12.0.dist-info/METADATA,sha256=rte2_C2hYKP9_iVMFYogSzBxdHBzwY45S1TrLiBsxdk,4277 +anyio-4.12.0.dist-info/RECORD,, +anyio-4.12.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91 +anyio-4.12.0.dist-info/entry_points.txt,sha256=_d6Yu6uiaZmNe0CydowirE9Cmg7zUL2g08tQpoS3Qvc,39 +anyio-4.12.0.dist-info/licenses/LICENSE,sha256=U2GsncWPLvX9LpsJxoKXwX8ElQkJu8gCO9uC6s8iwrA,1081 +anyio-4.12.0.dist-info/top_level.txt,sha256=QglSMiWX8_5dpoVAEIHdEYzvqFMdSYWmCj6tYw2ITkQ,6 +anyio/__init__.py,sha256=7iDVqMUprUuKNY91FuoKqayAhR-OY136YDPI6P78HHk,6170 +anyio/__pycache__/__init__.cpython-313.pyc,, +anyio/__pycache__/from_thread.cpython-313.pyc,, +anyio/__pycache__/functools.cpython-313.pyc,, +anyio/__pycache__/lowlevel.cpython-313.pyc,, +anyio/__pycache__/pytest_plugin.cpython-313.pyc,, +anyio/__pycache__/to_interpreter.cpython-313.pyc,, +anyio/__pycache__/to_process.cpython-313.pyc,, +anyio/__pycache__/to_thread.cpython-313.pyc,, +anyio/_backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +anyio/_backends/__pycache__/__init__.cpython-313.pyc,, +anyio/_backends/__pycache__/_asyncio.cpython-313.pyc,, +anyio/_backends/__pycache__/_trio.cpython-313.pyc,, +anyio/_backends/_asyncio.py,sha256=w6gCSMs_2D1doKVtzi32bOloBl1df-IHubl8-Vks908,99656 +anyio/_backends/_trio.py,sha256=ScNVMQB0iiuJMAon1epQCVOVbIbf-Lxnfb5OxujzMok,42398 +anyio/_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +anyio/_core/__pycache__/__init__.cpython-313.pyc,, +anyio/_core/__pycache__/_asyncio_selector_thread.cpython-313.pyc,, +anyio/_core/__pycache__/_contextmanagers.cpython-313.pyc,, +anyio/_core/__pycache__/_eventloop.cpython-313.pyc,, +anyio/_core/__pycache__/_exceptions.cpython-313.pyc,, +anyio/_core/__pycache__/_fileio.cpython-313.pyc,, +anyio/_core/__pycache__/_resources.cpython-313.pyc,, +anyio/_core/__pycache__/_signals.cpython-313.pyc,, +anyio/_core/__pycache__/_sockets.cpython-313.pyc,, +anyio/_core/__pycache__/_streams.cpython-313.pyc,, +anyio/_core/__pycache__/_subprocesses.cpython-313.pyc,, +anyio/_core/__pycache__/_synchronization.cpython-313.pyc,, +anyio/_core/__pycache__/_tasks.cpython-313.pyc,, +anyio/_core/__pycache__/_tempfile.cpython-313.pyc,, +anyio/_core/__pycache__/_testing.cpython-313.pyc,, +anyio/_core/__pycache__/_typedattr.cpython-313.pyc,, +anyio/_core/_asyncio_selector_thread.py,sha256=2PdxFM3cs02Kp6BSppbvmRT7q7asreTW5FgBxEsflBo,5626 +anyio/_core/_contextmanagers.py,sha256=YInBCabiEeS-UaP_Jdxa1CaFC71ETPW8HZTHIM8Rsc8,7215 +anyio/_core/_eventloop.py,sha256=xsoYgHIddNYusTqAFDVmcvpjHKJFmdgtDcAjpN3JEWQ,6261 +anyio/_core/_exceptions.py,sha256=fR2SvRUBYVHvolNKbzWSLt8FC_5NFB2OAzGD738fD8Q,4257 +anyio/_core/_fileio.py,sha256=uc7t10Vb-If7GbdWM_zFf-ajUe6uek63fSt7IBLlZW0,25731 
+anyio/_core/_resources.py,sha256=NbmU5O5UX3xEyACnkmYX28Fmwdl-f-ny0tHym26e0w0,435 +anyio/_core/_signals.py,sha256=vulT1M1xdLYtAR-eY5TamIgaf1WTlOwOrMGwswlTTr8,905 +anyio/_core/_sockets.py,sha256=aTbgMr0qPmBPfrapxLykyajsmS7IAerhW9_Qk5r5E18,34311 +anyio/_core/_streams.py,sha256=FczFwIgDpnkK0bODWJXMpsUJYdvAD04kaUaGzJU8DK0,1806 +anyio/_core/_subprocesses.py,sha256=EXm5igL7dj55iYkPlbYVAqtbqxJxjU-6OndSTIx9SRg,8047 +anyio/_core/_synchronization.py,sha256=SY3nsr1ZZyDrjamsOVoYcvj-x6d_AR13Cu5lZecG0gY,20894 +anyio/_core/_tasks.py,sha256=km6hVE1fsuIenya3MDud8KP6-J_bNzlgYC10wUxI7iA,4880 +anyio/_core/_tempfile.py,sha256=lHb7CW4FyIlpkf5ADAf4VmLHCKwEHF9nxqNyBCFFUiA,19697 +anyio/_core/_testing.py,sha256=YUGwA5cgFFbUTv4WFd7cv_BSVr4ryTtPp8owQA3JdWE,2118 +anyio/_core/_typedattr.py,sha256=P4ozZikn3-DbpoYcvyghS_FOYAgbmUxeoU8-L_07pZM,2508 +anyio/abc/__init__.py,sha256=6mWhcl_pGXhrgZVHP_TCfMvIXIOp9mroEFM90fYCU_U,2869 +anyio/abc/__pycache__/__init__.cpython-313.pyc,, +anyio/abc/__pycache__/_eventloop.cpython-313.pyc,, +anyio/abc/__pycache__/_resources.cpython-313.pyc,, +anyio/abc/__pycache__/_sockets.cpython-313.pyc,, +anyio/abc/__pycache__/_streams.cpython-313.pyc,, +anyio/abc/__pycache__/_subprocesses.cpython-313.pyc,, +anyio/abc/__pycache__/_tasks.cpython-313.pyc,, +anyio/abc/__pycache__/_testing.cpython-313.pyc,, +anyio/abc/_eventloop.py,sha256=GTZbdItBHcj_b-8K2XylET2-bBYLZ3XjW4snY7vK7LE,10900 +anyio/abc/_resources.py,sha256=DrYvkNN1hH6Uvv5_5uKySvDsnknGVDe8FCKfko0VtN8,783 +anyio/abc/_sockets.py,sha256=ECTY0jLEF18gryANHR3vFzXzGdZ-xPwELq1QdgOb0Jo,13258 +anyio/abc/_streams.py,sha256=005GKSCXGprxnhucILboSqc2JFovECZk9m3p-qqxXVc,7640 +anyio/abc/_subprocesses.py,sha256=cumAPJTktOQtw63IqG0lDpyZqu_l1EElvQHMiwJgL08,2067 +anyio/abc/_tasks.py,sha256=KC7wrciE48AINOI-AhPutnFhe1ewfP7QnamFlDzqesQ,3721 +anyio/abc/_testing.py,sha256=tBJUzkSfOXJw23fe8qSJ03kJlShOYjjaEyFB6k6MYT8,1821 +anyio/from_thread.py,sha256=-YZOTpu9WVHtAsMxQGIOaHMjaDRNeKQilx6Nn2qDU-o,19017 +anyio/functools.py,sha256=tIWQ90cuLMxfJIpdBfFY3W3CC1zqFCRAyR3DxKc0Xlo,10061 +anyio/lowlevel.py,sha256=NnPYQ6tWDzLRwpalX2CvsbkXkTeasbJcL52gPopWdYg,5048 +anyio/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +anyio/pytest_plugin.py,sha256=3jAFQn0jv_pyoWE2GBBlHaj9sqXj4e8vob0_hgrsXE8,10244 +anyio/streams/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +anyio/streams/__pycache__/__init__.cpython-313.pyc,, +anyio/streams/__pycache__/buffered.cpython-313.pyc,, +anyio/streams/__pycache__/file.cpython-313.pyc,, +anyio/streams/__pycache__/memory.cpython-313.pyc,, +anyio/streams/__pycache__/stapled.cpython-313.pyc,, +anyio/streams/__pycache__/text.cpython-313.pyc,, +anyio/streams/__pycache__/tls.cpython-313.pyc,, +anyio/streams/buffered.py,sha256=2R3PeJhe4EXrdYqz44Y6-Eg9R6DrmlsYrP36Ir43-po,6263 +anyio/streams/file.py,sha256=4WZ7XGz5WNu39FQHvqbe__TQ0HDP9OOhgO1mk9iVpVU,4470 +anyio/streams/memory.py,sha256=F0zwzvFJKAhX_LRZGoKzzqDC2oMM-f-yyTBrEYEGOaU,10740 +anyio/streams/stapled.py,sha256=T8Xqwf8K6EgURPxbt1N4i7A8BAk-gScv-GRhjLXIf_o,4390 +anyio/streams/text.py,sha256=BcVAGJw1VRvtIqnv-o0Rb0pwH7p8vwlvl21xHq522ag,5765 +anyio/streams/tls.py,sha256=Jpxy0Mfbcp1BxHCwE-YjSSFaLnIBbnnwur-excYThs4,15368 +anyio/to_interpreter.py,sha256=_mLngrMy97TMR6VbW4Y6YzDUk9ZuPcQMPlkuyRh3C9k,7100 +anyio/to_process.py,sha256=cEyYUgb8LJVRJCfs6rK3aEM_T3k2gEmhl0nBjEvflOk,9687 +anyio/to_thread.py,sha256=tXQPvHohvQ2Vrw2pBtdzkRPNV7u3H2_UDbvwL2u_R7k,2465 diff --git a/env/lib/python3.13/site-packages/anyio-4.12.0.dist-info/WHEEL b/env/lib/python3.13/site-packages/anyio-4.12.0.dist-info/WHEEL 
new file mode 100644 index 0000000000000000000000000000000000000000..e7fa31b6f3f78deb1022c1f7927f07d4d16da822 --- /dev/null +++ b/env/lib/python3.13/site-packages/anyio-4.12.0.dist-info/WHEEL @@ -0,0 +1,5 @@ +Wheel-Version: 1.0 +Generator: setuptools (80.9.0) +Root-Is-Purelib: true +Tag: py3-none-any + diff --git a/env/lib/python3.13/site-packages/anyio-4.12.0.dist-info/entry_points.txt b/env/lib/python3.13/site-packages/anyio-4.12.0.dist-info/entry_points.txt new file mode 100644 index 0000000000000000000000000000000000000000..44dd9bdc3039122cc98014c1439ca254313fd014 --- /dev/null +++ b/env/lib/python3.13/site-packages/anyio-4.12.0.dist-info/entry_points.txt @@ -0,0 +1,2 @@ +[pytest11] +anyio = anyio.pytest_plugin diff --git a/env/lib/python3.13/site-packages/fsspec/__init__.py b/env/lib/python3.13/site-packages/fsspec/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..452c78a055e72a6d04f1013d1a98fda33fdc449e --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/__init__.py @@ -0,0 +1,71 @@ +from . import caching +from ._version import __version__ # noqa: F401 +from .callbacks import Callback +from .compression import available_compressions +from .core import get_fs_token_paths, open, open_files, open_local, url_to_fs +from .exceptions import FSTimeoutError +from .mapping import FSMap, get_mapper +from .registry import ( + available_protocols, + filesystem, + get_filesystem_class, + register_implementation, + registry, +) +from .spec import AbstractFileSystem + +__all__ = [ + "AbstractFileSystem", + "FSTimeoutError", + "FSMap", + "filesystem", + "register_implementation", + "get_filesystem_class", + "get_fs_token_paths", + "get_mapper", + "open", + "open_files", + "open_local", + "registry", + "caching", + "Callback", + "available_protocols", + "available_compressions", + "url_to_fs", +] + + +def process_entries(): + try: + from importlib.metadata import entry_points + except ImportError: + return + if entry_points is not None: + try: + eps = entry_points() + except TypeError: + pass # importlib-metadata < 0.8 + else: + if hasattr(eps, "select"): # Python 3.10+ / importlib_metadata >= 3.9.0 + specs = eps.select(group="fsspec.specs") + else: + specs = eps.get("fsspec.specs", []) + registered_names = {} + for spec in specs: + err_msg = f"Unable to load filesystem from {spec}" + name = spec.name + if name in registered_names: + continue + registered_names[name] = True + register_implementation( + name, + spec.value.replace(":", "."), + errtxt=err_msg, + # We take our implementations as the ones to overload with if + # for some reason we encounter some, may be the same, already + # registered + clobber=True, + ) + + +process_entries() diff --git a/env/lib/python3.13/site-packages/fsspec/_version.py b/env/lib/python3.13/site-packages/fsspec/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..35dc196eb23aab6097dd424fad48d94f98979547 --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/_version.py @@ -0,0 +1,34 @@ +# file generated by setuptools-scm +# don't change, don't track in version control + +__all__ = [ + "__version__", + "__version_tuple__", + "version", + "version_tuple", + "__commit_id__", + "commit_id", +] + +TYPE_CHECKING = False +if TYPE_CHECKING: + from typing import Tuple + from typing import Union + + VERSION_TUPLE = Tuple[Union[int, str], ...] 
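# Illustrative aside (not part of the vendored files): process_entries() above does for
# entry points what a package can also do directly. The protocol name "myproto" and the
# dotted path "mypkg.myfs.MyFileSystem" are hypothetical placeholders.
import fsspec

fsspec.register_implementation(
    "myproto",                     # protocol, usable later as "myproto://..."
    "mypkg.myfs.MyFileSystem",     # dotted path; only imported on first use
    errtxt="Unable to load filesystem from mypkg",  # message raised if the import fails
    clobber=True,                  # replace any existing registration, as above
)
# fsspec.filesystem("myproto") would then instantiate the registered class.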
+ COMMIT_ID = Union[str, None] +else: + VERSION_TUPLE = object + COMMIT_ID = object + +version: str +__version__: str +__version_tuple__: VERSION_TUPLE +version_tuple: VERSION_TUPLE +commit_id: COMMIT_ID +__commit_id__: COMMIT_ID + +__version__ = version = '2025.12.0' +__version_tuple__ = version_tuple = (2025, 12, 0) + +__commit_id__ = commit_id = None diff --git a/env/lib/python3.13/site-packages/fsspec/archive.py b/env/lib/python3.13/site-packages/fsspec/archive.py new file mode 100644 index 0000000000000000000000000000000000000000..13a4da8df7c9405297cdd7d37476be2f725b2f57 --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/archive.py @@ -0,0 +1,75 @@ +import operator + +from fsspec import AbstractFileSystem +from fsspec.utils import tokenize + + +class AbstractArchiveFileSystem(AbstractFileSystem): + """ + A generic superclass for implementing Archive-based filesystems. + + Currently, it is shared amongst + :class:`~fsspec.implementations.zip.ZipFileSystem`, + :class:`~fsspec.implementations.libarchive.LibArchiveFileSystem` and + :class:`~fsspec.implementations.tar.TarFileSystem`. + """ + + def __str__(self): + return f"" + + __repr__ = __str__ + + def ukey(self, path): + return tokenize(path, self.fo, self.protocol) + + def _all_dirnames(self, paths): + """Returns *all* directory names for each path in paths, including intermediate + ones. + + Parameters + ---------- + paths: Iterable of path strings + """ + if len(paths) == 0: + return set() + + dirnames = {self._parent(path) for path in paths} - {self.root_marker} + return dirnames | self._all_dirnames(dirnames) + + def info(self, path, **kwargs): + self._get_dirs() + path = self._strip_protocol(path) + if path in {"", "/"} and self.dir_cache: + return {"name": "", "type": "directory", "size": 0} + if path in self.dir_cache: + return self.dir_cache[path] + elif path + "/" in self.dir_cache: + return self.dir_cache[path + "/"] + else: + raise FileNotFoundError(path) + + def ls(self, path, detail=True, **kwargs): + self._get_dirs() + paths = {} + for p, f in self.dir_cache.items(): + p = p.rstrip("/") + if "/" in p: + root = p.rsplit("/", 1)[0] + else: + root = "" + if root == path.rstrip("/"): + paths[p] = f + elif all( + (a == b) + for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) + ): + # root directory entry + ppath = p.rstrip("/").split("/", 1)[0] + if ppath not in paths: + out = {"name": ppath, "size": 0, "type": "directory"} + paths[ppath] = out + if detail: + out = sorted(paths.values(), key=operator.itemgetter("name")) + return out + else: + return sorted(paths) diff --git a/env/lib/python3.13/site-packages/fsspec/asyn.py b/env/lib/python3.13/site-packages/fsspec/asyn.py new file mode 100644 index 0000000000000000000000000000000000000000..360758ac64608331d0976b8ee17c2c02d1f3e6d7 --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/asyn.py @@ -0,0 +1,1103 @@ +import asyncio +import asyncio.events +import functools +import inspect +import io +import numbers +import os +import re +import threading +from collections.abc import Iterable +from glob import has_magic +from typing import TYPE_CHECKING + +from .callbacks import DEFAULT_CALLBACK +from .exceptions import FSTimeoutError +from .implementations.local import LocalFileSystem, make_path_posix, trailing_sep +from .spec import AbstractBufferedFile, AbstractFileSystem +from .utils import glob_translate, is_exception, other_paths + +private = re.compile("_[^_]") +iothread = [None] # dedicated fsspec IO thread +loop = [None] # global event loop for any 
non-async instance +_lock = None # global lock placeholder +get_running_loop = asyncio.get_running_loop + + +def get_lock(): + """Allocate or return a threading lock. + + The lock is allocated on first use to allow setting one lock per forked process. + """ + global _lock + if not _lock: + _lock = threading.Lock() + return _lock + + +def reset_lock(): + """Reset the global lock. + + This should be called only on the init of a forked process to reset the lock to + None, enabling the new forked process to get a new lock. + """ + global _lock + + iothread[0] = None + loop[0] = None + _lock = None + + +async def _runner(event, coro, result, timeout=None): + timeout = timeout if timeout else None # convert 0 or 0.0 to None + if timeout is not None: + coro = asyncio.wait_for(coro, timeout=timeout) + try: + result[0] = await coro + except Exception as ex: + result[0] = ex + finally: + event.set() + + +def sync(loop, func, *args, timeout=None, **kwargs): + """ + Make loop run coroutine until it returns. Runs in other thread + + Examples + -------- + >>> fsspec.asyn.sync(fsspec.asyn.get_loop(), func, *args, + timeout=timeout, **kwargs) + """ + timeout = timeout if timeout else None # convert 0 or 0.0 to None + # NB: if the loop is not running *yet*, it is OK to submit work + # and we will wait for it + if loop is None or loop.is_closed(): + raise RuntimeError("Loop is not running") + try: + loop0 = asyncio.events.get_running_loop() + if loop0 is loop: + raise NotImplementedError("Calling sync() from within a running loop") + except NotImplementedError: + raise + except RuntimeError: + pass + coro = func(*args, **kwargs) + result = [None] + event = threading.Event() + asyncio.run_coroutine_threadsafe(_runner(event, coro, result, timeout), loop) + while True: + # this loops allows thread to get interrupted + if event.wait(1): + break + if timeout is not None: + timeout -= 1 + if timeout < 0: + raise FSTimeoutError + + return_result = result[0] + if isinstance(return_result, asyncio.TimeoutError): + # suppress asyncio.TimeoutError, raise FSTimeoutError + raise FSTimeoutError from return_result + elif isinstance(return_result, BaseException): + raise return_result + else: + return return_result + + +def sync_wrapper(func, obj=None): + """Given a function, make so can be called in blocking contexts + + Leave obj=None if defining within a class. Pass the instance if attaching + as an attribute of the instance. + """ + + @functools.wraps(func) + def wrapper(*args, **kwargs): + self = obj or args[0] + return sync(self.loop, func, *args, **kwargs) + + return wrapper + + +def get_loop(): + """Create or return the default fsspec IO loop + + The loop will be running on a separate thread. 
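# Illustrative aside: how blocking code hands a coroutine to the dedicated IO thread
# created by get_loop(). The coroutine _double is a made-up placeholder, not an fsspec API.
import asyncio
from fsspec.asyn import get_loop, sync

async def _double(x):
    await asyncio.sleep(0)          # stand-in for real async I/O
    return x * 2

result = sync(get_loop(), _double, 21, timeout=5)   # blocks the caller, not the loop
assert result == 42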
+ """ + if loop[0] is None: + with get_lock(): + # repeat the check just in case the loop got filled between the + # previous two calls from another thread + if loop[0] is None: + loop[0] = asyncio.new_event_loop() + th = threading.Thread(target=loop[0].run_forever, name="fsspecIO") + th.daemon = True + th.start() + iothread[0] = th + return loop[0] + + +def reset_after_fork(): + global lock + loop[0] = None + iothread[0] = None + lock = None + + +if hasattr(os, "register_at_fork"): + # should be posix; this will do nothing for spawn or forkserver subprocesses + os.register_at_fork(after_in_child=reset_after_fork) + + +if TYPE_CHECKING: + import resource + + ResourceError = resource.error +else: + try: + import resource + except ImportError: + resource = None + ResourceError = OSError + else: + ResourceError = getattr(resource, "error", OSError) + +_DEFAULT_BATCH_SIZE = 128 +_NOFILES_DEFAULT_BATCH_SIZE = 1280 + + +def _get_batch_size(nofiles=False): + from fsspec.config import conf + + if nofiles: + if "nofiles_gather_batch_size" in conf: + return conf["nofiles_gather_batch_size"] + else: + if "gather_batch_size" in conf: + return conf["gather_batch_size"] + if nofiles: + return _NOFILES_DEFAULT_BATCH_SIZE + if resource is None: + return _DEFAULT_BATCH_SIZE + + try: + soft_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE) + except (ImportError, ValueError, ResourceError): + return _DEFAULT_BATCH_SIZE + + if soft_limit == resource.RLIM_INFINITY: + return -1 + else: + return soft_limit // 8 + + +def running_async() -> bool: + """Being executed by an event loop?""" + try: + asyncio.get_running_loop() + return True + except RuntimeError: + return False + + +async def _run_coros_in_chunks( + coros, + batch_size=None, + callback=DEFAULT_CALLBACK, + timeout=None, + return_exceptions=False, + nofiles=False, +): + """Run the given coroutines in chunks. + + Parameters + ---------- + coros: list of coroutines to run + batch_size: int or None + Number of coroutines to submit/wait on simultaneously. + If -1, then it will not be any throttling. If + None, it will be inferred from _get_batch_size() + callback: fsspec.callbacks.Callback instance + Gets a relative_update when each coroutine completes + timeout: number or None + If given, each coroutine times out after this time. Note that, since + there are multiple batches, the total run time of this function will in + general be longer + return_exceptions: bool + Same meaning as in asyncio.gather + nofiles: bool + If inferring the batch_size, does this operation involve local files? + If yes, you normally expect smaller batches. 
+ """ + + if batch_size is None: + batch_size = _get_batch_size(nofiles=nofiles) + + if batch_size == -1: + batch_size = len(coros) + + assert batch_size > 0 + + async def _run_coro(coro, i): + try: + return await asyncio.wait_for(coro, timeout=timeout), i + except Exception as e: + if not return_exceptions: + raise + return e, i + finally: + callback.relative_update(1) + + i = 0 + n = len(coros) + results = [None] * n + pending = set() + + while pending or i < n: + while len(pending) < batch_size and i < n: + pending.add(asyncio.ensure_future(_run_coro(coros[i], i))) + i += 1 + + if not pending: + break + + done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED) + while done: + result, k = await done.pop() + results[k] = result + + return results + + +# these methods should be implemented as async by any async-able backend +async_methods = [ + "_ls", + "_cat_file", + "_get_file", + "_put_file", + "_rm_file", + "_cp_file", + "_pipe_file", + "_expand_path", + "_info", + "_isfile", + "_isdir", + "_exists", + "_walk", + "_glob", + "_find", + "_du", + "_size", + "_mkdir", + "_makedirs", +] + + +class AsyncFileSystem(AbstractFileSystem): + """Async file operations, default implementations + + Passes bulk operations to asyncio.gather for concurrent operation. + + Implementations that have concurrent batch operations and/or async methods + should inherit from this class instead of AbstractFileSystem. Docstrings are + copied from the un-underscored method in AbstractFileSystem, if not given. + """ + + # note that methods do not have docstring here; they will be copied + # for _* methods and inferred for overridden methods. + + async_impl = True + mirror_sync_methods = True + disable_throttling = False + + def __init__(self, *args, asynchronous=False, loop=None, batch_size=None, **kwargs): + self.asynchronous = asynchronous + self._pid = os.getpid() + if not asynchronous: + self._loop = loop or get_loop() + else: + self._loop = None + self.batch_size = batch_size + super().__init__(*args, **kwargs) + + @property + def loop(self): + if self._pid != os.getpid(): + raise RuntimeError("This class is not fork-safe") + return self._loop + + async def _rm_file(self, path, **kwargs): + if ( + inspect.iscoroutinefunction(self._rm) + and type(self)._rm is not AsyncFileSystem._rm + ): + return await self._rm(path, recursive=False, batch_size=1, **kwargs) + raise NotImplementedError + + async def _rm(self, path, recursive=False, batch_size=None, **kwargs): + # TODO: implement on_error + batch_size = batch_size or self.batch_size + path = await self._expand_path(path, recursive=recursive) + return await _run_coros_in_chunks( + [self._rm_file(p, **kwargs) for p in reversed(path)], + batch_size=batch_size, + nofiles=True, + ) + + async def _cp_file(self, path1, path2, **kwargs): + raise NotImplementedError + + async def _mv_file(self, path1, path2): + await self._cp_file(path1, path2) + await self._rm_file(path1) + + async def _copy( + self, + path1, + path2, + recursive=False, + on_error=None, + maxdepth=None, + batch_size=None, + **kwargs, + ): + if on_error is None and recursive: + on_error = "ignore" + elif on_error is None: + on_error = "raise" + + if isinstance(path1, list) and isinstance(path2, list): + # No need to expand paths when both source and destination + # are provided as lists + paths1 = path1 + paths2 = path2 + else: + source_is_str = isinstance(path1, str) + paths1 = await self._expand_path( + path1, maxdepth=maxdepth, recursive=recursive + ) + if source_is_str and (not 
recursive or maxdepth is not None): + # Non-recursive glob does not copy directories + paths1 = [ + p for p in paths1 if not (trailing_sep(p) or await self._isdir(p)) + ] + if not paths1: + return + + source_is_file = len(paths1) == 1 + dest_is_dir = isinstance(path2, str) and ( + trailing_sep(path2) or await self._isdir(path2) + ) + + exists = source_is_str and ( + (has_magic(path1) and source_is_file) + or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1)) + ) + paths2 = other_paths( + paths1, + path2, + exists=exists, + flatten=not source_is_str, + ) + + batch_size = batch_size or self.batch_size + coros = [self._cp_file(p1, p2, **kwargs) for p1, p2 in zip(paths1, paths2)] + result = await _run_coros_in_chunks( + coros, batch_size=batch_size, return_exceptions=True, nofiles=True + ) + + for ex in filter(is_exception, result): + if on_error == "ignore" and isinstance(ex, FileNotFoundError): + continue + raise ex + + async def _pipe_file(self, path, value, mode="overwrite", **kwargs): + raise NotImplementedError + + async def _pipe(self, path, value=None, batch_size=None, **kwargs): + if isinstance(path, str): + path = {path: value} + batch_size = batch_size or self.batch_size + return await _run_coros_in_chunks( + [self._pipe_file(k, v, **kwargs) for k, v in path.items()], + batch_size=batch_size, + nofiles=True, + ) + + async def _process_limits(self, url, start, end): + """Helper for "Range"-based _cat_file""" + size = None + suff = False + if start is not None and start < 0: + # if start is negative and end None, end is the "suffix length" + if end is None: + end = -start + start = "" + suff = True + else: + size = size or (await self._info(url))["size"] + start = size + start + elif start is None: + start = 0 + if not suff: + if end is not None and end < 0: + if start is not None: + size = size or (await self._info(url))["size"] + end = size + end + elif end is None: + end = "" + if isinstance(end, numbers.Integral): + end -= 1 # bytes range is inclusive + return f"bytes={start}-{end}" + + async def _cat_file(self, path, start=None, end=None, **kwargs): + raise NotImplementedError + + async def _cat( + self, path, recursive=False, on_error="raise", batch_size=None, **kwargs + ): + paths = await self._expand_path(path, recursive=recursive) + coros = [self._cat_file(path, **kwargs) for path in paths] + batch_size = batch_size or self.batch_size + out = await _run_coros_in_chunks( + coros, batch_size=batch_size, nofiles=True, return_exceptions=True + ) + if on_error == "raise": + ex = next(filter(is_exception, out), False) + if ex: + raise ex + if ( + len(paths) > 1 + or isinstance(path, list) + or paths[0] != self._strip_protocol(path) + ): + return { + k: v + for k, v in zip(paths, out) + if on_error != "omit" or not is_exception(v) + } + else: + return out[0] + + async def _cat_ranges( + self, + paths, + starts, + ends, + max_gap=None, + batch_size=None, + on_error="return", + **kwargs, + ): + """Get the contents of byte ranges from one or more files + + Parameters + ---------- + paths: list + A list of of filepaths on this filesystems + starts, ends: int or list + Bytes limits of the read. If using a single int, the same value will be + used to read all the specified files. 
+ """ + # TODO: on_error + if max_gap is not None: + # use utils.merge_offset_ranges + raise NotImplementedError + if not isinstance(paths, list): + raise TypeError + if not isinstance(starts, Iterable): + starts = [starts] * len(paths) + if not isinstance(ends, Iterable): + ends = [ends] * len(paths) + if len(starts) != len(paths) or len(ends) != len(paths): + raise ValueError + coros = [ + self._cat_file(p, start=s, end=e, **kwargs) + for p, s, e in zip(paths, starts, ends) + ] + batch_size = batch_size or self.batch_size + return await _run_coros_in_chunks( + coros, batch_size=batch_size, nofiles=True, return_exceptions=True + ) + + async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs): + raise NotImplementedError + + async def _put( + self, + lpath, + rpath, + recursive=False, + callback=DEFAULT_CALLBACK, + batch_size=None, + maxdepth=None, + **kwargs, + ): + """Copy file(s) from local. + + Copies a specific file or tree of files (if recursive=True). If rpath + ends with a "/", it will be assumed to be a directory, and target files + will go within. + + The put_file method will be called concurrently on a batch of files. The + batch_size option can configure the amount of futures that can be executed + at the same time. If it is -1, then all the files will be uploaded concurrently. + The default can be set for this instance by passing "batch_size" in the + constructor, or for all instances by setting the "gather_batch_size" key + in ``fsspec.config.conf``, falling back to 1/8th of the system limit . + """ + if isinstance(lpath, list) and isinstance(rpath, list): + # No need to expand paths when both source and destination + # are provided as lists + rpaths = rpath + lpaths = lpath + else: + source_is_str = isinstance(lpath, str) + if source_is_str: + lpath = make_path_posix(lpath) + fs = LocalFileSystem() + lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth) + if source_is_str and (not recursive or maxdepth is not None): + # Non-recursive glob does not copy directories + lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))] + if not lpaths: + return + + source_is_file = len(lpaths) == 1 + dest_is_dir = isinstance(rpath, str) and ( + trailing_sep(rpath) or await self._isdir(rpath) + ) + + rpath = self._strip_protocol(rpath) + exists = source_is_str and ( + (has_magic(lpath) and source_is_file) + or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath)) + ) + rpaths = other_paths( + lpaths, + rpath, + exists=exists, + flatten=not source_is_str, + ) + + is_dir = {l: os.path.isdir(l) for l in lpaths} + rdirs = [r for l, r in zip(lpaths, rpaths) if is_dir[l]] + file_pairs = [(l, r) for l, r in zip(lpaths, rpaths) if not is_dir[l]] + + await asyncio.gather(*[self._makedirs(d, exist_ok=True) for d in rdirs]) + batch_size = batch_size or self.batch_size + + coros = [] + callback.set_size(len(file_pairs)) + for lfile, rfile in file_pairs: + put_file = callback.branch_coro(self._put_file) + coros.append(put_file(lfile, rfile, **kwargs)) + + return await _run_coros_in_chunks( + coros, batch_size=batch_size, callback=callback + ) + + async def _get_file(self, rpath, lpath, **kwargs): + raise NotImplementedError + + async def _get( + self, + rpath, + lpath, + recursive=False, + callback=DEFAULT_CALLBACK, + maxdepth=None, + **kwargs, + ): + """Copy file(s) to local. + + Copies a specific file or tree of files (if recursive=True). If lpath + ends with a "/", it will be assumed to be a directory, and target files + will go within. 
Can submit a list of paths, which may be glob-patterns + and will be expanded. + + The get_file method will be called concurrently on a batch of files. The + batch_size option can configure the amount of futures that can be executed + at the same time. If it is -1, then all the files will be uploaded concurrently. + The default can be set for this instance by passing "batch_size" in the + constructor, or for all instances by setting the "gather_batch_size" key + in ``fsspec.config.conf``, falling back to 1/8th of the system limit . + """ + if isinstance(lpath, list) and isinstance(rpath, list): + # No need to expand paths when both source and destination + # are provided as lists + rpaths = rpath + lpaths = lpath + else: + source_is_str = isinstance(rpath, str) + # First check for rpath trailing slash as _strip_protocol removes it. + source_not_trailing_sep = source_is_str and not trailing_sep(rpath) + rpath = self._strip_protocol(rpath) + rpaths = await self._expand_path( + rpath, recursive=recursive, maxdepth=maxdepth + ) + if source_is_str and (not recursive or maxdepth is not None): + # Non-recursive glob does not copy directories + rpaths = [ + p for p in rpaths if not (trailing_sep(p) or await self._isdir(p)) + ] + if not rpaths: + return + + lpath = make_path_posix(lpath) + source_is_file = len(rpaths) == 1 + dest_is_dir = isinstance(lpath, str) and ( + trailing_sep(lpath) or LocalFileSystem().isdir(lpath) + ) + + exists = source_is_str and ( + (has_magic(rpath) and source_is_file) + or (not has_magic(rpath) and dest_is_dir and source_not_trailing_sep) + ) + lpaths = other_paths( + rpaths, + lpath, + exists=exists, + flatten=not source_is_str, + ) + + [os.makedirs(os.path.dirname(lp), exist_ok=True) for lp in lpaths] + batch_size = kwargs.pop("batch_size", self.batch_size) + + coros = [] + callback.set_size(len(lpaths)) + for lpath, rpath in zip(lpaths, rpaths): + get_file = callback.branch_coro(self._get_file) + coros.append(get_file(rpath, lpath, **kwargs)) + return await _run_coros_in_chunks( + coros, batch_size=batch_size, callback=callback + ) + + async def _isfile(self, path): + try: + return (await self._info(path))["type"] == "file" + except: # noqa: E722 + return False + + async def _isdir(self, path): + try: + return (await self._info(path))["type"] == "directory" + except OSError: + return False + + async def _size(self, path): + return (await self._info(path)).get("size", None) + + async def _sizes(self, paths, batch_size=None): + batch_size = batch_size or self.batch_size + return await _run_coros_in_chunks( + [self._size(p) for p in paths], batch_size=batch_size + ) + + async def _exists(self, path, **kwargs): + try: + await self._info(path, **kwargs) + return True + except FileNotFoundError: + return False + + async def _info(self, path, **kwargs): + raise NotImplementedError + + async def _ls(self, path, detail=True, **kwargs): + raise NotImplementedError + + async def _walk(self, path, maxdepth=None, on_error="omit", **kwargs): + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + path = self._strip_protocol(path) + full_dirs = {} + dirs = {} + files = {} + + detail = kwargs.pop("detail", False) + try: + listing = await self._ls(path, detail=True, **kwargs) + except (FileNotFoundError, OSError) as e: + if on_error == "raise": + raise + elif callable(on_error): + on_error(e) + if detail: + yield path, {}, {} + else: + yield path, [], [] + return + + for info in listing: + # each info name must be at least [path]/part , but 
here + # we check also for names like [path]/part/ + pathname = info["name"].rstrip("/") + name = pathname.rsplit("/", 1)[-1] + if info["type"] == "directory" and pathname != path: + # do not include "self" path + full_dirs[name] = pathname + dirs[name] = info + elif pathname == path: + # file-like with same name as give path + files[""] = info + else: + files[name] = info + + if detail: + yield path, dirs, files + else: + yield path, list(dirs), list(files) + + if maxdepth is not None: + maxdepth -= 1 + if maxdepth < 1: + return + + for d in dirs: + async for _ in self._walk( + full_dirs[d], maxdepth=maxdepth, detail=detail, **kwargs + ): + yield _ + + async def _glob(self, path, maxdepth=None, **kwargs): + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + import re + + seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,) + ends_with_sep = path.endswith(seps) # _strip_protocol strips trailing slash + path = self._strip_protocol(path) + append_slash_to_dirname = ends_with_sep or path.endswith( + tuple(sep + "**" for sep in seps) + ) + idx_star = path.find("*") if path.find("*") >= 0 else len(path) + idx_qmark = path.find("?") if path.find("?") >= 0 else len(path) + idx_brace = path.find("[") if path.find("[") >= 0 else len(path) + + min_idx = min(idx_star, idx_qmark, idx_brace) + + detail = kwargs.pop("detail", False) + withdirs = kwargs.pop("withdirs", True) + + if not has_magic(path): + if await self._exists(path, **kwargs): + if not detail: + return [path] + else: + return {path: await self._info(path, **kwargs)} + else: + if not detail: + return [] # glob of non-existent returns empty + else: + return {} + elif "/" in path[:min_idx]: + min_idx = path[:min_idx].rindex("/") + root = path[: min_idx + 1] + depth = path[min_idx + 1 :].count("/") + 1 + else: + root = "" + depth = path[min_idx + 1 :].count("/") + 1 + + if "**" in path: + if maxdepth is not None: + idx_double_stars = path.find("**") + depth_double_stars = path[idx_double_stars:].count("/") + 1 + depth = depth - depth_double_stars + maxdepth + else: + depth = None + + allpaths = await self._find( + root, maxdepth=depth, withdirs=withdirs, detail=True, **kwargs + ) + + pattern = glob_translate(path + ("/" if ends_with_sep else "")) + pattern = re.compile(pattern) + + out = { + p: info + for p, info in sorted(allpaths.items()) + if pattern.match( + p + "/" + if append_slash_to_dirname and info["type"] == "directory" + else p + ) + } + + if detail: + return out + else: + return list(out) + + async def _du(self, path, total=True, maxdepth=None, **kwargs): + sizes = {} + # async for? + for f in await self._find(path, maxdepth=maxdepth, **kwargs): + info = await self._info(f) + sizes[info["name"]] = info["size"] + if total: + return sum(sizes.values()) + else: + return sizes + + async def _find(self, path, maxdepth=None, withdirs=False, **kwargs): + path = self._strip_protocol(path) + out = {} + detail = kwargs.pop("detail", False) + + # Add the root directory if withdirs is requested + # This is needed for posix glob compliance + if withdirs and path != "" and await self._isdir(path): + out[path] = await self._info(path) + + # async for? 
+ async for _, dirs, files in self._walk(path, maxdepth, detail=True, **kwargs): + if withdirs: + files.update(dirs) + out.update({info["name"]: info for name, info in files.items()}) + if not out and (await self._isfile(path)): + # walk works on directories, but find should also return [path] + # when path happens to be a file + out[path] = {} + names = sorted(out) + if not detail: + return names + else: + return {name: out[name] for name in names} + + async def _expand_path(self, path, recursive=False, maxdepth=None): + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + if isinstance(path, str): + out = await self._expand_path([path], recursive, maxdepth) + else: + out = set() + path = [self._strip_protocol(p) for p in path] + for p in path: # can gather here + if has_magic(p): + bit = set(await self._glob(p, maxdepth=maxdepth)) + out |= bit + if recursive: + # glob call above expanded one depth so if maxdepth is defined + # then decrement it in expand_path call below. If it is zero + # after decrementing then avoid expand_path call. + if maxdepth is not None and maxdepth <= 1: + continue + out |= set( + await self._expand_path( + list(bit), + recursive=recursive, + maxdepth=maxdepth - 1 if maxdepth is not None else None, + ) + ) + continue + elif recursive: + rec = set(await self._find(p, maxdepth=maxdepth, withdirs=True)) + out |= rec + if p not in out and (recursive is False or (await self._exists(p))): + # should only check once, for the root + out.add(p) + if not out: + raise FileNotFoundError(path) + return sorted(out) + + async def _mkdir(self, path, create_parents=True, **kwargs): + pass # not necessary to implement, may not have directories + + async def _makedirs(self, path, exist_ok=False): + pass # not necessary to implement, may not have directories + + async def open_async(self, path, mode="rb", **kwargs): + if "b" not in mode or kwargs.get("compression"): + raise ValueError + raise NotImplementedError + + +def mirror_sync_methods(obj): + """Populate sync and async methods for obj + + For each method will create a sync version if the name refers to an async method + (coroutine) and there is no override in the child class; will create an async + method for the corresponding sync method if there is no implementation. 
+ + Uses the methods specified in + - async_methods: the set that an implementation is expected to provide + - default_async_methods: that can be derived from their sync version in + AbstractFileSystem + - AsyncFileSystem: async-specific default coroutines + """ + from fsspec import AbstractFileSystem + + for method in async_methods + dir(AsyncFileSystem): + if not method.startswith("_"): + continue + smethod = method[1:] + if private.match(method): + isco = inspect.iscoroutinefunction(getattr(obj, method, None)) + unsync = getattr(getattr(obj, smethod, False), "__func__", None) + is_default = unsync is getattr(AbstractFileSystem, smethod, "") + if isco and is_default: + mth = sync_wrapper(getattr(obj, method), obj=obj) + setattr(obj, smethod, mth) + if not mth.__doc__: + mth.__doc__ = getattr( + getattr(AbstractFileSystem, smethod, None), "__doc__", "" + ) + + +class FSSpecCoroutineCancel(Exception): + pass + + +def _dump_running_tasks( + printout=True, cancel=True, exc=FSSpecCoroutineCancel, with_task=False +): + import traceback + + tasks = [t for t in asyncio.tasks.all_tasks(loop[0]) if not t.done()] + if printout: + [task.print_stack() for task in tasks] + out = [ + { + "locals": task._coro.cr_frame.f_locals, + "file": task._coro.cr_frame.f_code.co_filename, + "firstline": task._coro.cr_frame.f_code.co_firstlineno, + "linelo": task._coro.cr_frame.f_lineno, + "stack": traceback.format_stack(task._coro.cr_frame), + "task": task if with_task else None, + } + for task in tasks + ] + if cancel: + for t in tasks: + cbs = t._callbacks + t.cancel() + asyncio.futures.Future.set_exception(t, exc) + asyncio.futures.Future.cancel(t) + [cb[0](t) for cb in cbs] # cancels any dependent concurrent.futures + try: + t._coro.throw(exc) # exits coro, unless explicitly handled + except exc: + pass + return out + + +class AbstractAsyncStreamedFile(AbstractBufferedFile): + # no read buffering, and always auto-commit + # TODO: readahead might still be useful here, but needs async version + + async def read(self, length=-1): + """ + Return data from cache, or fetch pieces as necessary + + Parameters + ---------- + length: int (-1) + Number of bytes to read; if <0, all remaining bytes. + """ + length = -1 if length is None else int(length) + if self.mode != "rb": + raise ValueError("File not in read mode") + if length < 0: + length = self.size - self.loc + if self.closed: + raise ValueError("I/O operation on closed file.") + if length == 0: + # don't even bother calling fetch + return b"" + out = await self._fetch_range(self.loc, self.loc + length) + self.loc += len(out) + return out + + async def write(self, data): + """ + Write data to buffer. + + Buffer only sent on flush() or if buffer is greater than + or equal to blocksize. + + Parameters + ---------- + data: bytes + Set of bytes to be written. 
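# Illustrative aside on mirror_sync_methods(): a backend that only defines coroutines
# still exposes blocking calls. EchoFileSystem is a toy stand-in, not a real backend,
# and the automatic wrapping is assumed to happen when the instance is constructed.
from fsspec.asyn import AsyncFileSystem

class EchoFileSystem(AsyncFileSystem):
    protocol = "echo"

    async def _cat_file(self, path, start=None, end=None, **kwargs):
        # pretend the "remote" content is just the path repeated twice
        return (path * 2).encode()[start:end]

fs = EchoFileSystem()        # asynchronous=False, so a sync cat_file wrapper is expected
print(fs.cat_file("/x"))     # b'/x/x' -- blocking call driving the hidden IO loop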
+ """ + if self.mode not in {"wb", "ab"}: + raise ValueError("File not in write mode") + if self.closed: + raise ValueError("I/O operation on closed file.") + if self.forced: + raise ValueError("This file has been force-flushed, can only close") + out = self.buffer.write(data) + self.loc += out + if self.buffer.tell() >= self.blocksize: + await self.flush() + return out + + async def close(self): + """Close file + + Finalizes writes, discards cache + """ + if getattr(self, "_unclosable", False): + return + if self.closed: + return + if self.mode == "rb": + self.cache = None + else: + if not self.forced: + await self.flush(force=True) + + if self.fs is not None: + self.fs.invalidate_cache(self.path) + self.fs.invalidate_cache(self.fs._parent(self.path)) + + self.closed = True + + async def flush(self, force=False): + if self.closed: + raise ValueError("Flush on closed file") + if force and self.forced: + raise ValueError("Force flush cannot be called more than once") + if force: + self.forced = True + + if self.mode not in {"wb", "ab"}: + # no-op to flush on read-mode + return + + if not force and self.buffer.tell() < self.blocksize: + # Defer write on small block + return + + if self.offset is None: + # Initialize a multipart upload + self.offset = 0 + try: + await self._initiate_upload() + except: + self.closed = True + raise + + if await self._upload_chunk(final=force) is not False: + self.offset += self.buffer.seek(0, 2) + self.buffer = io.BytesIO() + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close() + + async def _fetch_range(self, start, end): + raise NotImplementedError + + async def _initiate_upload(self): + pass + + async def _upload_chunk(self, final=False): + raise NotImplementedError diff --git a/env/lib/python3.13/site-packages/fsspec/caching.py b/env/lib/python3.13/site-packages/fsspec/caching.py new file mode 100644 index 0000000000000000000000000000000000000000..e91317ed8852b513eb512128e15d2cce04c2f0e4 --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/caching.py @@ -0,0 +1,1011 @@ +from __future__ import annotations + +import collections +import functools +import logging +import math +import os +import threading +from collections import OrderedDict +from collections.abc import Callable +from concurrent.futures import Future, ThreadPoolExecutor +from itertools import groupby +from operator import itemgetter +from typing import TYPE_CHECKING, Any, ClassVar, Generic, NamedTuple, TypeVar + +if TYPE_CHECKING: + import mmap + + from typing_extensions import ParamSpec + + P = ParamSpec("P") +else: + P = TypeVar("P") + +T = TypeVar("T") + + +logger = logging.getLogger("fsspec") + +Fetcher = Callable[[int, int], bytes] # Maps (start, end) to bytes +MultiFetcher = Callable[[list[int, int]], bytes] # Maps [(start, end)] to bytes + + +class BaseCache: + """Pass-though cache: doesn't keep anything, calls every time + + Acts as base class for other cachers + + Parameters + ---------- + blocksize: int + How far to read ahead in numbers of bytes + fetcher: func + Function of the form f(start, end) which gets bytes from remote as + specified + size: int + How big this file is + """ + + name: ClassVar[str] = "none" + + def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None: + self.blocksize = blocksize + self.nblocks = 0 + self.fetcher = fetcher + self.size = size + self.hit_count = 0 + self.miss_count = 0 + # the bytes that we actually requested + self.total_requested_bytes = 0 + + def 
_fetch(self, start: int | None, stop: int | None) -> bytes: + if start is None: + start = 0 + if stop is None: + stop = self.size + if start >= self.size or start >= stop: + return b"" + return self.fetcher(start, stop) + + def _reset_stats(self) -> None: + """Reset hit and miss counts for a more ganular report e.g. by file.""" + self.hit_count = 0 + self.miss_count = 0 + self.total_requested_bytes = 0 + + def _log_stats(self) -> str: + """Return a formatted string of the cache statistics.""" + if self.hit_count == 0 and self.miss_count == 0: + # a cache that does nothing, this is for logs only + return "" + return f" , {self.name}: {self.hit_count} hits, {self.miss_count} misses, {self.total_requested_bytes} total requested bytes" + + def __repr__(self) -> str: + # TODO: use rich for better formatting + return f""" + <{self.__class__.__name__}: + block size : {self.blocksize} + block count : {self.nblocks} + file size : {self.size} + cache hits : {self.hit_count} + cache misses: {self.miss_count} + total requested bytes: {self.total_requested_bytes}> + """ + + +class MMapCache(BaseCache): + """memory-mapped sparse file cache + + Opens temporary file, which is filled blocks-wise when data is requested. + Ensure there is enough disc space in the temporary location. + + This cache method might only work on posix + + Parameters + ---------- + blocksize: int + How far to read ahead in numbers of bytes + fetcher: Fetcher + Function of the form f(start, end) which gets bytes from remote as + specified + size: int + How big this file is + location: str + Where to create the temporary file. If None, a temporary file is + created using tempfile.TemporaryFile(). + blocks: set[int] + Set of block numbers that have already been fetched. If None, an empty + set is created. + multi_fetcher: MultiFetcher + Function of the form f([(start, end)]) which gets bytes from remote + as specified. This function is used to fetch multiple blocks at once. + If not specified, the fetcher function is used instead. + """ + + name = "mmap" + + def __init__( + self, + blocksize: int, + fetcher: Fetcher, + size: int, + location: str | None = None, + blocks: set[int] | None = None, + multi_fetcher: MultiFetcher | None = None, + ) -> None: + super().__init__(blocksize, fetcher, size) + self.blocks = set() if blocks is None else blocks + self.location = location + self.multi_fetcher = multi_fetcher + self.cache = self._makefile() + + def _makefile(self) -> mmap.mmap | bytearray: + import mmap + import tempfile + + if self.size == 0: + return bytearray() + + # posix version + if self.location is None or not os.path.exists(self.location): + if self.location is None: + fd = tempfile.TemporaryFile() + self.blocks = set() + else: + fd = open(self.location, "wb+") + fd.seek(self.size - 1) + fd.write(b"1") + fd.flush() + else: + fd = open(self.location, "r+b") + + return mmap.mmap(fd.fileno(), self.size) + + def _fetch(self, start: int | None, end: int | None) -> bytes: + logger.debug(f"MMap cache fetching {start}-{end}") + if start is None: + start = 0 + if end is None: + end = self.size + if start >= self.size or start >= end: + return b"" + start_block = start // self.blocksize + end_block = end // self.blocksize + block_range = range(start_block, end_block + 1) + # Determine which blocks need to be fetched. This sequence is sorted by construction. 
+ need = (i for i in block_range if i not in self.blocks) + # Count the number of blocks already cached + self.hit_count += sum(1 for i in block_range if i in self.blocks) + + ranges = [] + + # Consolidate needed blocks. + # Algorithm adapted from Python 2.x itertools documentation. + # We are grouping an enumerated sequence of blocks. By comparing when the difference + # between an ascending range (provided by enumerate) and the needed block numbers + # we can detect when the block number skips values. The key computes this difference. + # Whenever the difference changes, we know that we have previously cached block(s), + # and a new group is started. In other words, this algorithm neatly groups + # runs of consecutive block numbers so they can be fetched together. + for _, _blocks in groupby(enumerate(need), key=lambda x: x[0] - x[1]): + # Extract the blocks from the enumerated sequence + _blocks = tuple(map(itemgetter(1), _blocks)) + # Compute start of first block + sstart = _blocks[0] * self.blocksize + # Compute the end of the last block. Last block may not be full size. + send = min(_blocks[-1] * self.blocksize + self.blocksize, self.size) + + # Fetch bytes (could be multiple consecutive blocks) + self.total_requested_bytes += send - sstart + logger.debug( + f"MMap get blocks {_blocks[0]}-{_blocks[-1]} ({sstart}-{send})" + ) + ranges.append((sstart, send)) + + # Update set of cached blocks + self.blocks.update(_blocks) + # Update cache statistics with number of blocks we had to cache + self.miss_count += len(_blocks) + + if not ranges: + return self.cache[start:end] + + if self.multi_fetcher: + logger.debug(f"MMap get blocks {ranges}") + for idx, r in enumerate(self.multi_fetcher(ranges)): + (sstart, send) = ranges[idx] + logger.debug(f"MMap copy block ({sstart}-{send}") + self.cache[sstart:send] = r + else: + for sstart, send in ranges: + logger.debug(f"MMap get block ({sstart}-{send}") + self.cache[sstart:send] = self.fetcher(sstart, send) + + return self.cache[start:end] + + def __getstate__(self) -> dict[str, Any]: + state = self.__dict__.copy() + # Remove the unpicklable entries. + del state["cache"] + return state + + def __setstate__(self, state: dict[str, Any]) -> None: + # Restore instance attributes + self.__dict__.update(state) + self.cache = self._makefile() + + +class ReadAheadCache(BaseCache): + """Cache which reads only when we get beyond a block of data + + This is a much simpler version of BytesCache, and does not attempt to + fill holes in the cache or keep fragments alive. It is best suited to + many small reads in a sequential order (e.g., reading lines from a file). 
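# Illustrative aside: the groupby(enumerate(...), key=index - value) trick used above,
# shown on plain integers. Consecutive block numbers share a key, so every group is one
# contiguous run that can be fetched with a single range request.
from itertools import groupby
from operator import itemgetter

need = [2, 3, 4, 8, 9, 15]           # block numbers still missing from the cache
runs = [
    tuple(map(itemgetter(1), grp))
    for _, grp in groupby(enumerate(need), key=lambda x: x[0] - x[1])
]
print(runs)   # [(2, 3, 4), (8, 9), (15,)]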
+ """ + + name = "readahead" + + def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None: + super().__init__(blocksize, fetcher, size) + self.cache = b"" + self.start = 0 + self.end = 0 + + def _fetch(self, start: int | None, end: int | None) -> bytes: + if start is None: + start = 0 + if end is None or end > self.size: + end = self.size + if start >= self.size or start >= end: + return b"" + l = end - start + if start >= self.start and end <= self.end: + # cache hit + self.hit_count += 1 + return self.cache[start - self.start : end - self.start] + elif self.start <= start < self.end: + # partial hit + self.miss_count += 1 + part = self.cache[start - self.start :] + l -= len(part) + start = self.end + else: + # miss + self.miss_count += 1 + part = b"" + end = min(self.size, end + self.blocksize) + self.total_requested_bytes += end - start + self.cache = self.fetcher(start, end) # new block replaces old + self.start = start + self.end = self.start + len(self.cache) + return part + self.cache[:l] + + +class FirstChunkCache(BaseCache): + """Caches the first block of a file only + + This may be useful for file types where the metadata is stored in the header, + but is randomly accessed. + """ + + name = "first" + + def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None: + if blocksize > size: + # this will buffer the whole thing + blocksize = size + super().__init__(blocksize, fetcher, size) + self.cache: bytes | None = None + + def _fetch(self, start: int | None, end: int | None) -> bytes: + start = start or 0 + if start > self.size: + logger.debug("FirstChunkCache: requested start > file size") + return b"" + + end = min(end, self.size) + + if start < self.blocksize: + if self.cache is None: + self.miss_count += 1 + if end > self.blocksize: + self.total_requested_bytes += end + data = self.fetcher(0, end) + self.cache = data[: self.blocksize] + return data[start:] + self.cache = self.fetcher(0, self.blocksize) + self.total_requested_bytes += self.blocksize + part = self.cache[start:end] + if end > self.blocksize: + self.total_requested_bytes += end - self.blocksize + part += self.fetcher(self.blocksize, end) + self.hit_count += 1 + return part + else: + self.miss_count += 1 + self.total_requested_bytes += end - start + return self.fetcher(start, end) + + +class BlockCache(BaseCache): + """ + Cache holding memory as a set of blocks. + + Requests are only ever made ``blocksize`` at a time, and are + stored in an LRU cache. The least recently accessed block is + discarded when more than ``maxblocks`` are stored. + + Parameters + ---------- + blocksize : int + The number of bytes to store in each block. + Requests are only ever made for ``blocksize``, so this + should balance the overhead of making a request against + the granularity of the blocks. + fetcher : Callable + size : int + The total size of the file being cached. + maxblocks : int + The maximum number of blocks to cache for. The maximum memory + use for this cache is then ``blocksize * maxblocks``. + """ + + name = "blockcache" + + def __init__( + self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32 + ) -> None: + super().__init__(blocksize, fetcher, size) + self.nblocks = math.ceil(size / blocksize) + self.maxblocks = maxblocks + self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block) + + def cache_info(self): + """ + The statistics on the block cache. + + Returns + ------- + NamedTuple + Returned directly from the LRU Cache used internally. 
+ """ + return self._fetch_block_cached.cache_info() + + def __getstate__(self) -> dict[str, Any]: + state = self.__dict__ + del state["_fetch_block_cached"] + return state + + def __setstate__(self, state: dict[str, Any]) -> None: + self.__dict__.update(state) + self._fetch_block_cached = functools.lru_cache(state["maxblocks"])( + self._fetch_block + ) + + def _fetch(self, start: int | None, end: int | None) -> bytes: + if start is None: + start = 0 + if end is None: + end = self.size + if start >= self.size or start >= end: + return b"" + + # byte position -> block numbers + start_block_number = start // self.blocksize + end_block_number = end // self.blocksize + + # these are cached, so safe to do multiple calls for the same start and end. + for block_number in range(start_block_number, end_block_number + 1): + self._fetch_block_cached(block_number) + + return self._read_cache( + start, + end, + start_block_number=start_block_number, + end_block_number=end_block_number, + ) + + def _fetch_block(self, block_number: int) -> bytes: + """ + Fetch the block of data for `block_number`. + """ + if block_number > self.nblocks: + raise ValueError( + f"'block_number={block_number}' is greater than " + f"the number of blocks ({self.nblocks})" + ) + + start = block_number * self.blocksize + end = start + self.blocksize + self.total_requested_bytes += end - start + self.miss_count += 1 + logger.info("BlockCache fetching block %d", block_number) + block_contents = super()._fetch(start, end) + return block_contents + + def _read_cache( + self, start: int, end: int, start_block_number: int, end_block_number: int + ) -> bytes: + """ + Read from our block cache. + + Parameters + ---------- + start, end : int + The start and end byte positions. + start_block_number, end_block_number : int + The start and end block numbers. + """ + start_pos = start % self.blocksize + end_pos = end % self.blocksize + + self.hit_count += 1 + if start_block_number == end_block_number: + block: bytes = self._fetch_block_cached(start_block_number) + return block[start_pos:end_pos] + + else: + # read from the initial + out = [self._fetch_block_cached(start_block_number)[start_pos:]] + + # intermediate blocks + # Note: it'd be nice to combine these into one big request. However + # that doesn't play nicely with our LRU cache. + out.extend( + map( + self._fetch_block_cached, + range(start_block_number + 1, end_block_number), + ) + ) + + # final block + out.append(self._fetch_block_cached(end_block_number)[:end_pos]) + + return b"".join(out) + + +class BytesCache(BaseCache): + """Cache which holds data in a in-memory bytes object + + Implements read-ahead by the block size, for semi-random reads progressing + through the file. + + Parameters + ---------- + trim: bool + As we read more data, whether to discard the start of the buffer when + we are more than a blocksize ahead of it. + """ + + name: ClassVar[str] = "bytes" + + def __init__( + self, blocksize: int, fetcher: Fetcher, size: int, trim: bool = True + ) -> None: + super().__init__(blocksize, fetcher, size) + self.cache = b"" + self.start: int | None = None + self.end: int | None = None + self.trim = trim + + def _fetch(self, start: int | None, end: int | None) -> bytes: + # TODO: only set start/end after fetch, in case it fails? + # is this where retry logic might go? 
+ if start is None: + start = 0 + if end is None: + end = self.size + if start >= self.size or start >= end: + return b"" + if ( + self.start is not None + and start >= self.start + and self.end is not None + and end < self.end + ): + # cache hit: we have all the required data + offset = start - self.start + self.hit_count += 1 + return self.cache[offset : offset + end - start] + + if self.blocksize: + bend = min(self.size, end + self.blocksize) + else: + bend = end + + if bend == start or start > self.size: + return b"" + + if (self.start is None or start < self.start) and ( + self.end is None or end > self.end + ): + # First read, or extending both before and after + self.total_requested_bytes += bend - start + self.miss_count += 1 + self.cache = self.fetcher(start, bend) + self.start = start + else: + assert self.start is not None + assert self.end is not None + self.miss_count += 1 + + if start < self.start: + if self.end is None or self.end - end > self.blocksize: + self.total_requested_bytes += bend - start + self.cache = self.fetcher(start, bend) + self.start = start + else: + self.total_requested_bytes += self.start - start + new = self.fetcher(start, self.start) + self.start = start + self.cache = new + self.cache + elif self.end is not None and bend > self.end: + if self.end > self.size: + pass + elif end - self.end > self.blocksize: + self.total_requested_bytes += bend - start + self.cache = self.fetcher(start, bend) + self.start = start + else: + self.total_requested_bytes += bend - self.end + new = self.fetcher(self.end, bend) + self.cache = self.cache + new + + self.end = self.start + len(self.cache) + offset = start - self.start + out = self.cache[offset : offset + end - start] + if self.trim: + num = (self.end - self.start) // (self.blocksize + 1) + if num > 1: + self.start += self.blocksize * num + self.cache = self.cache[self.blocksize * num :] + return out + + def __len__(self) -> int: + return len(self.cache) + + +class AllBytes(BaseCache): + """Cache entire contents of the file""" + + name: ClassVar[str] = "all" + + def __init__( + self, + blocksize: int | None = None, + fetcher: Fetcher | None = None, + size: int | None = None, + data: bytes | None = None, + ) -> None: + super().__init__(blocksize, fetcher, size) # type: ignore[arg-type] + if data is None: + self.miss_count += 1 + self.total_requested_bytes += self.size + data = self.fetcher(0, self.size) + self.data = data + + def _fetch(self, start: int | None, stop: int | None) -> bytes: + self.hit_count += 1 + return self.data[start:stop] + + +class KnownPartsOfAFile(BaseCache): + """ + Cache holding known file parts. + + Parameters + ---------- + blocksize: int + How far to read ahead in numbers of bytes + fetcher: func + Function of the form f(start, end) which gets bytes from remote as + specified + size: int + How big this file is + data: dict + A dictionary mapping explicit `(start, stop)` file-offset tuples + with known bytes. + strict: bool, default True + Whether to fetch reads that go beyond a known byte-range boundary. + If `False`, any read that ends outside a known part will be zero + padded. Note that zero padding will not be used for reads that + begin outside a known byte-range. 
+ """ + + name: ClassVar[str] = "parts" + + def __init__( + self, + blocksize: int, + fetcher: Fetcher, + size: int, + data: dict[tuple[int, int], bytes] | None = None, + strict: bool = False, + **_: Any, + ): + super().__init__(blocksize, fetcher, size) + self.strict = strict + + # simple consolidation of contiguous blocks + if data: + old_offsets = sorted(data.keys()) + offsets = [old_offsets[0]] + blocks = [data.pop(old_offsets[0])] + for start, stop in old_offsets[1:]: + start0, stop0 = offsets[-1] + if start == stop0: + offsets[-1] = (start0, stop) + blocks[-1] += data.pop((start, stop)) + else: + offsets.append((start, stop)) + blocks.append(data.pop((start, stop))) + + self.data = dict(zip(offsets, blocks)) + else: + self.data = {} + + @property + def size(self): + return sum(_[1] - _[0] for _ in self.data) + + @size.setter + def size(self, value): + pass + + @property + def nblocks(self): + return len(self.data) + + @nblocks.setter + def nblocks(self, value): + pass + + def _fetch(self, start: int | None, stop: int | None) -> bytes: + if start is None: + start = 0 + if stop is None: + stop = self.size + self.total_requested_bytes += stop - start + + out = b"" + started = False + loc_old = 0 + for loc0, loc1 in sorted(self.data): + if (loc0 <= start < loc1) and (loc0 <= stop <= loc1): + # entirely within the block + off = start - loc0 + self.hit_count += 1 + return self.data[(loc0, loc1)][off : off + stop - start] + if stop <= loc0: + break + if started and loc0 > loc_old: + # a gap where we need data + self.miss_count += 1 + if self.strict: + raise ValueError + out += b"\x00" * (loc0 - loc_old) + if loc0 <= start < loc1: + # found the start + self.hit_count += 1 + off = start - loc0 + out = self.data[(loc0, loc1)][off : off + stop - start] + started = True + elif start < loc0 and stop > loc1: + # the whole block + self.hit_count += 1 + out += self.data[(loc0, loc1)] + elif loc0 <= stop <= loc1: + # end block + self.hit_count += 1 + return out + self.data[(loc0, loc1)][: stop - loc0] + loc_old = loc1 + self.miss_count += 1 + if started and not self.strict: + return out + b"\x00" * (stop - loc_old) + raise ValueError + + +class UpdatableLRU(Generic[P, T]): + """ + Custom implementation of LRU cache that allows updating keys + + Used by BackgroudBlockCache + """ + + class CacheInfo(NamedTuple): + hits: int + misses: int + maxsize: int + currsize: int + + def __init__(self, func: Callable[P, T], max_size: int = 128) -> None: + self._cache: OrderedDict[Any, T] = collections.OrderedDict() + self._func = func + self._max_size = max_size + self._hits = 0 + self._misses = 0 + self._lock = threading.Lock() + + def __call__(self, *args: P.args, **kwargs: P.kwargs) -> T: + if kwargs: + raise TypeError(f"Got unexpected keyword argument {kwargs.keys()}") + with self._lock: + if args in self._cache: + self._cache.move_to_end(args) + self._hits += 1 + return self._cache[args] + + result = self._func(*args, **kwargs) + + with self._lock: + self._cache[args] = result + self._misses += 1 + if len(self._cache) > self._max_size: + self._cache.popitem(last=False) + + return result + + def is_key_cached(self, *args: Any) -> bool: + with self._lock: + return args in self._cache + + def add_key(self, result: T, *args: Any) -> None: + with self._lock: + self._cache[args] = result + if len(self._cache) > self._max_size: + self._cache.popitem(last=False) + + def cache_info(self) -> UpdatableLRU.CacheInfo: + with self._lock: + return self.CacheInfo( + maxsize=self._max_size, + currsize=len(self._cache), + 
hits=self._hits, + misses=self._misses, + ) + + +class BackgroundBlockCache(BaseCache): + """ + Cache holding memory as a set of blocks with pre-loading of + the next block in the background. + + Requests are only ever made ``blocksize`` at a time, and are + stored in an LRU cache. The least recently accessed block is + discarded when more than ``maxblocks`` are stored. If the + next block is not in cache, it is loaded in a separate thread + in non-blocking way. + + Parameters + ---------- + blocksize : int + The number of bytes to store in each block. + Requests are only ever made for ``blocksize``, so this + should balance the overhead of making a request against + the granularity of the blocks. + fetcher : Callable + size : int + The total size of the file being cached. + maxblocks : int + The maximum number of blocks to cache for. The maximum memory + use for this cache is then ``blocksize * maxblocks``. + """ + + name: ClassVar[str] = "background" + + def __init__( + self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32 + ) -> None: + super().__init__(blocksize, fetcher, size) + self.nblocks = math.ceil(size / blocksize) + self.maxblocks = maxblocks + self._fetch_block_cached = UpdatableLRU(self._fetch_block, maxblocks) + + self._thread_executor = ThreadPoolExecutor(max_workers=1) + self._fetch_future_block_number: int | None = None + self._fetch_future: Future[bytes] | None = None + self._fetch_future_lock = threading.Lock() + + def cache_info(self) -> UpdatableLRU.CacheInfo: + """ + The statistics on the block cache. + + Returns + ------- + NamedTuple + Returned directly from the LRU Cache used internally. + """ + return self._fetch_block_cached.cache_info() + + def __getstate__(self) -> dict[str, Any]: + state = self.__dict__ + del state["_fetch_block_cached"] + del state["_thread_executor"] + del state["_fetch_future_block_number"] + del state["_fetch_future"] + del state["_fetch_future_lock"] + return state + + def __setstate__(self, state) -> None: + self.__dict__.update(state) + self._fetch_block_cached = UpdatableLRU(self._fetch_block, state["maxblocks"]) + self._thread_executor = ThreadPoolExecutor(max_workers=1) + self._fetch_future_block_number = None + self._fetch_future = None + self._fetch_future_lock = threading.Lock() + + def _fetch(self, start: int | None, end: int | None) -> bytes: + if start is None: + start = 0 + if end is None: + end = self.size + if start >= self.size or start >= end: + return b"" + + # byte position -> block numbers + start_block_number = start // self.blocksize + end_block_number = end // self.blocksize + + fetch_future_block_number = None + fetch_future = None + with self._fetch_future_lock: + # Background thread is running. Check we we can or must join it. + if self._fetch_future is not None: + assert self._fetch_future_block_number is not None + if self._fetch_future.done(): + logger.info("BlockCache joined background fetch without waiting.") + self._fetch_block_cached.add_key( + self._fetch_future.result(), self._fetch_future_block_number + ) + # Cleanup the fetch variables. Done with fetching the block. 
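                    # The completed result has already been folded into the LRU cache
                    # via ``add_key`` above; clearing the future below lets a fresh
                    # read-ahead be scheduled at the end of this ``_fetch`` call.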
+ self._fetch_future_block_number = None + self._fetch_future = None + else: + # Must join if we need the block for the current fetch + must_join = bool( + start_block_number + <= self._fetch_future_block_number + <= end_block_number + ) + if must_join: + # Copy to the local variables to release lock + # before waiting for result + fetch_future_block_number = self._fetch_future_block_number + fetch_future = self._fetch_future + + # Cleanup the fetch variables. Have a local copy. + self._fetch_future_block_number = None + self._fetch_future = None + + # Need to wait for the future for the current read + if fetch_future is not None: + logger.info("BlockCache waiting for background fetch.") + # Wait until result and put it in cache + self._fetch_block_cached.add_key( + fetch_future.result(), fetch_future_block_number + ) + + # these are cached, so safe to do multiple calls for the same start and end. + for block_number in range(start_block_number, end_block_number + 1): + self._fetch_block_cached(block_number) + + # fetch next block in the background if nothing is running in the background, + # the block is within file and it is not already cached + end_block_plus_1 = end_block_number + 1 + with self._fetch_future_lock: + if ( + self._fetch_future is None + and end_block_plus_1 <= self.nblocks + and not self._fetch_block_cached.is_key_cached(end_block_plus_1) + ): + self._fetch_future_block_number = end_block_plus_1 + self._fetch_future = self._thread_executor.submit( + self._fetch_block, end_block_plus_1, "async" + ) + + return self._read_cache( + start, + end, + start_block_number=start_block_number, + end_block_number=end_block_number, + ) + + def _fetch_block(self, block_number: int, log_info: str = "sync") -> bytes: + """ + Fetch the block of data for `block_number`. + """ + if block_number > self.nblocks: + raise ValueError( + f"'block_number={block_number}' is greater than " + f"the number of blocks ({self.nblocks})" + ) + + start = block_number * self.blocksize + end = start + self.blocksize + logger.info("BlockCache fetching block (%s) %d", log_info, block_number) + self.total_requested_bytes += end - start + self.miss_count += 1 + block_contents = super()._fetch(start, end) + return block_contents + + def _read_cache( + self, start: int, end: int, start_block_number: int, end_block_number: int + ) -> bytes: + """ + Read from our block cache. + + Parameters + ---------- + start, end : int + The start and end byte positions. + start_block_number, end_block_number : int + The start and end block numbers. + """ + start_pos = start % self.blocksize + end_pos = end % self.blocksize + + # kind of pointless to count this as a hit, but it is + self.hit_count += 1 + + if start_block_number == end_block_number: + block = self._fetch_block_cached(start_block_number) + return block[start_pos:end_pos] + + else: + # read from the initial + out = [self._fetch_block_cached(start_block_number)[start_pos:]] + + # intermediate blocks + # Note: it'd be nice to combine these into one big request. However + # that doesn't play nicely with our LRU cache. + out.extend( + map( + self._fetch_block_cached, + range(start_block_number + 1, end_block_number), + ) + ) + + # final block + out.append(self._fetch_block_cached(end_block_number)[:end_pos]) + + return b"".join(out) + + +caches: dict[str | None, type[BaseCache]] = { + # one custom case + None: BaseCache, +} + + +def register_cache(cls: type[BaseCache], clobber: bool = False) -> None: + """'Register' cache implementation. 
+ + Parameters + ---------- + clobber: bool, optional + If set to True (default is False) - allow to overwrite existing + entry. + + Raises + ------ + ValueError + """ + name = cls.name + if not clobber and name in caches: + raise ValueError(f"Cache with name {name!r} is already known: {caches[name]}") + caches[name] = cls + + +for c in ( + BaseCache, + MMapCache, + BytesCache, + ReadAheadCache, + BlockCache, + FirstChunkCache, + AllBytes, + KnownPartsOfAFile, + BackgroundBlockCache, +): + register_cache(c) diff --git a/env/lib/python3.13/site-packages/fsspec/callbacks.py b/env/lib/python3.13/site-packages/fsspec/callbacks.py new file mode 100644 index 0000000000000000000000000000000000000000..7ca99ca6ac3cd69b28bcd1550f6550e8e648c5fe --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/callbacks.py @@ -0,0 +1,324 @@ +from functools import wraps + + +class Callback: + """ + Base class and interface for callback mechanism + + This class can be used directly for monitoring file transfers by + providing ``callback=Callback(hooks=...)`` (see the ``hooks`` argument, + below), or subclassed for more specialised behaviour. + + Parameters + ---------- + size: int (optional) + Nominal quantity for the value that corresponds to a complete + transfer, e.g., total number of tiles or total number of + bytes + value: int (0) + Starting internal counter value + hooks: dict or None + A dict of named functions to be called on each update. The signature + of these must be ``f(size, value, **kwargs)`` + """ + + def __init__(self, size=None, value=0, hooks=None, **kwargs): + self.size = size + self.value = value + self.hooks = hooks or {} + self.kw = kwargs + + def __enter__(self): + return self + + def __exit__(self, *exc_args): + self.close() + + def close(self): + """Close callback.""" + + def branched(self, path_1, path_2, **kwargs): + """ + Return callback for child transfers + + If this callback is operating at a higher level, e.g., put, which may + trigger transfers that can also be monitored. The function returns a callback + that has to be passed to the child method, e.g., put_file, + as `callback=` argument. + + The implementation uses `callback.branch` for compatibility. + When implementing callbacks, it is recommended to override this function instead + of `branch` and avoid calling `super().branched(...)`. + + Prefer using this function over `branch`. + + Parameters + ---------- + path_1: str + Child's source path + path_2: str + Child's destination path + **kwargs: + Arbitrary keyword arguments + + Returns + ------- + callback: Callback + A callback instance to be passed to the child method + """ + self.branch(path_1, path_2, kwargs) + # mutate kwargs so that we can force the caller to pass "callback=" explicitly + return kwargs.pop("callback", DEFAULT_CALLBACK) + + def branch_coro(self, fn): + """ + Wraps a coroutine, and pass a new child callback to it. + """ + + @wraps(fn) + async def func(path1, path2: str, **kwargs): + with self.branched(path1, path2, **kwargs) as child: + return await fn(path1, path2, callback=child, **kwargs) + + return func + + def set_size(self, size): + """ + Set the internal maximum size attribute + + Usually called if not initially set at instantiation. Note that this + triggers a ``call()``. 
+ + Parameters + ---------- + size: int + """ + self.size = size + self.call() + + def absolute_update(self, value): + """ + Set the internal value state + + Triggers ``call()`` + + Parameters + ---------- + value: int + """ + self.value = value + self.call() + + def relative_update(self, inc=1): + """ + Delta increment the internal counter + + Triggers ``call()`` + + Parameters + ---------- + inc: int + """ + self.value += inc + self.call() + + def call(self, hook_name=None, **kwargs): + """ + Execute hook(s) with current state + + Each function is passed the internal size and current value + + Parameters + ---------- + hook_name: str or None + If given, execute on this hook + kwargs: passed on to (all) hook(s) + """ + if not self.hooks: + return + kw = self.kw.copy() + kw.update(kwargs) + if hook_name: + if hook_name not in self.hooks: + return + return self.hooks[hook_name](self.size, self.value, **kw) + for hook in self.hooks.values() or []: + hook(self.size, self.value, **kw) + + def wrap(self, iterable): + """ + Wrap an iterable to call ``relative_update`` on each iterations + + Parameters + ---------- + iterable: Iterable + The iterable that is being wrapped + """ + for item in iterable: + self.relative_update() + yield item + + def branch(self, path_1, path_2, kwargs): + """ + Set callbacks for child transfers + + If this callback is operating at a higher level, e.g., put, which may + trigger transfers that can also be monitored. The passed kwargs are + to be *mutated* to add ``callback=``, if this class supports branching + to children. + + Parameters + ---------- + path_1: str + Child's source path + path_2: str + Child's destination path + kwargs: dict + arguments passed to child method, e.g., put_file. + + Returns + ------- + + """ + return None + + def no_op(self, *_, **__): + pass + + def __getattr__(self, item): + """ + If undefined methods are called on this class, nothing happens + """ + return self.no_op + + @classmethod + def as_callback(cls, maybe_callback=None): + """Transform callback=... into Callback instance + + For the special value of ``None``, return the global instance of + ``NoOpCallback``. This is an alternative to including + ``callback=DEFAULT_CALLBACK`` directly in a method signature. + """ + if maybe_callback is None: + return DEFAULT_CALLBACK + return maybe_callback + + +class NoOpCallback(Callback): + """ + This implementation of Callback does exactly nothing + """ + + def call(self, *args, **kwargs): + return None + + +class DotPrinterCallback(Callback): + """ + Simple example Callback implementation + + Almost identical to Callback with a hook that prints a char; here we + demonstrate how the outer layer may print "#" and the inner layer "." + """ + + def __init__(self, chr_to_print="#", **kwargs): + self.chr = chr_to_print + super().__init__(**kwargs) + + def branch(self, path_1, path_2, kwargs): + """Mutate kwargs to add new instance with different print char""" + kwargs["callback"] = DotPrinterCallback(".") + + def call(self, **kwargs): + """Just outputs a character""" + print(self.chr, end="") + + +class TqdmCallback(Callback): + """ + A callback to display a progress bar using tqdm + + Parameters + ---------- + tqdm_kwargs : dict, (optional) + Any argument accepted by the tqdm constructor. + See the `tqdm doc `_. + Will be forwarded to `tqdm_cls`. + tqdm_cls: (optional) + subclass of `tqdm.tqdm`. If not passed, it will default to `tqdm.tqdm`. 
+ + Examples + -------- + >>> import fsspec + >>> from fsspec.callbacks import TqdmCallback + >>> fs = fsspec.filesystem("memory") + >>> path2distant_data = "/your-path" + >>> fs.upload( + ".", + path2distant_data, + recursive=True, + callback=TqdmCallback(), + ) + + You can forward args to tqdm using the ``tqdm_kwargs`` parameter. + + >>> fs.upload( + ".", + path2distant_data, + recursive=True, + callback=TqdmCallback(tqdm_kwargs={"desc": "Your tqdm description"}), + ) + + You can also customize the progress bar by passing a subclass of `tqdm`. + + .. code-block:: python + + class TqdmFormat(tqdm): + '''Provides a `total_time` format parameter''' + @property + def format_dict(self): + d = super().format_dict + total_time = d["elapsed"] * (d["total"] or 0) / max(d["n"], 1) + d.update(total_time=self.format_interval(total_time) + " in total") + return d + + >>> with TqdmCallback( + tqdm_kwargs={ + "desc": "desc", + "bar_format": "{total_time}: {percentage:.0f}%|{bar}{r_bar}", + }, + tqdm_cls=TqdmFormat, + ) as callback: + fs.upload(".", path2distant_data, recursive=True, callback=callback) + """ + + def __init__(self, tqdm_kwargs=None, *args, **kwargs): + try: + from tqdm import tqdm + + except ImportError as exce: + raise ImportError( + "Using TqdmCallback requires tqdm to be installed" + ) from exce + + self._tqdm_cls = kwargs.pop("tqdm_cls", tqdm) + self._tqdm_kwargs = tqdm_kwargs or {} + self.tqdm = None + super().__init__(*args, **kwargs) + + def call(self, *args, **kwargs): + if self.tqdm is None: + self.tqdm = self._tqdm_cls(total=self.size, **self._tqdm_kwargs) + self.tqdm.total = self.size + self.tqdm.update(self.value - self.tqdm.n) + + def close(self): + if self.tqdm is not None: + self.tqdm.close() + self.tqdm = None + + def __del__(self): + return self.close() + + +DEFAULT_CALLBACK = _DEFAULT_CALLBACK = NoOpCallback() diff --git a/env/lib/python3.13/site-packages/fsspec/compression.py b/env/lib/python3.13/site-packages/fsspec/compression.py new file mode 100644 index 0000000000000000000000000000000000000000..e21da562bbab49c2ad60e9d9beb546af8dadea45 --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/compression.py @@ -0,0 +1,182 @@ +"""Helper functions for a standard streaming compression API""" + +from zipfile import ZipFile + +import fsspec.utils +from fsspec.spec import AbstractBufferedFile + + +def noop_file(file, mode, **kwargs): + return file + + +# TODO: files should also be available as contexts +# should be functions of the form func(infile, mode=, **kwargs) -> file-like +compr = {None: noop_file} + + +def register_compression(name, callback, extensions, force=False): + """Register an "inferable" file compression type. + + Registers transparent file compression type for use with fsspec.open. + Compression can be specified by name in open, or "infer"-ed for any files + ending with the given extensions. + + Args: + name: (str) The compression type name. Eg. "gzip". + callback: A callable of form (infile, mode, **kwargs) -> file-like. + Accepts an input file-like object, the target mode and kwargs. + Returns a wrapped file-like object. + extensions: (str, Iterable[str]) A file extension, or list of file + extensions for which to infer this compression scheme. Eg. "gz". + force: (bool) Force re-registration of compression type or extensions. + + Raises: + ValueError: If name or extensions already registered, and not force. 
+ + """ + if isinstance(extensions, str): + extensions = [extensions] + + # Validate registration + if name in compr and not force: + raise ValueError(f"Duplicate compression registration: {name}") + + for ext in extensions: + if ext in fsspec.utils.compressions and not force: + raise ValueError(f"Duplicate compression file extension: {ext} ({name})") + + compr[name] = callback + + for ext in extensions: + fsspec.utils.compressions[ext] = name + + +def unzip(infile, mode="rb", filename=None, **kwargs): + if "r" not in mode: + filename = filename or "file" + z = ZipFile(infile, mode="w", **kwargs) + fo = z.open(filename, mode="w") + fo.close = lambda closer=fo.close: closer() or z.close() + return fo + z = ZipFile(infile) + if filename is None: + filename = z.namelist()[0] + return z.open(filename, mode="r", **kwargs) + + +register_compression("zip", unzip, "zip") + +try: + from bz2 import BZ2File +except ImportError: + pass +else: + register_compression("bz2", BZ2File, "bz2") + +try: # pragma: no cover + from isal import igzip + + def isal(infile, mode="rb", **kwargs): + return igzip.IGzipFile(fileobj=infile, mode=mode, **kwargs) + + register_compression("gzip", isal, "gz") +except ImportError: + from gzip import GzipFile + + register_compression( + "gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz" + ) + +try: + from lzma import LZMAFile + + register_compression("lzma", LZMAFile, "lzma") + register_compression("xz", LZMAFile, "xz") +except ImportError: + pass + +try: + import lzmaffi + + register_compression("lzma", lzmaffi.LZMAFile, "lzma", force=True) + register_compression("xz", lzmaffi.LZMAFile, "xz", force=True) +except ImportError: + pass + + +class SnappyFile(AbstractBufferedFile): + def __init__(self, infile, mode, **kwargs): + import snappy + + super().__init__( + fs=None, path="snappy", mode=mode.strip("b") + "b", size=999999999, **kwargs + ) + self.infile = infile + if "r" in mode: + self.codec = snappy.StreamDecompressor() + else: + self.codec = snappy.StreamCompressor() + + def _upload_chunk(self, final=False): + self.buffer.seek(0) + out = self.codec.add_chunk(self.buffer.read()) + self.infile.write(out) + return True + + def seek(self, loc, whence=0): + raise NotImplementedError("SnappyFile is not seekable") + + def seekable(self): + return False + + def _fetch_range(self, start, end): + """Get the specified set of bytes from remote""" + data = self.infile.read(end - start) + return self.codec.decompress(data) + + +try: + import snappy + + snappy.compress(b"") + # Snappy may use the .sz file extension, but this is not part of the + # standard implementation. 
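    # The same hook can register a user-defined codec; a hedged sketch, in which
    # ``mycodec`` and the ``.mc`` extension are made-up names:
    #
    #     from fsspec.compression import register_compression
    #     register_compression(
    #         "mycodec",
    #         lambda f, mode="rb", **kw: mycodec.open(f, mode=mode, **kw),
    #         "mc",
    #     )
    #     # afterwards: fsspec.open("data.mc", compression="infer")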
+ register_compression("snappy", SnappyFile, []) + +except (ImportError, NameError, AttributeError): + pass + +try: + import lz4.frame + + register_compression("lz4", lz4.frame.open, "lz4") +except ImportError: + pass + +try: + # zstd in the standard library for python >= 3.14 + from compression.zstd import ZstdFile + + register_compression("zstd", ZstdFile, "zst") + +except ImportError: + try: + import zstandard as zstd + + def zstandard_file(infile, mode="rb"): + if "r" in mode: + cctx = zstd.ZstdDecompressor() + return cctx.stream_reader(infile) + else: + cctx = zstd.ZstdCompressor(level=10) + return cctx.stream_writer(infile) + + register_compression("zstd", zstandard_file, "zst") + except ImportError: + pass + + +def available_compressions(): + """Return a list of the implemented compressions.""" + return list(compr) diff --git a/env/lib/python3.13/site-packages/fsspec/config.py b/env/lib/python3.13/site-packages/fsspec/config.py new file mode 100644 index 0000000000000000000000000000000000000000..76d9af14aaf7df47c4551c169f27b05abf9c269e --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/config.py @@ -0,0 +1,131 @@ +from __future__ import annotations + +import configparser +import json +import os +import warnings +from typing import Any + +conf: dict[str, dict[str, Any]] = {} +default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/fsspec") +conf_dir = os.environ.get("FSSPEC_CONFIG_DIR", default_conf_dir) + + +def set_conf_env(conf_dict, envdict=os.environ): + """Set config values from environment variables + + Looks for variables of the form ``FSSPEC_`` and + ``FSSPEC__``. For ``FSSPEC_`` the value is parsed + as a json dictionary and used to ``update`` the config of the + corresponding protocol. For ``FSSPEC__`` there is no + attempt to convert the string value, but the kwarg keys will be lower-cased. + + The ``FSSPEC__`` variables are applied after the + ``FSSPEC_`` ones. + + Parameters + ---------- + conf_dict : dict(str, dict) + This dict will be mutated + envdict : dict-like(str, str) + Source for the values - usually the real environment + """ + kwarg_keys = [] + for key in envdict: + if key.startswith("FSSPEC_") and len(key) > 7 and key[7] != "_": + if key.count("_") > 1: + kwarg_keys.append(key) + continue + try: + value = json.loads(envdict[key]) + except json.decoder.JSONDecodeError as ex: + warnings.warn( + f"Ignoring environment variable {key} due to a parse failure: {ex}" + ) + else: + if isinstance(value, dict): + _, proto = key.split("_", 1) + conf_dict.setdefault(proto.lower(), {}).update(value) + else: + warnings.warn( + f"Ignoring environment variable {key} due to not being a dict:" + f" {type(value)}" + ) + elif key.startswith("FSSPEC"): + warnings.warn( + f"Ignoring environment variable {key} due to having an unexpected name" + ) + + for key in kwarg_keys: + _, proto, kwarg = key.split("_", 2) + conf_dict.setdefault(proto.lower(), {})[kwarg.lower()] = envdict[key] + + +def set_conf_files(cdir, conf_dict): + """Set config values from files + + Scans for INI and JSON files in the given dictionary, and uses their + contents to set the config. In case of repeated values, later values + win. + + In the case of INI files, all values are strings, and these will not + be converted. 
+ + Parameters + ---------- + cdir : str + Directory to search + conf_dict : dict(str, dict) + This dict will be mutated + """ + if not os.path.isdir(cdir): + return + allfiles = sorted(os.listdir(cdir)) + for fn in allfiles: + if fn.endswith(".ini"): + ini = configparser.ConfigParser() + ini.read(os.path.join(cdir, fn)) + for key in ini: + if key == "DEFAULT": + continue + conf_dict.setdefault(key, {}).update(dict(ini[key])) + if fn.endswith(".json"): + with open(os.path.join(cdir, fn)) as f: + js = json.load(f) + for key in js: + conf_dict.setdefault(key, {}).update(dict(js[key])) + + +def apply_config(cls, kwargs, conf_dict=None): + """Supply default values for kwargs when instantiating class + + Augments the passed kwargs, by finding entries in the config dict + which match the classes ``.protocol`` attribute (one or more str) + + Parameters + ---------- + cls : file system implementation + kwargs : dict + conf_dict : dict of dict + Typically this is the global configuration + + Returns + ------- + dict : the modified set of kwargs + """ + if conf_dict is None: + conf_dict = conf + protos = cls.protocol if isinstance(cls.protocol, (tuple, list)) else [cls.protocol] + kw = {} + for proto in protos: + # default kwargs from the current state of the config + if proto in conf_dict: + kw.update(conf_dict[proto]) + # explicit kwargs always win + kw.update(**kwargs) + kwargs = kw + return kwargs + + +set_conf_files(conf_dir, conf) +set_conf_env(conf) diff --git a/env/lib/python3.13/site-packages/fsspec/conftest.py b/env/lib/python3.13/site-packages/fsspec/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..f05eb5c30d42b0c1c5cc432f9c217d8f0e01f412 --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/conftest.py @@ -0,0 +1,125 @@ +import os +import shutil +import subprocess +import sys +import time +from collections import deque +from collections.abc import Generator, Sequence + +import pytest + +import fsspec + + +@pytest.fixture() +def m(): + """ + Fixture providing a memory filesystem. + """ + m = fsspec.filesystem("memory") + m.store.clear() + m.pseudo_dirs.clear() + m.pseudo_dirs.append("") + try: + yield m + finally: + m.store.clear() + m.pseudo_dirs.clear() + m.pseudo_dirs.append("") + + +class InstanceCacheInspector: + """ + Helper class to inspect instance caches of filesystem classes in tests. + """ + + def clear(self) -> None: + """ + Clear instance caches of all currently imported filesystem classes. + """ + classes = deque([fsspec.spec.AbstractFileSystem]) + while classes: + cls = classes.popleft() + cls.clear_instance_cache() + classes.extend(cls.__subclasses__()) + + def gather_counts(self, *, omit_zero: bool = True) -> dict[str, int]: + """ + Gather counts of filesystem instances in the instance caches + of all currently imported filesystem classes. + + Parameters + ---------- + omit_zero: + Whether to omit instance types with no cached instances. + """ + out: dict[str, int] = {} + classes = deque([fsspec.spec.AbstractFileSystem]) + while classes: + cls = classes.popleft() + count = len(cls._cache) # there is no public interface for the cache + # note: skip intermediate AbstractFileSystem subclasses + # if they proxy the protocol attribute via a property. 
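            # Only classes whose ``protocol`` is a plain string or sequence are
            # reported; the first protocol name becomes the key, and the value is
            # how many instances that class currently holds in its cache.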
+ if isinstance(cls.protocol, (Sequence, str)): + key = cls.protocol if isinstance(cls.protocol, str) else cls.protocol[0] + if count or not omit_zero: + out[key] = count + classes.extend(cls.__subclasses__()) + return out + + +@pytest.fixture(scope="function", autouse=True) +def instance_caches() -> Generator[InstanceCacheInspector, None, None]: + """ + Fixture to ensure empty filesystem instance caches before and after a test. + + Used by default for all tests. + Clears caches of all imported filesystem classes. + Can be used to write test assertions about instance caches. + + Usage: + + def test_something(instance_caches): + # Test code here + fsspec.open("file://abc") + fsspec.open("memory://foo/bar") + + # Test assertion + assert instance_caches.gather_counts() == {"file": 1, "memory": 1} + + Returns + ------- + instance_caches: An instance cache inspector for clearing and inspecting caches. + """ + ic = InstanceCacheInspector() + + ic.clear() + try: + yield ic + finally: + ic.clear() + + +@pytest.fixture(scope="function") +def ftp_writable(tmpdir): + """ + Fixture providing a writable FTP filesystem. + """ + pytest.importorskip("pyftpdlib") + + d = str(tmpdir) + with open(os.path.join(d, "out"), "wb") as f: + f.write(b"hello" * 10000) + P = subprocess.Popen( + [sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"] + ) + try: + time.sleep(1) + yield "localhost", 2121, "user", "pass" + finally: + P.terminate() + P.wait() + try: + shutil.rmtree(tmpdir) + except Exception: + pass diff --git a/env/lib/python3.13/site-packages/fsspec/core.py b/env/lib/python3.13/site-packages/fsspec/core.py new file mode 100644 index 0000000000000000000000000000000000000000..5876bfcefc176b3b3aed0e16b54fa3809a5a0eee --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/core.py @@ -0,0 +1,760 @@ +from __future__ import annotations + +import io +import logging +import os +import re +from glob import has_magic +from pathlib import Path + +# for backwards compat, we export cache things from here too +from fsspec.caching import ( # noqa: F401 + BaseCache, + BlockCache, + BytesCache, + MMapCache, + ReadAheadCache, + caches, +) +from fsspec.compression import compr +from fsspec.config import conf +from fsspec.registry import available_protocols, filesystem, get_filesystem_class +from fsspec.utils import ( + _unstrip_protocol, + build_name_function, + infer_compression, + stringify_path, +) + +logger = logging.getLogger("fsspec") + + +class OpenFile: + """ + File-like object to be used in a context + + Can layer (buffered) text-mode and compression over any file-system, which + are typically binary-only. + + These instances are safe to serialize, as the low-level file object + is not created until invoked using ``with``. + + Parameters + ---------- + fs: FileSystem + The file system to use for opening the file. Should be a subclass or duck-type + with ``fsspec.spec.AbstractFileSystem`` + path: str + Location to open + mode: str like 'rb', optional + Mode of the opened file + compression: str or None, optional + Compression to apply + encoding: str or None, optional + The encoding to use if opened in text mode. + errors: str or None, optional + How to handle encoding errors if opened in text mode. + newline: None or str + Passed to TextIOWrapper in text mode, how to handle line endings. + autoopen: bool + If True, calls open() immediately. 
Mostly used by pickle + pos: int + If given and autoopen is True, seek to this location immediately + """ + + def __init__( + self, + fs, + path, + mode="rb", + compression=None, + encoding=None, + errors=None, + newline=None, + ): + self.fs = fs + self.path = path + self.mode = mode + self.compression = get_compression(path, compression) + self.encoding = encoding + self.errors = errors + self.newline = newline + self.fobjects = [] + + def __reduce__(self): + return ( + OpenFile, + ( + self.fs, + self.path, + self.mode, + self.compression, + self.encoding, + self.errors, + self.newline, + ), + ) + + def __repr__(self): + return f"" + + def __enter__(self): + mode = self.mode.replace("t", "").replace("b", "") + "b" + + try: + f = self.fs.open(self.path, mode=mode) + except FileNotFoundError as e: + if has_magic(self.path): + raise FileNotFoundError( + "%s not found. The URL contains glob characters: you maybe needed\n" + "to pass expand=True in fsspec.open() or the storage_options of \n" + "your library. You can also set the config value 'open_expand'\n" + "before import, or fsspec.core.DEFAULT_EXPAND at runtime, to True.", + self.path, + ) from e + raise + + self.fobjects = [f] + + if self.compression is not None: + compress = compr[self.compression] + f = compress(f, mode=mode[0]) + self.fobjects.append(f) + + if "b" not in self.mode: + # assume, for example, that 'r' is equivalent to 'rt' as in builtin + f = PickleableTextIOWrapper( + f, encoding=self.encoding, errors=self.errors, newline=self.newline + ) + self.fobjects.append(f) + + return self.fobjects[-1] + + def __exit__(self, *args): + self.close() + + @property + def full_name(self): + return _unstrip_protocol(self.path, self.fs) + + def open(self): + """Materialise this as a real open file without context + + The OpenFile object should be explicitly closed to avoid enclosed file + instances persisting. You must, therefore, keep a reference to the OpenFile + during the life of the file-like it generates. + """ + return self.__enter__() + + def close(self): + """Close all encapsulated file objects""" + for f in reversed(self.fobjects): + if "r" not in self.mode and not f.closed: + f.flush() + f.close() + self.fobjects.clear() + + +class OpenFiles(list): + """List of OpenFile instances + + Can be used in a single context, which opens and closes all of the + contained files. Normal list access to get the elements works as + normal. + + A special case is made for caching filesystems - the files will + be down/uploaded together at the start or end of the context, and + this may happen concurrently, if the target filesystem supports it. 
+ """ + + def __init__(self, *args, mode="rb", fs=None): + self.mode = mode + self.fs = fs + self.files = [] + super().__init__(*args) + + def __enter__(self): + if self.fs is None: + raise ValueError("Context has already been used") + + fs = self.fs + while True: + if hasattr(fs, "open_many"): + # check for concurrent cache download; or set up for upload + self.files = fs.open_many(self) + return self.files + if hasattr(fs, "fs") and fs.fs is not None: + fs = fs.fs + else: + break + return [s.__enter__() for s in self] + + def __exit__(self, *args): + fs = self.fs + [s.__exit__(*args) for s in self] + if "r" not in self.mode: + while True: + if hasattr(fs, "open_many"): + # check for concurrent cache upload + fs.commit_many(self.files) + return + if hasattr(fs, "fs") and fs.fs is not None: + fs = fs.fs + else: + break + + def __getitem__(self, item): + out = super().__getitem__(item) + if isinstance(item, slice): + return OpenFiles(out, mode=self.mode, fs=self.fs) + return out + + def __repr__(self): + return f"" + + +def open_files( + urlpath, + mode="rb", + compression=None, + encoding="utf8", + errors=None, + name_function=None, + num=1, + protocol=None, + newline=None, + auto_mkdir=True, + expand=True, + **kwargs, +): + """Given a path or paths, return a list of ``OpenFile`` objects. + + For writing, a str path must contain the "*" character, which will be filled + in by increasing numbers, e.g., "part*" -> "part1", "part2" if num=2. + + For either reading or writing, can instead provide explicit list of paths. + + Parameters + ---------- + urlpath: string or list + Absolute or relative filepath(s). Prefix with a protocol like ``s3://`` + to read from alternative filesystems. To read from multiple files you + can pass a globstring or a list of paths, with the caveat that they + must all have the same protocol. + mode: 'rb', 'wt', etc. + compression: string or None + If given, open file using compression codec. Can either be a compression + name (a key in ``fsspec.compression.compr``) or "infer" to guess the + compression from the filename suffix. + encoding: str + For text mode only + errors: None or str + Passed to TextIOWrapper in text mode + name_function: function or None + if opening a set of files for writing, those files do not yet exist, + so we need to generate their names by formatting the urlpath for + each sequence number + num: int [1] + if writing mode, number of files we expect to create (passed to + name+function) + protocol: str or None + If given, overrides the protocol found in the URL. + newline: bytes or None + Used for line terminator in text mode. If None, uses system default; + if blank, uses no translation. + auto_mkdir: bool (True) + If in write mode, this will ensure the target directory exists before + writing, by calling ``fs.mkdirs(exist_ok=True)``. + expand: bool + **kwargs: dict + Extra options that make sense to a particular storage connection, e.g. + host, port, username, password, etc. + + Examples + -------- + >>> files = open_files('2015-*-*.csv') # doctest: +SKIP + >>> files = open_files( + ... 's3://bucket/2015-*-*.csv.gz', compression='gzip' + ... 
) # doctest: +SKIP + + Returns + ------- + An ``OpenFiles`` instance, which is a list of ``OpenFile`` objects that can + be used as a single context + + Notes + ----- + For a full list of the available protocols and the implementations that + they map across to see the latest online documentation: + + - For implementations built into ``fsspec`` see + https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations + - For implementations in separate packages see + https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations + """ + fs, fs_token, paths = get_fs_token_paths( + urlpath, + mode, + num=num, + name_function=name_function, + storage_options=kwargs, + protocol=protocol, + expand=expand, + ) + if fs.protocol == "file": + fs.auto_mkdir = auto_mkdir + elif "r" not in mode and auto_mkdir: + parents = {fs._parent(path) for path in paths} + for parent in parents: + try: + fs.makedirs(parent, exist_ok=True) + except PermissionError: + pass + return OpenFiles( + [ + OpenFile( + fs, + path, + mode=mode, + compression=compression, + encoding=encoding, + errors=errors, + newline=newline, + ) + for path in paths + ], + mode=mode, + fs=fs, + ) + + +def _un_chain(path, kwargs): + # Avoid a circular import + from fsspec.implementations.chained import ChainedFileSystem + + if "::" in path: + x = re.compile(".*[^a-z]+.*") # test for non protocol-like single word + known_protocols = set(available_protocols()) + bits = [] + + # split on '::', then ensure each bit has a protocol + for p in path.split("::"): + if p in known_protocols: + bits.append(p + "://") + elif "://" in p or x.match(p): + bits.append(p) + else: + bits.append(p + "://") + else: + bits = [path] + + # [[url, protocol, kwargs], ...] + out = [] + previous_bit = None + kwargs = kwargs.copy() + + for bit in reversed(bits): + protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file" + cls = get_filesystem_class(protocol) + extra_kwargs = cls._get_kwargs_from_urls(bit) + kws = kwargs.pop(protocol, {}) + + if bit is bits[0]: + kws.update(kwargs) + + kw = dict( + **{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]}, + **kws, + ) + bit = cls._strip_protocol(bit) + + if ( + "target_protocol" not in kw + and issubclass(cls, ChainedFileSystem) + and not bit + ): + # replace bit if we are chaining and no path given + bit = previous_bit + + out.append((bit, protocol, kw)) + previous_bit = bit + + out.reverse() + return out + + +def url_to_fs(url, **kwargs): + """ + Turn fully-qualified and potentially chained URL into filesystem instance + + Parameters + ---------- + url : str + The fsspec-compatible URL + **kwargs: dict + Extra options that make sense to a particular storage connection, e.g. + host, port, username, password, etc. + + Returns + ------- + filesystem : FileSystem + The new filesystem discovered from ``url`` and created with + ``**kwargs``. + urlpath : str + The file-systems-specific URL for ``url``. 
+ """ + url = stringify_path(url) + # non-FS arguments that appear in fsspec.open() + # inspect could keep this in sync with open()'s signature + known_kwargs = { + "compression", + "encoding", + "errors", + "expand", + "mode", + "name_function", + "newline", + "num", + } + kwargs = {k: v for k, v in kwargs.items() if k not in known_kwargs} + chain = _un_chain(url, kwargs) + inkwargs = {} + # Reverse iterate the chain, creating a nested target_* structure + for i, ch in enumerate(reversed(chain)): + urls, protocol, kw = ch + if i == len(chain) - 1: + inkwargs = dict(**kw, **inkwargs) + continue + inkwargs["target_options"] = dict(**kw, **inkwargs) + inkwargs["target_protocol"] = protocol + inkwargs["fo"] = urls + urlpath, protocol, _ = chain[0] + fs = filesystem(protocol, **inkwargs) + return fs, urlpath + + +DEFAULT_EXPAND = conf.get("open_expand", False) + + +def open( + urlpath, + mode="rb", + compression=None, + encoding="utf8", + errors=None, + protocol=None, + newline=None, + expand=None, + **kwargs, +): + """Given a path or paths, return one ``OpenFile`` object. + + Parameters + ---------- + urlpath: string or list + Absolute or relative filepath. Prefix with a protocol like ``s3://`` + to read from alternative filesystems. Should not include glob + character(s). + mode: 'rb', 'wt', etc. + compression: string or None + If given, open file using compression codec. Can either be a compression + name (a key in ``fsspec.compression.compr``) or "infer" to guess the + compression from the filename suffix. + encoding: str + For text mode only + errors: None or str + Passed to TextIOWrapper in text mode + protocol: str or None + If given, overrides the protocol found in the URL. + newline: bytes or None + Used for line terminator in text mode. If None, uses system default; + if blank, uses no translation. + expand: bool or None + Whether to regard file paths containing special glob characters as needing + expansion (finding the first match) or absolute. Setting False allows using + paths which do embed such characters. If None (default), this argument + takes its value from the DEFAULT_EXPAND module variable, which takes + its initial value from the "open_expand" config value at startup, which will + be False if not set. + **kwargs: dict + Extra options that make sense to a particular storage connection, e.g. + host, port, username, password, etc. + + Examples + -------- + >>> openfile = open('2015-01-01.csv') # doctest: +SKIP + >>> openfile = open( + ... 's3://bucket/2015-01-01.csv.gz', compression='gzip' + ... ) # doctest: +SKIP + >>> with openfile as f: + ... df = pd.read_csv(f) # doctest: +SKIP + ... + + Returns + ------- + ``OpenFile`` object. 
+ + Notes + ----- + For a full list of the available protocols and the implementations that + they map across to see the latest online documentation: + + - For implementations built into ``fsspec`` see + https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations + - For implementations in separate packages see + https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations + """ + expand = DEFAULT_EXPAND if expand is None else expand + out = open_files( + urlpath=[urlpath], + mode=mode, + compression=compression, + encoding=encoding, + errors=errors, + protocol=protocol, + newline=newline, + expand=expand, + **kwargs, + ) + if not out: + raise FileNotFoundError(urlpath) + return out[0] + + +def open_local( + url: str | list[str] | Path | list[Path], + mode: str = "rb", + **storage_options: dict, +) -> str | list[str]: + """Open file(s) which can be resolved to local + + For files which either are local, or get downloaded upon open + (e.g., by file caching) + + Parameters + ---------- + url: str or list(str) + mode: str + Must be read mode + storage_options: + passed on to FS for or used by open_files (e.g., compression) + """ + if "r" not in mode: + raise ValueError("Can only ensure local files when reading") + of = open_files(url, mode=mode, **storage_options) + if not getattr(of[0].fs, "local_file", False): + raise ValueError( + "open_local can only be used on a filesystem which" + " has attribute local_file=True" + ) + with of as files: + paths = [f.name for f in files] + if (isinstance(url, str) and not has_magic(url)) or isinstance(url, Path): + return paths[0] + return paths + + +def get_compression(urlpath, compression): + if compression == "infer": + compression = infer_compression(urlpath) + if compression is not None and compression not in compr: + raise ValueError(f"Compression type {compression} not supported") + return compression + + +def split_protocol(urlpath): + """Return protocol, path pair""" + urlpath = stringify_path(urlpath) + if "://" in urlpath: + protocol, path = urlpath.split("://", 1) + if len(protocol) > 1: + # excludes Windows paths + return protocol, path + if urlpath.startswith("data:"): + return urlpath.split(":", 1) + return None, urlpath + + +def strip_protocol(urlpath): + """Return only path part of full URL, according to appropriate backend""" + protocol, _ = split_protocol(urlpath) + cls = get_filesystem_class(protocol) + return cls._strip_protocol(urlpath) + + +def expand_paths_if_needed(paths, mode, num, fs, name_function): + """Expand paths if they have a ``*`` in them (write mode) or any of ``*?[]`` + in them (read mode). + + :param paths: list of paths + mode: str + Mode in which to open files. + num: int + If opening in writing mode, number of files we expect to create. + fs: filesystem object + name_function: callable + If opening in writing mode, this callable is used to generate path + names. Names are generated for each partition by + ``urlpath.replace('*', name_function(partition_index))``. + :return: list of paths + """ + expanded_paths = [] + paths = list(paths) + + if "w" in mode: # read mode + if sum(1 for p in paths if "*" in p) > 1: + raise ValueError( + "When writing data, only one filename mask can be specified." 
+ ) + num = max(num, len(paths)) + + for curr_path in paths: + if "*" in curr_path: + # expand using name_function + expanded_paths.extend(_expand_paths(curr_path, name_function, num)) + else: + expanded_paths.append(curr_path) + # if we generated more paths that asked for, trim the list + if len(expanded_paths) > num: + expanded_paths = expanded_paths[:num] + + else: # read mode + for curr_path in paths: + if has_magic(curr_path): + # expand using glob + expanded_paths.extend(fs.glob(curr_path)) + else: + expanded_paths.append(curr_path) + + return expanded_paths + + +def get_fs_token_paths( + urlpath, + mode="rb", + num=1, + name_function=None, + storage_options=None, + protocol=None, + expand=True, +): + """Filesystem, deterministic token, and paths from a urlpath and options. + + Parameters + ---------- + urlpath: string or iterable + Absolute or relative filepath, URL (may include protocols like + ``s3://``), or globstring pointing to data. + mode: str, optional + Mode in which to open files. + num: int, optional + If opening in writing mode, number of files we expect to create. + name_function: callable, optional + If opening in writing mode, this callable is used to generate path + names. Names are generated for each partition by + ``urlpath.replace('*', name_function(partition_index))``. + storage_options: dict, optional + Additional keywords to pass to the filesystem class. + protocol: str or None + To override the protocol specifier in the URL + expand: bool + Expand string paths for writing, assuming the path is a directory + """ + if isinstance(urlpath, (list, tuple, set)): + if not urlpath: + raise ValueError("empty urlpath sequence") + urlpath0 = stringify_path(next(iter(urlpath))) + else: + urlpath0 = stringify_path(urlpath) + storage_options = storage_options or {} + if protocol: + storage_options["protocol"] = protocol + chain = _un_chain(urlpath0, storage_options or {}) + inkwargs = {} + # Reverse iterate the chain, creating a nested target_* structure + for i, ch in enumerate(reversed(chain)): + urls, nested_protocol, kw = ch + if i == len(chain) - 1: + inkwargs = dict(**kw, **inkwargs) + continue + inkwargs["target_options"] = dict(**kw, **inkwargs) + inkwargs["target_protocol"] = nested_protocol + inkwargs["fo"] = urls + paths, protocol, _ = chain[0] + fs = filesystem(protocol, **inkwargs) + if isinstance(urlpath, (list, tuple, set)): + pchains = [ + _un_chain(stringify_path(u), storage_options or {})[0] for u in urlpath + ] + if len({pc[1] for pc in pchains}) > 1: + raise ValueError("Protocol mismatch getting fs from %s", urlpath) + paths = [pc[0] for pc in pchains] + else: + paths = fs._strip_protocol(paths) + if isinstance(paths, (list, tuple, set)): + if expand: + paths = expand_paths_if_needed(paths, mode, num, fs, name_function) + elif not isinstance(paths, list): + paths = list(paths) + else: + if ("w" in mode or "x" in mode) and expand: + paths = _expand_paths(paths, name_function, num) + elif "*" in paths: + paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)] + else: + paths = [paths] + + return fs, fs._fs_token, paths + + +def _expand_paths(path, name_function, num): + if isinstance(path, str): + if path.count("*") > 1: + raise ValueError("Output path spec must contain exactly one '*'.") + elif "*" not in path: + path = os.path.join(path, "*.part") + + if name_function is None: + name_function = build_name_function(num - 1) + + paths = [path.replace("*", name_function(i)) for i in range(num)] + if paths != sorted(paths): + logger.warning( + "In 
order to preserve order between partitions" + " paths created with ``name_function`` should " + "sort to partition order" + ) + elif isinstance(path, (tuple, list)): + assert len(path) == num + paths = list(path) + else: + raise ValueError( + "Path should be either\n" + "1. A list of paths: ['foo.json', 'bar.json', ...]\n" + "2. A directory: 'foo/\n" + "3. A path with a '*' in it: 'foo.*.json'" + ) + return paths + + +class PickleableTextIOWrapper(io.TextIOWrapper): + """TextIOWrapper cannot be pickled. This solves it. + + Requires that ``buffer`` be pickleable, which all instances of + AbstractBufferedFile are. + """ + + def __init__( + self, + buffer, + encoding=None, + errors=None, + newline=None, + line_buffering=False, + write_through=False, + ): + self.args = buffer, encoding, errors, newline, line_buffering, write_through + super().__init__(*self.args) + + def __reduce__(self): + return PickleableTextIOWrapper, self.args diff --git a/env/lib/python3.13/site-packages/fsspec/dircache.py b/env/lib/python3.13/site-packages/fsspec/dircache.py new file mode 100644 index 0000000000000000000000000000000000000000..eca19566b135e5a7a4f6e7407d56411ec58bfe44 --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/dircache.py @@ -0,0 +1,98 @@ +import time +from collections.abc import MutableMapping +from functools import lru_cache + + +class DirCache(MutableMapping): + """ + Caching of directory listings, in a structure like:: + + {"path0": [ + {"name": "path0/file0", + "size": 123, + "type": "file", + ... + }, + {"name": "path0/file1", + }, + ... + ], + "path1": [...] + } + + Parameters to this class control listing expiry or indeed turn + caching off + """ + + def __init__( + self, + use_listings_cache=True, + listings_expiry_time=None, + max_paths=None, + **kwargs, + ): + """ + + Parameters + ---------- + use_listings_cache: bool + If False, this cache never returns items, but always reports KeyError, + and setting items has no effect + listings_expiry_time: int or float (optional) + Time in seconds that a listing is considered valid. If None, + listings do not expire. + max_paths: int (optional) + The number of most recent listings that are considered valid; 'recent' + refers to when the entry was set. 
+ """ + self._cache = {} + self._times = {} + if max_paths: + self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(key, None)) + self.use_listings_cache = use_listings_cache + self.listings_expiry_time = listings_expiry_time + self.max_paths = max_paths + + def __getitem__(self, item): + if self.listings_expiry_time is not None: + if self._times.get(item, 0) - time.time() < -self.listings_expiry_time: + del self._cache[item] + if self.max_paths: + self._q(item) + return self._cache[item] # maybe raises KeyError + + def clear(self): + self._cache.clear() + + def __len__(self): + return len(self._cache) + + def __contains__(self, item): + try: + self[item] + return True + except KeyError: + return False + + def __setitem__(self, key, value): + if not self.use_listings_cache: + return + if self.max_paths: + self._q(key) + self._cache[key] = value + if self.listings_expiry_time is not None: + self._times[key] = time.time() + + def __delitem__(self, key): + del self._cache[key] + + def __iter__(self): + entries = list(self._cache) + + return (k for k in entries if k in self) + + def __reduce__(self): + return ( + DirCache, + (self.use_listings_cache, self.listings_expiry_time, self.max_paths), + ) diff --git a/env/lib/python3.13/site-packages/fsspec/exceptions.py b/env/lib/python3.13/site-packages/fsspec/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..ae8905475f02655f4fc5863931d99ca9da55db78 --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/exceptions.py @@ -0,0 +1,18 @@ +""" +fsspec user-defined exception classes +""" + +import asyncio + + +class BlocksizeMismatchError(ValueError): + """ + Raised when a cached file is opened with a different blocksize than it was + written with + """ + + +class FSTimeoutError(asyncio.TimeoutError): + """ + Raised when a fsspec function timed out occurs + """ diff --git a/env/lib/python3.13/site-packages/fsspec/fuse.py b/env/lib/python3.13/site-packages/fsspec/fuse.py new file mode 100644 index 0000000000000000000000000000000000000000..566d520fce3e94e3bbaee48c3c6acc9f1db315a8 --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/fuse.py @@ -0,0 +1,324 @@ +import argparse +import logging +import os +import stat +import threading +import time +from errno import EIO, ENOENT + +from fuse import FUSE, FuseOSError, LoggingMixIn, Operations + +from fsspec import __version__ +from fsspec.core import url_to_fs + +logger = logging.getLogger("fsspec.fuse") + + +class FUSEr(Operations): + def __init__(self, fs, path, ready_file=False): + self.fs = fs + self.cache = {} + self.root = path.rstrip("/") + "/" + self.counter = 0 + logger.info("Starting FUSE at %s", path) + self._ready_file = ready_file + + def getattr(self, path, fh=None): + logger.debug("getattr %s", path) + if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]: + return {"type": "file", "st_size": 5} + + path = "".join([self.root, path.lstrip("/")]).rstrip("/") + try: + info = self.fs.info(path) + except FileNotFoundError as exc: + raise FuseOSError(ENOENT) from exc + + data = {"st_uid": info.get("uid", 1000), "st_gid": info.get("gid", 1000)} + perm = info.get("mode", 0o777) + + if info["type"] != "file": + data["st_mode"] = stat.S_IFDIR | perm + data["st_size"] = 0 + data["st_blksize"] = 0 + else: + data["st_mode"] = stat.S_IFREG | perm + data["st_size"] = info["size"] + data["st_blksize"] = 5 * 2**20 + data["st_nlink"] = 1 + data["st_atime"] = info["atime"] if "atime" in info else time.time() + data["st_ctime"] = info["ctime"] if "ctime" 
in info else time.time() + data["st_mtime"] = info["mtime"] if "mtime" in info else time.time() + return data + + def readdir(self, path, fh): + logger.debug("readdir %s", path) + path = "".join([self.root, path.lstrip("/")]) + files = self.fs.ls(path, False) + files = [os.path.basename(f.rstrip("/")) for f in files] + return [".", ".."] + files + + def mkdir(self, path, mode): + path = "".join([self.root, path.lstrip("/")]) + self.fs.mkdir(path) + return 0 + + def rmdir(self, path): + path = "".join([self.root, path.lstrip("/")]) + self.fs.rmdir(path) + return 0 + + def read(self, path, size, offset, fh): + logger.debug("read %s", (path, size, offset)) + if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]: + # status indicator + return b"ready" + + f = self.cache[fh] + f.seek(offset) + out = f.read(size) + return out + + def write(self, path, data, offset, fh): + logger.debug("write %s", (path, offset)) + f = self.cache[fh] + f.seek(offset) + f.write(data) + return len(data) + + def create(self, path, flags, fi=None): + logger.debug("create %s", (path, flags)) + fn = "".join([self.root, path.lstrip("/")]) + self.fs.touch(fn) # OS will want to get attributes immediately + f = self.fs.open(fn, "wb") + self.cache[self.counter] = f + self.counter += 1 + return self.counter - 1 + + def open(self, path, flags): + logger.debug("open %s", (path, flags)) + fn = "".join([self.root, path.lstrip("/")]) + if flags % 2 == 0: + # read + mode = "rb" + else: + # write/create + mode = "wb" + self.cache[self.counter] = self.fs.open(fn, mode) + self.counter += 1 + return self.counter - 1 + + def truncate(self, path, length, fh=None): + fn = "".join([self.root, path.lstrip("/")]) + if length != 0: + raise NotImplementedError + # maybe should be no-op since open with write sets size to zero anyway + self.fs.touch(fn) + + def unlink(self, path): + fn = "".join([self.root, path.lstrip("/")]) + try: + self.fs.rm(fn, False) + except (OSError, FileNotFoundError) as exc: + raise FuseOSError(EIO) from exc + + def release(self, path, fh): + try: + if fh in self.cache: + f = self.cache[fh] + f.close() + self.cache.pop(fh) + except Exception as e: + print(e) + return 0 + + def chmod(self, path, mode): + if hasattr(self.fs, "chmod"): + path = "".join([self.root, path.lstrip("/")]) + return self.fs.chmod(path, mode) + raise NotImplementedError + + +def run( + fs, + path, + mount_point, + foreground=True, + threads=False, + ready_file=False, + ops_class=FUSEr, +): + """Mount stuff in a local directory + + This uses fusepy to make it appear as if a given path on an fsspec + instance is in fact resident within the local file-system. + + This requires that fusepy by installed, and that FUSE be available on + the system (typically requiring a package to be installed with + apt, yum, brew, etc.). + + Parameters + ---------- + fs: file-system instance + From one of the compatible implementations + path: str + Location on that file-system to regard as the root directory to + mount. Note that you typically should include the terminating "/" + character. + mount_point: str + An empty directory on the local file-system where the contents of + the remote path will appear. + foreground: bool + Whether or not calling this function will block. Operation will + typically be more stable if True. + threads: bool + Whether or not to create threads when responding to file operations + within the mounter directory. Operation will typically be more + stable if False. + ready_file: bool + Whether the FUSE process is ready. 
The ``.fuse_ready`` file will + exist in the ``mount_point`` directory if True. Debugging purpose. + ops_class: FUSEr or Subclass of FUSEr + To override the default behavior of FUSEr. For Example, logging + to file. + + """ + func = lambda: FUSE( + ops_class(fs, path, ready_file=ready_file), + mount_point, + nothreads=not threads, + foreground=foreground, + ) + if not foreground: + th = threading.Thread(target=func) + th.daemon = True + th.start() + return th + else: # pragma: no cover + try: + func() + except KeyboardInterrupt: + pass + + +def main(args): + """Mount filesystem from chained URL to MOUNT_POINT. + + Examples: + + python3 -m fsspec.fuse memory /usr/share /tmp/mem + + python3 -m fsspec.fuse local /tmp/source /tmp/local \\ + -l /tmp/fsspecfuse.log + + You can also mount chained-URLs and use special settings: + + python3 -m fsspec.fuse 'filecache::zip::file://data.zip' \\ + / /tmp/zip \\ + -o 'filecache-cache_storage=/tmp/simplecache' + + You can specify the type of the setting by using `[int]` or `[bool]`, + (`true`, `yes`, `1` represents the Boolean value `True`): + + python3 -m fsspec.fuse 'simplecache::ftp://ftp1.at.proftpd.org' \\ + /historic/packages/RPMS /tmp/ftp \\ + -o 'simplecache-cache_storage=/tmp/simplecache' \\ + -o 'simplecache-check_files=false[bool]' \\ + -o 'ftp-listings_expiry_time=60[int]' \\ + -o 'ftp-username=anonymous' \\ + -o 'ftp-password=xieyanbo' + """ + + class RawDescriptionArgumentParser(argparse.ArgumentParser): + def format_help(self): + usage = super().format_help() + parts = usage.split("\n\n") + parts[1] = self.description.rstrip() + return "\n\n".join(parts) + + parser = RawDescriptionArgumentParser(prog="fsspec.fuse", description=main.__doc__) + parser.add_argument("--version", action="version", version=__version__) + parser.add_argument("url", type=str, help="fs url") + parser.add_argument("source_path", type=str, help="source directory in fs") + parser.add_argument("mount_point", type=str, help="local directory") + parser.add_argument( + "-o", + "--option", + action="append", + help="Any options of protocol included in the chained URL", + ) + parser.add_argument( + "-l", "--log-file", type=str, help="Logging FUSE debug info (Default: '')" + ) + parser.add_argument( + "-f", + "--foreground", + action="store_false", + help="Running in foreground or not (Default: False)", + ) + parser.add_argument( + "-t", + "--threads", + action="store_false", + help="Running with threads support (Default: False)", + ) + parser.add_argument( + "-r", + "--ready-file", + action="store_false", + help="The `.fuse_ready` file will exist after FUSE is ready. 
" + "(Debugging purpose, Default: False)", + ) + args = parser.parse_args(args) + + kwargs = {} + for item in args.option or []: + key, sep, value = item.partition("=") + if not sep: + parser.error(message=f"Wrong option: {item!r}") + val = value.lower() + if val.endswith("[int]"): + value = int(value[: -len("[int]")]) + elif val.endswith("[bool]"): + value = val[: -len("[bool]")] in ["1", "yes", "true"] + + if "-" in key: + fs_name, setting_name = key.split("-", 1) + if fs_name in kwargs: + kwargs[fs_name][setting_name] = value + else: + kwargs[fs_name] = {setting_name: value} + else: + kwargs[key] = value + + if args.log_file: + logging.basicConfig( + level=logging.DEBUG, + filename=args.log_file, + format="%(asctime)s %(message)s", + ) + + class LoggingFUSEr(FUSEr, LoggingMixIn): + pass + + fuser = LoggingFUSEr + else: + fuser = FUSEr + + fs, url_path = url_to_fs(args.url, **kwargs) + logger.debug("Mounting %s to %s", url_path, str(args.mount_point)) + run( + fs, + args.source_path, + args.mount_point, + foreground=args.foreground, + threads=args.threads, + ready_file=args.ready_file, + ops_class=fuser, + ) + + +if __name__ == "__main__": + import sys + + main(sys.argv[1:]) diff --git a/env/lib/python3.13/site-packages/fsspec/generic.py b/env/lib/python3.13/site-packages/fsspec/generic.py new file mode 100644 index 0000000000000000000000000000000000000000..0a641b0e2bcf70729a44064319eecb3647450379 --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/generic.py @@ -0,0 +1,396 @@ +from __future__ import annotations + +import inspect +import logging +import os +import shutil +import uuid + +from .asyn import AsyncFileSystem, _run_coros_in_chunks, sync_wrapper +from .callbacks import DEFAULT_CALLBACK +from .core import filesystem, get_filesystem_class, split_protocol, url_to_fs + +_generic_fs = {} +logger = logging.getLogger("fsspec.generic") + + +def set_generic_fs(protocol, **storage_options): + """Populate the dict used for method=="generic" lookups""" + _generic_fs[protocol] = filesystem(protocol, **storage_options) + + +def _resolve_fs(url, method, protocol=None, storage_options=None): + """Pick instance of backend FS""" + url = url[0] if isinstance(url, (list, tuple)) else url + protocol = protocol or split_protocol(url)[0] + storage_options = storage_options or {} + if method == "default": + return filesystem(protocol) + if method == "generic": + return _generic_fs[protocol] + if method == "current": + cls = get_filesystem_class(protocol) + return cls.current() + if method == "options": + fs, _ = url_to_fs(url, **storage_options.get(protocol, {})) + return fs + raise ValueError(f"Unknown FS resolution method: {method}") + + +def rsync( + source, + destination, + delete_missing=False, + source_field="size", + dest_field="size", + update_cond="different", + inst_kwargs=None, + fs=None, + **kwargs, +): + """Sync files between two directory trees + + (experimental) + + Parameters + ---------- + source: str + Root of the directory tree to take files from. This must be a directory, but + do not include any terminating "/" character + destination: str + Root path to copy into. The contents of this location should be + identical to the contents of ``source`` when done. This will be made a + directory, and the terminal "/" should not be included. + delete_missing: bool + If there are paths in the destination that don't exist in the + source and this is True, delete them. Otherwise, leave them alone. 
+ source_field: str | callable + If ``update_field`` is "different", this is the key in the info + of source files to consider for difference. Maybe a function of the + info dict. + dest_field: str | callable + If ``update_field`` is "different", this is the key in the info + of destination files to consider for difference. May be a function of + the info dict. + update_cond: "different"|"always"|"never" + If "always", every file is copied, regardless of whether it exists in + the destination. If "never", files that exist in the destination are + not copied again. If "different" (default), only copy if the info + fields given by ``source_field`` and ``dest_field`` (usually "size") + are different. Other comparisons may be added in the future. + inst_kwargs: dict|None + If ``fs`` is None, use this set of keyword arguments to make a + GenericFileSystem instance + fs: GenericFileSystem|None + Instance to use if explicitly given. The instance defines how to + to make downstream file system instances from paths. + + Returns + ------- + dict of the copy operations that were performed, {source: destination} + """ + fs = fs or GenericFileSystem(**(inst_kwargs or {})) + source = fs._strip_protocol(source) + destination = fs._strip_protocol(destination) + allfiles = fs.find(source, withdirs=True, detail=True) + if not fs.isdir(source): + raise ValueError("Can only rsync on a directory") + otherfiles = fs.find(destination, withdirs=True, detail=True) + dirs = [ + a + for a, v in allfiles.items() + if v["type"] == "directory" and a.replace(source, destination) not in otherfiles + ] + logger.debug(f"{len(dirs)} directories to create") + if dirs: + fs.make_many_dirs( + [dirn.replace(source, destination) for dirn in dirs], exist_ok=True + ) + allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"} + logger.debug(f"{len(allfiles)} files to consider for copy") + to_delete = [ + o + for o, v in otherfiles.items() + if o.replace(destination, source) not in allfiles and v["type"] == "file" + ] + for k, v in allfiles.copy().items(): + otherfile = k.replace(source, destination) + if otherfile in otherfiles: + if update_cond == "always": + allfiles[k] = otherfile + elif update_cond == "never": + allfiles.pop(k) + elif update_cond == "different": + inf1 = source_field(v) if callable(source_field) else v[source_field] + v2 = otherfiles[otherfile] + inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field] + if inf1 != inf2: + # details mismatch, make copy + allfiles[k] = otherfile + else: + # details match, don't copy + allfiles.pop(k) + else: + # file not in target yet + allfiles[k] = otherfile + logger.debug(f"{len(allfiles)} files to copy") + if allfiles: + source_files, target_files = zip(*allfiles.items()) + fs.cp(source_files, target_files, **kwargs) + logger.debug(f"{len(to_delete)} files to delete") + if delete_missing and to_delete: + fs.rm(to_delete) + return allfiles + + +class GenericFileSystem(AsyncFileSystem): + """Wrapper over all other FS types + + + + This implementation is a single unified interface to be able to run FS operations + over generic URLs, and dispatch to the specific implementations using the URL + protocol prefix. + + Note: instances of this FS are always async, even if you never use it with any async + backend. 
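+
+    A minimal usage sketch (illustrative only, not taken from the upstream
+    docs; it assumes the built-in "memory" and "file" protocols that ship
+    with fsspec, and that paths are given as fully-qualified URLs):
+
+    >>> fs = GenericFileSystem()  # doctest: +SKIP
+    >>> fs.pipe_file("memory://demo/a.txt", b"hello")  # doctest: +SKIP
+    >>> fs.cat_file("memory://demo/a.txt")  # doctest: +SKIP
+    b'hello'
+    >>> fs.copy("memory://demo/a.txt", "file:///tmp/a.txt")  # doctest: +SKIP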
+ """ + + protocol = "generic" # there is no real reason to ever use a protocol with this FS + + def __init__(self, default_method="default", storage_options=None, **kwargs): + """ + + Parameters + ---------- + default_method: str (optional) + Defines how to configure backend FS instances. Options are: + - "default": instantiate like FSClass(), with no + extra arguments; this is the default instance of that FS, and can be + configured via the config system + - "generic": takes instances from the `_generic_fs` dict in this module, + which you must populate before use. Keys are by protocol + - "options": expects storage_options, a dict mapping protocol to + kwargs to use when constructing the filesystem + - "current": takes the most recently instantiated version of each FS + """ + self.method = default_method + self.st_opts = storage_options + super().__init__(**kwargs) + + def _parent(self, path): + fs = _resolve_fs(path, self.method, storage_options=self.st_opts) + return fs.unstrip_protocol(fs._parent(path)) + + def _strip_protocol(self, path): + # normalization only + fs = _resolve_fs(path, self.method, storage_options=self.st_opts) + return fs.unstrip_protocol(fs._strip_protocol(path)) + + async def _find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs): + fs = _resolve_fs(path, self.method, storage_options=self.st_opts) + if fs.async_impl: + out = await fs._find( + path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs + ) + else: + out = fs.find( + path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs + ) + result = {} + for k, v in out.items(): + v = v.copy() # don't corrupt target FS dircache + name = fs.unstrip_protocol(k) + v["name"] = name + result[name] = v + if detail: + return result + return list(result) + + async def _info(self, url, **kwargs): + fs = _resolve_fs(url, self.method) + if fs.async_impl: + out = await fs._info(url, **kwargs) + else: + out = fs.info(url, **kwargs) + out = out.copy() # don't edit originals + out["name"] = fs.unstrip_protocol(out["name"]) + return out + + async def _ls( + self, + url, + detail=True, + **kwargs, + ): + fs = _resolve_fs(url, self.method) + if fs.async_impl: + out = await fs._ls(url, detail=True, **kwargs) + else: + out = fs.ls(url, detail=True, **kwargs) + out = [o.copy() for o in out] # don't edit originals + for o in out: + o["name"] = fs.unstrip_protocol(o["name"]) + if detail: + return out + else: + return [o["name"] for o in out] + + async def _cat_file( + self, + url, + **kwargs, + ): + fs = _resolve_fs(url, self.method) + if fs.async_impl: + return await fs._cat_file(url, **kwargs) + else: + return fs.cat_file(url, **kwargs) + + async def _pipe_file( + self, + path, + value, + **kwargs, + ): + fs = _resolve_fs(path, self.method, storage_options=self.st_opts) + if fs.async_impl: + return await fs._pipe_file(path, value, **kwargs) + else: + return fs.pipe_file(path, value, **kwargs) + + async def _rm(self, url, **kwargs): + urls = url + if isinstance(urls, str): + urls = [urls] + fs = _resolve_fs(urls[0], self.method) + if fs.async_impl: + await fs._rm(urls, **kwargs) + else: + fs.rm(url, **kwargs) + + async def _makedirs(self, path, exist_ok=False): + logger.debug("Make dir %s", path) + fs = _resolve_fs(path, self.method, storage_options=self.st_opts) + if fs.async_impl: + await fs._makedirs(path, exist_ok=exist_ok) + else: + fs.makedirs(path, exist_ok=exist_ok) + + def rsync(self, source, destination, **kwargs): + """Sync files between two directory trees + + See `func:rsync` for more 
details. + """ + rsync(source, destination, fs=self, **kwargs) + + async def _cp_file( + self, + url, + url2, + blocksize=2**20, + callback=DEFAULT_CALLBACK, + tempdir: str | None = None, + **kwargs, + ): + fs = _resolve_fs(url, self.method) + fs2 = _resolve_fs(url2, self.method) + if fs is fs2: + # pure remote + if fs.async_impl: + return await fs._copy(url, url2, **kwargs) + else: + return fs.copy(url, url2, **kwargs) + await copy_file_op(fs, [url], fs2, [url2], tempdir, 1, on_error="raise") + + async def _make_many_dirs(self, urls, exist_ok=True): + fs = _resolve_fs(urls[0], self.method) + if fs.async_impl: + coros = [fs._makedirs(u, exist_ok=exist_ok) for u in urls] + await _run_coros_in_chunks(coros) + else: + for u in urls: + fs.makedirs(u, exist_ok=exist_ok) + + make_many_dirs = sync_wrapper(_make_many_dirs) + + async def _copy( + self, + path1: list[str], + path2: list[str], + recursive: bool = False, + on_error: str = "ignore", + maxdepth: int | None = None, + batch_size: int | None = None, + tempdir: str | None = None, + **kwargs, + ): + # TODO: special case for one FS being local, which can use get/put + # TODO: special case for one being memFS, which can use cat/pipe + if recursive: + raise NotImplementedError("Please use fsspec.generic.rsync") + path1 = [path1] if isinstance(path1, str) else path1 + path2 = [path2] if isinstance(path2, str) else path2 + + fs = _resolve_fs(path1, self.method) + fs2 = _resolve_fs(path2, self.method) + + if fs is fs2: + if fs.async_impl: + return await fs._copy(path1, path2, **kwargs) + else: + return fs.copy(path1, path2, **kwargs) + + await copy_file_op( + fs, path1, fs2, path2, tempdir, batch_size, on_error=on_error + ) + + +async def copy_file_op( + fs1, url1, fs2, url2, tempdir=None, batch_size=20, on_error="ignore" +): + import tempfile + + tempdir = tempdir or tempfile.mkdtemp() + try: + coros = [ + _copy_file_op( + fs1, + u1, + fs2, + u2, + os.path.join(tempdir, uuid.uuid4().hex), + ) + for u1, u2 in zip(url1, url2) + ] + out = await _run_coros_in_chunks( + coros, batch_size=batch_size, return_exceptions=True + ) + finally: + shutil.rmtree(tempdir) + if on_error == "return": + return out + elif on_error == "raise": + for o in out: + if isinstance(o, Exception): + raise o + + +async def _copy_file_op(fs1, url1, fs2, url2, local, on_error="ignore"): + if fs1.async_impl: + await fs1._get_file(url1, local) + else: + fs1.get_file(url1, local) + if fs2.async_impl: + await fs2._put_file(local, url2) + else: + fs2.put_file(local, url2) + os.unlink(local) + logger.debug("Copy %s -> %s; done", url1, url2) + + +async def maybe_await(cor): + if inspect.iscoroutine(cor): + return await cor + else: + return cor diff --git a/env/lib/python3.13/site-packages/fsspec/gui.py b/env/lib/python3.13/site-packages/fsspec/gui.py new file mode 100644 index 0000000000000000000000000000000000000000..9d914c8beb6cabb2c2700eb8eee31028559be2bd --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/gui.py @@ -0,0 +1,417 @@ +import ast +import contextlib +import logging +import os +import re +from collections.abc import Sequence +from typing import ClassVar + +import panel as pn + +from .core import OpenFile, get_filesystem_class, split_protocol +from .registry import known_implementations + +pn.extension() +logger = logging.getLogger("fsspec.gui") + + +class SigSlot: + """Signal-slot mixin, for Panel event passing + + Include this class in a widget manager's superclasses to be able to + register events and callbacks on Panel widgets managed by that class. 
+ + The method ``_register`` should be called as widgets are added, and external + code should call ``connect`` to associate callbacks. + + By default, all signals emit a DEBUG logging statement. + """ + + # names of signals that this class may emit each of which must be + # set by _register for any new instance + signals: ClassVar[Sequence[str]] = [] + # names of actions that this class may respond to + slots: ClassVar[Sequence[str]] = [] + + # each of which must be a method name + + def __init__(self): + self._ignoring_events = False + self._sigs = {} + self._map = {} + self._setup() + + def _setup(self): + """Create GUI elements and register signals""" + self.panel = pn.pane.PaneBase() + # no signals to set up in the base class + + def _register( + self, widget, name, thing="value", log_level=logging.DEBUG, auto=False + ): + """Watch the given attribute of a widget and assign it a named event + + This is normally called at the time a widget is instantiated, in the + class which owns it. + + Parameters + ---------- + widget : pn.layout.Panel or None + Widget to watch. If None, an anonymous signal not associated with + any widget. + name : str + Name of this event + thing : str + Attribute of the given widget to watch + log_level : int + When the signal is triggered, a logging event of the given level + will be fired in the dfviz logger. + auto : bool + If True, automatically connects with a method in this class of the + same name. + """ + if name not in self.signals: + raise ValueError(f"Attempt to assign an undeclared signal: {name}") + self._sigs[name] = { + "widget": widget, + "callbacks": [], + "thing": thing, + "log": log_level, + } + wn = "-".join( + [ + getattr(widget, "name", str(widget)) if widget is not None else "none", + thing, + ] + ) + self._map[wn] = name + if widget is not None: + widget.param.watch(self._signal, thing, onlychanged=True) + if auto and hasattr(self, name): + self.connect(name, getattr(self, name)) + + def _repr_mimebundle_(self, *args, **kwargs): + """Display in a notebook or a server""" + try: + return self.panel._repr_mimebundle_(*args, **kwargs) + except (ValueError, AttributeError) as exc: + raise NotImplementedError( + "Panel does not seem to be set up properly" + ) from exc + + def connect(self, signal, slot): + """Associate call back with given event + + The callback must be a function which takes the "new" value of the + watched attribute as the only parameter. If the callback return False, + this cancels any further processing of the given event. + + Alternatively, the callback can be a string, in which case it means + emitting the correspondingly-named event (i.e., connect to self) + """ + self._sigs[signal]["callbacks"].append(slot) + + def _signal(self, event): + """This is called by a an action on a widget + + Within an self.ignore_events context, nothing happens. + + Tests can execute this method by directly changing the values of + widget components. 
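+
+        For example, a test might do (an illustrative sketch; assumes a
+        ``SingleSelect`` instance ``sel`` whose options have been set):
+
+        >>> sel.panel.value = [sel.panel.options[0]]  # doctest: +SKIP
+
+        which fires the widget watcher and, through this method, emits the
+        registered "_selected" signal.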
+ """ + if not self._ignoring_events: + wn = "-".join([event.obj.name, event.name]) + if wn in self._map and self._map[wn] in self._sigs: + self._emit(self._map[wn], event.new) + + @contextlib.contextmanager + def ignore_events(self): + """Temporarily turn off events processing in this instance + + (does not propagate to children) + """ + self._ignoring_events = True + try: + yield + finally: + self._ignoring_events = False + + def _emit(self, sig, value=None): + """An event happened, call its callbacks + + This method can be used in tests to simulate message passing without + directly changing visual elements. + + Calling of callbacks will halt whenever one returns False. + """ + logger.log(self._sigs[sig]["log"], f"{sig}: {value}") + for callback in self._sigs[sig]["callbacks"]: + if isinstance(callback, str): + self._emit(callback) + else: + try: + # running callbacks should not break the interface + ret = callback(value) + if ret is False: + break + except Exception as e: + logger.exception( + "Exception (%s) while executing callback for signal: %s", + e, + sig, + ) + + def show(self, threads=False): + """Open a new browser tab and display this instance's interface""" + self.panel.show(threads=threads, verbose=False) + return self + + +class SingleSelect(SigSlot): + """A multiselect which only allows you to select one item for an event""" + + signals = ["_selected", "selected"] # the first is internal + slots = ["set_options", "set_selection", "add", "clear", "select"] + + def __init__(self, **kwargs): + self.kwargs = kwargs + super().__init__() + + def _setup(self): + self.panel = pn.widgets.MultiSelect(**self.kwargs) + self._register(self.panel, "_selected", "value") + self._register(None, "selected") + self.connect("_selected", self.select_one) + + def _signal(self, *args, **kwargs): + super()._signal(*args, **kwargs) + + def select_one(self, *_): + with self.ignore_events(): + val = [self.panel.value[-1]] if self.panel.value else [] + self.panel.value = val + self._emit("selected", self.panel.value) + + def set_options(self, options): + self.panel.options = options + + def clear(self): + self.panel.options = [] + + @property + def value(self): + return self.panel.value + + def set_selection(self, selection): + self.panel.value = [selection] + + +class FileSelector(SigSlot): + """Panel-based graphical file selector widget + + Instances of this widget are interactive and can be displayed in jupyter by having + them as the output of a cell, or in a separate browser tab using ``.show()``. + """ + + signals = [ + "protocol_changed", + "selection_changed", + "directory_entered", + "home_clicked", + "up_clicked", + "go_clicked", + "filters_changed", + ] + slots = ["set_filters", "go_home"] + + def __init__(self, url=None, filters=None, ignore=None, kwargs=None): + """ + + Parameters + ---------- + url : str (optional) + Initial value of the URL to populate the dialog; should include protocol + filters : list(str) (optional) + File endings to include in the listings. If not included, all files are + allowed. Does not affect directories. + If given, the endings will appear as checkboxes in the interface + ignore : list(str) (optional) + Regex(s) of file basename patterns to ignore, e.g., "\\." 
for typical + hidden files on posix + kwargs : dict (optional) + To pass to file system instance + """ + if url: + self.init_protocol, url = split_protocol(url) + else: + self.init_protocol, url = "file", os.getcwd() + self.init_url = url + self.init_kwargs = (kwargs if isinstance(kwargs, str) else str(kwargs)) or "{}" + self.filters = filters + self.ignore = [re.compile(i) for i in ignore or []] + self._fs = None + super().__init__() + + def _setup(self): + self.url = pn.widgets.TextInput( + name="url", + value=self.init_url, + align="end", + sizing_mode="stretch_width", + width_policy="max", + ) + self.protocol = pn.widgets.Select( + options=sorted(known_implementations), + value=self.init_protocol, + name="protocol", + align="center", + ) + self.kwargs = pn.widgets.TextInput( + name="kwargs", value=self.init_kwargs, align="center" + ) + self.go = pn.widgets.Button(name="⇨", align="end", width=45) + self.main = SingleSelect(size=10) + self.home = pn.widgets.Button(name="🏠", width=40, height=30, align="end") + self.up = pn.widgets.Button(name="‹", width=30, height=30, align="end") + + self._register(self.protocol, "protocol_changed", auto=True) + self._register(self.go, "go_clicked", "clicks", auto=True) + self._register(self.up, "up_clicked", "clicks", auto=True) + self._register(self.home, "home_clicked", "clicks", auto=True) + self._register(None, "selection_changed") + self.main.connect("selected", self.selection_changed) + self._register(None, "directory_entered") + self.prev_protocol = self.protocol.value + self.prev_kwargs = self.storage_options + + self.filter_sel = pn.widgets.CheckBoxGroup( + value=[], options=[], inline=False, align="end", width_policy="min" + ) + self._register(self.filter_sel, "filters_changed", auto=True) + + self.panel = pn.Column( + pn.Row(self.protocol, self.kwargs), + pn.Row(self.home, self.up, self.url, self.go, self.filter_sel), + self.main.panel, + ) + self.set_filters(self.filters) + self.go_clicked() + + def set_filters(self, filters=None): + self.filters = filters + if filters: + self.filter_sel.options = filters + self.filter_sel.value = filters + else: + self.filter_sel.options = [] + self.filter_sel.value = [] + + @property + def storage_options(self): + """Value of the kwargs box as a dictionary""" + return ast.literal_eval(self.kwargs.value) or {} + + @property + def fs(self): + """Current filesystem instance""" + if self._fs is None: + cls = get_filesystem_class(self.protocol.value) + self._fs = cls(**self.storage_options) + return self._fs + + @property + def urlpath(self): + """URL of currently selected item""" + return ( + (f"{self.protocol.value}://{self.main.value[0]}") + if self.main.value + else None + ) + + def open_file(self, mode="rb", compression=None, encoding=None): + """Create OpenFile instance for the currently selected item + + For example, in a notebook you might do something like + + .. code-block:: + + [ ]: sel = FileSelector(); sel + + # user selects their file + + [ ]: with sel.open_file('rb') as f: + ... out = f.read() + + Parameters + ---------- + mode: str (optional) + Open mode for the file. + compression: str (optional) + The interact with the file as compressed. Set to 'infer' to guess + compression from the file ending + encoding: str (optional) + If using text mode, use this encoding; defaults to UTF8. 
+ """ + if self.urlpath is None: + raise ValueError("No file selected") + return OpenFile(self.fs, self.urlpath, mode, compression, encoding) + + def filters_changed(self, values): + self.filters = values + self.go_clicked() + + def selection_changed(self, *_): + if self.urlpath is None: + return + if self.fs.isdir(self.urlpath): + self.url.value = self.fs._strip_protocol(self.urlpath) + self.go_clicked() + + def go_clicked(self, *_): + if ( + self.prev_protocol != self.protocol.value + or self.prev_kwargs != self.storage_options + ): + self._fs = None # causes fs to be recreated + self.prev_protocol = self.protocol.value + self.prev_kwargs = self.storage_options + listing = sorted( + self.fs.ls(self.url.value, detail=True), key=lambda x: x["name"] + ) + listing = [ + l + for l in listing + if not any(i.match(l["name"].rsplit("/", 1)[-1]) for i in self.ignore) + ] + folders = { + "📁 " + o["name"].rsplit("/", 1)[-1]: o["name"] + for o in listing + if o["type"] == "directory" + } + files = { + "📄 " + o["name"].rsplit("/", 1)[-1]: o["name"] + for o in listing + if o["type"] == "file" + } + if self.filters: + files = { + k: v + for k, v in files.items() + if any(v.endswith(ext) for ext in self.filters) + } + self.main.set_options(dict(**folders, **files)) + + def protocol_changed(self, *_): + self._fs = None + self.main.options = [] + self.url.value = "" + + def home_clicked(self, *_): + self.protocol.value = self.init_protocol + self.kwargs.value = self.init_kwargs + self.url.value = self.init_url + self.go_clicked() + + def up_clicked(self, *_): + self.url.value = self.fs._parent(self.url.value) + self.go_clicked() diff --git a/env/lib/python3.13/site-packages/fsspec/json.py b/env/lib/python3.13/site-packages/fsspec/json.py new file mode 100644 index 0000000000000000000000000000000000000000..5c53a24913d0b28f4b53a163b97ff8f58abeb031 --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/json.py @@ -0,0 +1,112 @@ +import json +from collections.abc import Callable, Mapping, Sequence +from contextlib import suppress +from pathlib import PurePath +from typing import Any, ClassVar + +from .registry import _import_class, get_filesystem_class +from .spec import AbstractFileSystem + + +class FilesystemJSONEncoder(json.JSONEncoder): + include_password: ClassVar[bool] = True + + def default(self, o: Any) -> Any: + if isinstance(o, AbstractFileSystem): + return o.to_dict(include_password=self.include_password) + if isinstance(o, PurePath): + cls = type(o) + return {"cls": f"{cls.__module__}.{cls.__name__}", "str": str(o)} + + return super().default(o) + + def make_serializable(self, obj: Any) -> Any: + """ + Recursively converts an object so that it can be JSON serialized via + :func:`json.dumps` and :func:`json.dump`, without actually calling + said functions. 
+ """ + if isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, Mapping): + return {k: self.make_serializable(v) for k, v in obj.items()} + if isinstance(obj, Sequence): + return [self.make_serializable(v) for v in obj] + + return self.default(obj) + + +class FilesystemJSONDecoder(json.JSONDecoder): + def __init__( + self, + *, + object_hook: Callable[[dict[str, Any]], Any] | None = None, + parse_float: Callable[[str], Any] | None = None, + parse_int: Callable[[str], Any] | None = None, + parse_constant: Callable[[str], Any] | None = None, + strict: bool = True, + object_pairs_hook: Callable[[list[tuple[str, Any]]], Any] | None = None, + ) -> None: + self.original_object_hook = object_hook + + super().__init__( + object_hook=self.custom_object_hook, + parse_float=parse_float, + parse_int=parse_int, + parse_constant=parse_constant, + strict=strict, + object_pairs_hook=object_pairs_hook, + ) + + @classmethod + def try_resolve_path_cls(cls, dct: dict[str, Any]): + with suppress(Exception): + fqp = dct["cls"] + + path_cls = _import_class(fqp) + + if issubclass(path_cls, PurePath): + return path_cls + + return None + + @classmethod + def try_resolve_fs_cls(cls, dct: dict[str, Any]): + with suppress(Exception): + if "cls" in dct: + try: + fs_cls = _import_class(dct["cls"]) + if issubclass(fs_cls, AbstractFileSystem): + return fs_cls + except Exception: + if "protocol" in dct: # Fallback if cls cannot be imported + return get_filesystem_class(dct["protocol"]) + + raise + + return None + + def custom_object_hook(self, dct: dict[str, Any]): + if "cls" in dct: + if (obj_cls := self.try_resolve_fs_cls(dct)) is not None: + return AbstractFileSystem.from_dict(dct) + if (obj_cls := self.try_resolve_path_cls(dct)) is not None: + return obj_cls(dct["str"]) + + if self.original_object_hook is not None: + return self.original_object_hook(dct) + + return dct + + def unmake_serializable(self, obj: Any) -> Any: + """ + Inverse function of :meth:`FilesystemJSONEncoder.make_serializable`. + """ + if isinstance(obj, dict): + obj = self.custom_object_hook(obj) + if isinstance(obj, dict): + return {k: self.unmake_serializable(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return [self.unmake_serializable(v) for v in obj] + + return obj diff --git a/env/lib/python3.13/site-packages/fsspec/mapping.py b/env/lib/python3.13/site-packages/fsspec/mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..752eef35273b13eded7297e2e801b58e436a25b1 --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/mapping.py @@ -0,0 +1,251 @@ +import array +import logging +import posixpath +import warnings +from collections.abc import MutableMapping +from functools import cached_property + +from fsspec.core import url_to_fs + +logger = logging.getLogger("fsspec.mapping") + + +class FSMap(MutableMapping): + """Wrap a FileSystem instance as a mutable wrapping. + + The keys of the mapping become files under the given root, and the + values (which must be bytes) the contents of those files. + + Parameters + ---------- + root: string + prefix for all the files + fs: FileSystem instance + check: bool (=True) + performs a touch at the location, to check for write access. 
+ + Examples + -------- + >>> fs = FileSystem(**parameters) # doctest: +SKIP + >>> d = FSMap('my-data/path/', fs) # doctest: +SKIP + or, more likely + >>> d = fs.get_mapper('my-data/path/') + + >>> d['loc1'] = b'Hello World' # doctest: +SKIP + >>> list(d.keys()) # doctest: +SKIP + ['loc1'] + >>> d['loc1'] # doctest: +SKIP + b'Hello World' + """ + + def __init__(self, root, fs, check=False, create=False, missing_exceptions=None): + self.fs = fs + self.root = fs._strip_protocol(root) + self._root_key_to_str = fs._strip_protocol(posixpath.join(root, "x"))[:-1] + if missing_exceptions is None: + missing_exceptions = ( + FileNotFoundError, + IsADirectoryError, + NotADirectoryError, + ) + self.missing_exceptions = missing_exceptions + self.check = check + self.create = create + if create: + if not self.fs.exists(root): + self.fs.mkdir(root) + if check: + if not self.fs.exists(root): + raise ValueError( + f"Path {root} does not exist. Create " + f" with the ``create=True`` keyword" + ) + self.fs.touch(root + "/a") + self.fs.rm(root + "/a") + + @cached_property + def dirfs(self): + """dirfs instance that can be used with the same keys as the mapper""" + from .implementations.dirfs import DirFileSystem + + return DirFileSystem(path=self._root_key_to_str, fs=self.fs) + + def clear(self): + """Remove all keys below root - empties out mapping""" + logger.info("Clear mapping at %s", self.root) + try: + self.fs.rm(self.root, True) + self.fs.mkdir(self.root) + except: # noqa: E722 + pass + + def getitems(self, keys, on_error="raise"): + """Fetch multiple items from the store + + If the backend is async-able, this might proceed concurrently + + Parameters + ---------- + keys: list(str) + They keys to be fetched + on_error : "raise", "omit", "return" + If raise, an underlying exception will be raised (converted to KeyError + if the type is in self.missing_exceptions); if omit, keys with exception + will simply not be included in the output; if "return", all keys are + included in the output, but the value will be bytes or an exception + instance. 
+ + Returns + ------- + dict(key, bytes|exception) + """ + keys2 = [self._key_to_str(k) for k in keys] + oe = on_error if on_error == "raise" else "return" + try: + out = self.fs.cat(keys2, on_error=oe) + if isinstance(out, bytes): + out = {keys2[0]: out} + except self.missing_exceptions as e: + raise KeyError from e + out = { + k: (KeyError() if isinstance(v, self.missing_exceptions) else v) + for k, v in out.items() + } + return { + key: out[k2] if on_error == "raise" else out.get(k2, KeyError(k2)) + for key, k2 in zip(keys, keys2) + if on_error == "return" or not isinstance(out[k2], BaseException) + } + + def setitems(self, values_dict): + """Set the values of multiple items in the store + + Parameters + ---------- + values_dict: dict(str, bytes) + """ + values = {self._key_to_str(k): maybe_convert(v) for k, v in values_dict.items()} + self.fs.pipe(values) + + def delitems(self, keys): + """Remove multiple keys from the store""" + self.fs.rm([self._key_to_str(k) for k in keys]) + + def _key_to_str(self, key): + """Generate full path for the key""" + if not isinstance(key, str): + # raise TypeError("key must be of type `str`, got `{type(key).__name__}`" + warnings.warn( + "from fsspec 2023.5 onward FSMap non-str keys will raise TypeError", + DeprecationWarning, + ) + if isinstance(key, list): + key = tuple(key) + key = str(key) + return f"{self._root_key_to_str}{key}".rstrip("/") + + def _str_to_key(self, s): + """Strip path of to leave key name""" + return s[len(self.root) :].lstrip("/") + + def __getitem__(self, key, default=None): + """Retrieve data""" + k = self._key_to_str(key) + try: + result = self.fs.cat(k) + except self.missing_exceptions as exc: + if default is not None: + return default + raise KeyError(key) from exc + return result + + def pop(self, key, default=None): + """Pop data""" + result = self.__getitem__(key, default) + try: + del self[key] + except KeyError: + pass + return result + + def __setitem__(self, key, value): + """Store value in key""" + key = self._key_to_str(key) + self.fs.mkdirs(self.fs._parent(key), exist_ok=True) + self.fs.pipe_file(key, maybe_convert(value)) + + def __iter__(self): + return (self._str_to_key(x) for x in self.fs.find(self.root)) + + def __len__(self): + return len(self.fs.find(self.root)) + + def __delitem__(self, key): + """Remove key""" + try: + self.fs.rm(self._key_to_str(key)) + except Exception as exc: + raise KeyError from exc + + def __contains__(self, key): + """Does key exist in mapping?""" + path = self._key_to_str(key) + return self.fs.isfile(path) + + def __reduce__(self): + return FSMap, (self.root, self.fs, False, False, self.missing_exceptions) + + +def maybe_convert(value): + if isinstance(value, array.array) or hasattr(value, "__array__"): + # bytes-like things + if hasattr(value, "dtype") and value.dtype.kind in "Mm": + # The buffer interface doesn't support datetime64/timdelta64 numpy + # arrays + value = value.view("int64") + value = bytes(memoryview(value)) + return value + + +def get_mapper( + url="", + check=False, + create=False, + missing_exceptions=None, + alternate_root=None, + **kwargs, +): + """Create key-value interface for given URL and options + + The URL will be of the form "protocol://location" and point to the root + of the mapper required. All keys will be file-names below this location, + and their values the contents of each key. + + Also accepts compound URLs like zip::s3://bucket/file.zip , see ``fsspec.open``. 
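+
+    A short illustrative sketch (uses the in-memory filesystem bundled with
+    fsspec; any other supported protocol works the same way):
+
+    >>> m = get_mapper("memory://mystore")  # doctest: +SKIP
+    >>> m["key"] = b"value"  # doctest: +SKIP
+    >>> list(m)  # doctest: +SKIP
+    ['key']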
+ + Parameters + ---------- + url: str + Root URL of mapping + check: bool + Whether to attempt to read from the location before instantiation, to + check that the mapping does exist + create: bool + Whether to make the directory corresponding to the root before + instantiating + missing_exceptions: None or tuple + If given, these exception types will be regarded as missing keys and + return KeyError when trying to read data. By default, you get + (FileNotFoundError, IsADirectoryError, NotADirectoryError) + alternate_root: None or str + In cases of complex URLs, the parser may fail to pick the correct part + for the mapper root, so this arg can override + + Returns + ------- + ``FSMap`` instance, the dict-like key-value store. + """ + # Removing protocol here - could defer to each open() on the backend + fs, urlpath = url_to_fs(url, **kwargs) + root = alternate_root if alternate_root is not None else urlpath + return FSMap(root, fs, check, create, missing_exceptions=missing_exceptions) diff --git a/env/lib/python3.13/site-packages/fsspec/parquet.py b/env/lib/python3.13/site-packages/fsspec/parquet.py new file mode 100644 index 0000000000000000000000000000000000000000..0bca67fde330f94400ace9b08d503b53bbf8087f --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/parquet.py @@ -0,0 +1,580 @@ +import io +import json +import warnings +from typing import Literal + +import fsspec + +from .core import url_to_fs +from .spec import AbstractBufferedFile +from .utils import merge_offset_ranges + +# Parquet-Specific Utilities for fsspec +# +# Most of the functions defined in this module are NOT +# intended for public consumption. The only exception +# to this is `open_parquet_file`, which should be used +# place of `fs.open()` to open parquet-formatted files +# on remote file systems. + + +class AlreadyBufferedFile(AbstractBufferedFile): + def _fetch_range(self, start, end): + raise NotImplementedError + + +def open_parquet_files( + path: list[str], + mode: Literal["rb"] = "rb", + fs: None | fsspec.AbstractFileSystem = None, + metadata=None, + columns: None | list[str] = None, + row_groups: None | list[int] = None, + storage_options: None | dict = None, + engine: str = "auto", + max_gap: int = 64_000, + max_block: int = 256_000_000, + footer_sample_size: int = 1_000_000, + filters: None | list[list[list[str]]] = None, + **kwargs, +): + """ + Return a file-like object for a single Parquet file. + + The specified parquet `engine` will be used to parse the + footer metadata, and determine the required byte ranges + from the file. The target path will then be opened with + the "parts" (`KnownPartsOfAFile`) caching strategy. + + Note that this method is intended for usage with remote + file systems, and is unlikely to improve parquet-read + performance on local file systems. + + Parameters + ---------- + path: str + Target file path. + mode: str, optional + Mode option to be passed through to `fs.open`. Default is "rb". + metadata: Any, optional + Parquet metadata object. Object type must be supported + by the backend parquet engine. For now, only the "fastparquet" + engine supports an explicit `ParquetFile` metadata object. + If a metadata object is supplied, the remote footer metadata + will not need to be transferred into local memory. + fs: AbstractFileSystem, optional + Filesystem object to use for opening the file. If nothing is + specified, an `AbstractFileSystem` object will be inferred. + engine : str, default "auto" + Parquet engine to use for metadata parsing. 
Allowed options + include "fastparquet", "pyarrow", and "auto". The specified + engine must be installed in the current environment. If + "auto" is specified, and both engines are installed, + "fastparquet" will take precedence over "pyarrow". + columns: list, optional + List of all column names that may be read from the file. + row_groups : list, optional + List of all row-groups that may be read from the file. This + may be a list of row-group indices (integers), or it may be + a list of `RowGroup` metadata objects (if the "fastparquet" + engine is used). + storage_options : dict, optional + Used to generate an `AbstractFileSystem` object if `fs` was + not specified. + max_gap : int, optional + Neighboring byte ranges will only be merged when their + inter-range gap is <= `max_gap`. Default is 64KB. + max_block : int, optional + Neighboring byte ranges will only be merged when the size of + the aggregated range is <= `max_block`. Default is 256MB. + footer_sample_size : int, optional + Number of bytes to read from the end of the path to look + for the footer metadata. If the sampled bytes do not contain + the footer, a second read request will be required, and + performance will suffer. Default is 1MB. + filters : list[list], optional + List of filters to apply to prevent reading row groups, of the + same format as accepted by the loading engines. Ignored if + ``row_groups`` is specified. + **kwargs : + Optional key-word arguments to pass to `fs.open` + """ + + # Make sure we have an `AbstractFileSystem` object + # to work with + if fs is None: + path0 = path + if isinstance(path, (list, tuple)): + path = path[0] + fs, path = url_to_fs(path, **(storage_options or {})) + else: + path0 = path + + # For now, `columns == []` not supported, is the same + # as all columns + if columns is not None and len(columns) == 0: + columns = None + + # Set the engine + engine = _set_engine(engine) + + if isinstance(path0, (list, tuple)): + paths = path0 + elif "*" in path: + paths = fs.glob(path) + elif path0.endswith("/"): # or fs.isdir(path): + paths = [ + _ + for _ in fs.find(path, withdirs=False, detail=False) + if _.endswith((".parquet", ".parq")) + ] + else: + paths = [path] + + data = _get_parquet_byte_ranges( + paths, + fs, + metadata=metadata, + columns=columns, + row_groups=row_groups, + engine=engine, + max_gap=max_gap, + max_block=max_block, + footer_sample_size=footer_sample_size, + filters=filters, + ) + + # Call self.open with "parts" caching + options = kwargs.pop("cache_options", {}).copy() + return [ + AlreadyBufferedFile( + fs=None, + path=fn, + mode=mode, + cache_type="parts", + cache_options={ + **options, + "data": data.get(fn, {}), + }, + size=max(_[1] for _ in data.get(fn, {})), + **kwargs, + ) + for fn in data + ] + + +def open_parquet_file(*args, **kwargs): + """Create files tailed to reading specific parts of parquet files + + Please see ``open_parquet_files`` for details of the arguments. The + difference is, this function always returns a single ``AleadyBufferedFile``, + whereas `open_parquet_files`` always returns a list of files, even if + there are one or zero matching parquet files. + """ + return open_parquet_files(*args, **kwargs)[0] + + +def _get_parquet_byte_ranges( + paths, + fs, + metadata=None, + columns=None, + row_groups=None, + max_gap=64_000, + max_block=256_000_000, + footer_sample_size=1_000_000, + engine="auto", + filters=None, +): + """Get a dictionary of the known byte ranges needed + to read a specific column/row-group selection from a + Parquet dataset. 
Each value in the output dictionary + is intended for use as the `data` argument for the + `KnownPartsOfAFile` caching strategy of a single path. + """ + + # Set engine if necessary + if isinstance(engine, str): + engine = _set_engine(engine) + + # Pass to specialized function if metadata is defined + if metadata is not None: + # Use the provided parquet metadata object + # to avoid transferring/parsing footer metadata + return _get_parquet_byte_ranges_from_metadata( + metadata, + fs, + engine, + columns=columns, + row_groups=row_groups, + max_gap=max_gap, + max_block=max_block, + filters=filters, + ) + + # Get file sizes asynchronously + file_sizes = fs.sizes(paths) + + # Populate global paths, starts, & ends + result = {} + data_paths = [] + data_starts = [] + data_ends = [] + add_header_magic = True + if columns is None and row_groups is None and filters is None: + # We are NOT selecting specific columns or row-groups. + # + # We can avoid sampling the footers, and just transfer + # all file data with cat_ranges + for i, path in enumerate(paths): + result[path] = {} + data_paths.append(path) + data_starts.append(0) + data_ends.append(file_sizes[i]) + add_header_magic = False # "Magic" should already be included + else: + # We ARE selecting specific columns or row-groups. + # + # Gather file footers. + # We just take the last `footer_sample_size` bytes of each + # file (or the entire file if it is smaller than that) + footer_starts = [] + footer_ends = [] + for i, path in enumerate(paths): + footer_ends.append(file_sizes[i]) + sample_size = max(0, file_sizes[i] - footer_sample_size) + footer_starts.append(sample_size) + footer_samples = fs.cat_ranges(paths, footer_starts, footer_ends) + + # Check our footer samples and re-sample if necessary. + missing_footer_starts = footer_starts.copy() + large_footer = 0 + for i, path in enumerate(paths): + footer_size = int.from_bytes(footer_samples[i][-8:-4], "little") + real_footer_start = file_sizes[i] - (footer_size + 8) + if real_footer_start < footer_starts[i]: + missing_footer_starts[i] = real_footer_start + large_footer = max(large_footer, (footer_size + 8)) + if large_footer: + warnings.warn( + f"Not enough data was used to sample the parquet footer. " + f"Try setting footer_sample_size >= {large_footer}." 
+ ) + for i, block in enumerate( + fs.cat_ranges( + paths, + missing_footer_starts, + footer_starts, + ) + ): + footer_samples[i] = block + footer_samples[i] + footer_starts[i] = missing_footer_starts[i] + + # Calculate required byte ranges for each path + for i, path in enumerate(paths): + # Use "engine" to collect data byte ranges + path_data_starts, path_data_ends = engine._parquet_byte_ranges( + columns, + row_groups=row_groups, + footer=footer_samples[i], + footer_start=footer_starts[i], + filters=filters, + ) + + data_paths += [path] * len(path_data_starts) + data_starts += path_data_starts + data_ends += path_data_ends + result.setdefault(path, {})[(footer_starts[i], file_sizes[i])] = ( + footer_samples[i] + ) + + # Merge adjacent offset ranges + data_paths, data_starts, data_ends = merge_offset_ranges( + data_paths, + data_starts, + data_ends, + max_gap=max_gap, + max_block=max_block, + sort=False, # Should already be sorted + ) + + # Start by populating `result` with footer samples + for i, path in enumerate(paths): + result[path] = {(footer_starts[i], footer_ends[i]): footer_samples[i]} + + # Transfer the data byte-ranges into local memory + _transfer_ranges(fs, result, data_paths, data_starts, data_ends) + + # Add b"PAR1" to header if necessary + if add_header_magic: + _add_header_magic(result) + + return result + + +def _get_parquet_byte_ranges_from_metadata( + metadata, + fs, + engine, + columns=None, + row_groups=None, + max_gap=64_000, + max_block=256_000_000, + filters=None, +): + """Simplified version of `_get_parquet_byte_ranges` for + the case that an engine-specific `metadata` object is + provided, and the remote footer metadata does not need to + be transferred before calculating the required byte ranges. + """ + + # Use "engine" to collect data byte ranges + data_paths, data_starts, data_ends = engine._parquet_byte_ranges( + columns, row_groups=row_groups, metadata=metadata, filters=filters + ) + + # Merge adjacent offset ranges + data_paths, data_starts, data_ends = merge_offset_ranges( + data_paths, + data_starts, + data_ends, + max_gap=max_gap, + max_block=max_block, + sort=False, # Should be sorted + ) + + # Transfer the data byte-ranges into local memory + result = {fn: {} for fn in list(set(data_paths))} + _transfer_ranges(fs, result, data_paths, data_starts, data_ends) + + # Add b"PAR1" to header + _add_header_magic(result) + + return result + + +def _transfer_ranges(fs, blocks, paths, starts, ends): + # Use cat_ranges to gather the data byte_ranges + ranges = (paths, starts, ends) + for path, start, stop, data in zip(*ranges, fs.cat_ranges(*ranges)): + blocks[path][(start, stop)] = data + + +def _add_header_magic(data): + # Add b"PAR1" to file headers + for path in list(data.keys()): + add_magic = True + for k in data[path]: + if k[0] == 0 and k[1] >= 4: + add_magic = False + break + if add_magic: + data[path][(0, 4)] = b"PAR1" + + +def _set_engine(engine_str): + # Define a list of parquet engines to try + if engine_str == "auto": + try_engines = ("fastparquet", "pyarrow") + elif not isinstance(engine_str, str): + raise ValueError( + "Failed to set parquet engine! 
" + "Please pass 'fastparquet', 'pyarrow', or 'auto'" + ) + elif engine_str not in ("fastparquet", "pyarrow"): + raise ValueError(f"{engine_str} engine not supported by `fsspec.parquet`") + else: + try_engines = [engine_str] + + # Try importing the engines in `try_engines`, + # and choose the first one that succeeds + for engine in try_engines: + try: + if engine == "fastparquet": + return FastparquetEngine() + elif engine == "pyarrow": + return PyarrowEngine() + except ImportError: + pass + + # Raise an error if a supported parquet engine + # was not found + raise ImportError( + f"The following parquet engines are not installed " + f"in your python environment: {try_engines}." + f"Please install 'fastparquert' or 'pyarrow' to " + f"utilize the `fsspec.parquet` module." + ) + + +class FastparquetEngine: + # The purpose of the FastparquetEngine class is + # to check if fastparquet can be imported (on initialization) + # and to define a `_parquet_byte_ranges` method. In the + # future, this class may also be used to define other + # methods/logic that are specific to fastparquet. + + def __init__(self): + import fastparquet as fp + + self.fp = fp + + def _row_group_filename(self, row_group, pf): + return pf.row_group_filename(row_group) + + def _parquet_byte_ranges( + self, + columns, + row_groups=None, + metadata=None, + footer=None, + footer_start=None, + filters=None, + ): + # Initialize offset ranges and define ParqetFile metadata + pf = metadata + data_paths, data_starts, data_ends = [], [], [] + if filters and row_groups: + raise ValueError("filters and row_groups cannot be used together") + if pf is None: + pf = self.fp.ParquetFile(io.BytesIO(footer)) + + # Convert columns to a set and add any index columns + # specified in the pandas metadata (just in case) + column_set = None if columns is None else {c.split(".", 1)[0] for c in columns} + if column_set is not None and hasattr(pf, "pandas_metadata"): + md_index = [ + ind + for ind in pf.pandas_metadata.get("index_columns", []) + # Ignore RangeIndex information + if not isinstance(ind, dict) + ] + column_set |= set(md_index) + + # Check if row_groups is a list of integers + # or a list of row-group metadata + if filters: + from fastparquet.api import filter_row_groups + + row_group_indices = None + row_groups = filter_row_groups(pf, filters) + elif row_groups and not isinstance(row_groups[0], int): + # Input row_groups contains row-group metadata + row_group_indices = None + else: + # Input row_groups contains row-group indices + row_group_indices = row_groups + row_groups = pf.row_groups + + # Loop through column chunks to add required byte ranges + for r, row_group in enumerate(row_groups): + # Skip this row-group if we are targeting + # specific row-groups + if row_group_indices is None or r in row_group_indices: + # Find the target parquet-file path for `row_group` + fn = self._row_group_filename(row_group, pf) + + for column in row_group.columns: + name = column.meta_data.path_in_schema[0] + # Skip this column if we are targeting a + # specific columns + if column_set is None or name in column_set: + file_offset0 = column.meta_data.dictionary_page_offset + if file_offset0 is None: + file_offset0 = column.meta_data.data_page_offset + num_bytes = column.meta_data.total_compressed_size + if footer_start is None or file_offset0 < footer_start: + data_paths.append(fn) + data_starts.append(file_offset0) + data_ends.append( + min( + file_offset0 + num_bytes, + footer_start or (file_offset0 + num_bytes), + ) + ) + + if metadata: + # The 
metadata in this call may map to multiple + # file paths. Need to include `data_paths` + return data_paths, data_starts, data_ends + return data_starts, data_ends + + +class PyarrowEngine: + # The purpose of the PyarrowEngine class is + # to check if pyarrow can be imported (on initialization) + # and to define a `_parquet_byte_ranges` method. In the + # future, this class may also be used to define other + # methods/logic that are specific to pyarrow. + + def __init__(self): + import pyarrow.parquet as pq + + self.pq = pq + + def _row_group_filename(self, row_group, metadata): + raise NotImplementedError + + def _parquet_byte_ranges( + self, + columns, + row_groups=None, + metadata=None, + footer=None, + footer_start=None, + filters=None, + ): + if metadata is not None: + raise ValueError("metadata input not supported for PyarrowEngine") + if filters: + raise NotImplementedError + + data_starts, data_ends = [], [] + md = self.pq.ParquetFile(io.BytesIO(footer)).metadata + + # Convert columns to a set and add any index columns + # specified in the pandas metadata (just in case) + column_set = None if columns is None else set(columns) + if column_set is not None: + schema = md.schema.to_arrow_schema() + has_pandas_metadata = ( + schema.metadata is not None and b"pandas" in schema.metadata + ) + if has_pandas_metadata: + md_index = [ + ind + for ind in json.loads( + schema.metadata[b"pandas"].decode("utf8") + ).get("index_columns", []) + # Ignore RangeIndex information + if not isinstance(ind, dict) + ] + column_set |= set(md_index) + + # Loop through column chunks to add required byte ranges + for r in range(md.num_row_groups): + # Skip this row-group if we are targeting + # specific row-groups + if row_groups is None or r in row_groups: + row_group = md.row_group(r) + for c in range(row_group.num_columns): + column = row_group.column(c) + name = column.path_in_schema + # Skip this column if we are targeting a + # specific columns + split_name = name.split(".")[0] + if ( + column_set is None + or name in column_set + or split_name in column_set + ): + file_offset0 = column.dictionary_page_offset + if file_offset0 is None: + file_offset0 = column.data_page_offset + num_bytes = column.total_compressed_size + if file_offset0 < footer_start: + data_starts.append(file_offset0) + data_ends.append( + min(file_offset0 + num_bytes, footer_start) + ) + return data_starts, data_ends diff --git a/env/lib/python3.13/site-packages/fsspec/registry.py b/env/lib/python3.13/site-packages/fsspec/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..9c50ef02ad35395be85cf50ccb0db32e0f7aeb1f --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/registry.py @@ -0,0 +1,333 @@ +from __future__ import annotations + +import importlib +import types +import warnings + +__all__ = ["registry", "get_filesystem_class", "default"] + +# internal, mutable +_registry: dict[str, type] = {} + +# external, immutable +registry = types.MappingProxyType(_registry) +default = "file" + + +def register_implementation(name, cls, clobber=False, errtxt=None): + """Add implementation class to the registry + + Parameters + ---------- + name: str + Protocol name to associate with the class + cls: class or str + if a class: fsspec-compliant implementation class (normally inherits from + ``fsspec.AbstractFileSystem``, gets added straight to the registry. 
If a + str, the full path to an implementation class like package.module.class, + which gets added to known_implementations, + so the import is deferred until the filesystem is actually used. + clobber: bool (optional) + Whether to overwrite a protocol with the same name; if False, will raise + instead. + errtxt: str (optional) + If given, then a failure to import the given class will result in this + text being given. + """ + if isinstance(cls, str): + if name in known_implementations and clobber is False: + if cls != known_implementations[name]["class"]: + raise ValueError( + f"Name ({name}) already in the known_implementations and clobber " + f"is False" + ) + else: + known_implementations[name] = { + "class": cls, + "err": errtxt or f"{cls} import failed for protocol {name}", + } + + else: + if name in registry and clobber is False: + if _registry[name] is not cls: + raise ValueError( + f"Name ({name}) already in the registry and clobber is False" + ) + else: + _registry[name] = cls + + +# protocols mapped to the class which implements them. This dict can be +# updated with register_implementation +known_implementations = { + "abfs": { + "class": "adlfs.AzureBlobFileSystem", + "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage", + }, + "adl": { + "class": "adlfs.AzureDatalakeFileSystem", + "err": "Install adlfs to access Azure Datalake Gen1", + }, + "arrow_hdfs": { + "class": "fsspec.implementations.arrow.HadoopFileSystem", + "err": "pyarrow and local java libraries required for HDFS", + }, + "async_wrapper": { + "class": "fsspec.implementations.asyn_wrapper.AsyncFileSystemWrapper", + }, + "asynclocal": { + "class": "morefs.asyn_local.AsyncLocalFileSystem", + "err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem", + }, + "asyncwrapper": { + "class": "fsspec.implementations.asyn_wrapper.AsyncFileSystemWrapper", + }, + "az": { + "class": "adlfs.AzureBlobFileSystem", + "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage", + }, + "blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"}, + "box": { + "class": "boxfs.BoxFileSystem", + "err": "Please install boxfs to access BoxFileSystem", + }, + "cached": {"class": "fsspec.implementations.cached.CachingFileSystem"}, + "dask": { + "class": "fsspec.implementations.dask.DaskWorkerFileSystem", + "err": "Install dask distributed to access worker file system", + }, + "data": {"class": "fsspec.implementations.data.DataFileSystem"}, + "dbfs": { + "class": "fsspec.implementations.dbfs.DatabricksFileSystem", + "err": "Install the requests package to use the DatabricksFileSystem", + }, + "dir": {"class": "fsspec.implementations.dirfs.DirFileSystem"}, + "dropbox": { + "class": "dropboxdrivefs.DropboxDriveFileSystem", + "err": ( + 'DropboxFileSystem requires "dropboxdrivefs","requests" and "' + '"dropbox" to be installed' + ), + }, + "dvc": { + "class": "dvc.api.DVCFileSystem", + "err": "Install dvc to access DVCFileSystem", + }, + "file": {"class": "fsspec.implementations.local.LocalFileSystem"}, + "filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"}, + "ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"}, + "gcs": { + "class": "gcsfs.GCSFileSystem", + "err": "Please install gcsfs to access Google Storage", + }, + "gdrive": { + "class": "gdrive_fsspec.GoogleDriveFileSystem", + "err": "Please install gdrive_fs for access to Google Drive", + }, + "generic": {"class": "fsspec.generic.GenericFileSystem"}, + "gist": { + "class": 
"fsspec.implementations.gist.GistFileSystem", + "err": "Install the requests package to use the gist FS", + }, + "git": { + "class": "fsspec.implementations.git.GitFileSystem", + "err": "Install pygit2 to browse local git repos", + }, + "github": { + "class": "fsspec.implementations.github.GithubFileSystem", + "err": "Install the requests package to use the github FS", + }, + "gs": { + "class": "gcsfs.GCSFileSystem", + "err": "Please install gcsfs to access Google Storage", + }, + "hdfs": { + "class": "fsspec.implementations.arrow.HadoopFileSystem", + "err": "pyarrow and local java libraries required for HDFS", + }, + "hf": { + "class": "huggingface_hub.HfFileSystem", + "err": "Install huggingface_hub to access HfFileSystem", + }, + "http": { + "class": "fsspec.implementations.http.HTTPFileSystem", + "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed', + }, + "https": { + "class": "fsspec.implementations.http.HTTPFileSystem", + "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed', + }, + "jlab": { + "class": "fsspec.implementations.jupyter.JupyterFileSystem", + "err": "Jupyter FS requires requests to be installed", + }, + "jupyter": { + "class": "fsspec.implementations.jupyter.JupyterFileSystem", + "err": "Jupyter FS requires requests to be installed", + }, + "lakefs": { + "class": "lakefs_spec.LakeFSFileSystem", + "err": "Please install lakefs-spec to access LakeFSFileSystem", + }, + "libarchive": { + "class": "fsspec.implementations.libarchive.LibArchiveFileSystem", + "err": "LibArchive requires to be installed", + }, + "local": {"class": "fsspec.implementations.local.LocalFileSystem"}, + "memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"}, + "oci": { + "class": "ocifs.OCIFileSystem", + "err": "Install ocifs to access OCI Object Storage", + }, + "ocilake": { + "class": "ocifs.OCIFileSystem", + "err": "Install ocifs to access OCI Data Lake", + }, + "oss": { + "class": "ossfs.OSSFileSystem", + "err": "Install ossfs to access Alibaba Object Storage System", + }, + "pyscript": { + "class": "pyscript_fsspec_client.client.PyscriptFileSystem", + "err": "Install requests (cpython) or run in pyscript", + }, + "reference": {"class": "fsspec.implementations.reference.ReferenceFileSystem"}, + "root": { + "class": "fsspec_xrootd.XRootDFileSystem", + "err": ( + "Install fsspec-xrootd to access xrootd storage system. 
" + "Note: 'root' is the protocol name for xrootd storage systems, " + "not referring to root directories" + ), + }, + "s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"}, + "s3a": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"}, + "sftp": { + "class": "fsspec.implementations.sftp.SFTPFileSystem", + "err": 'SFTPFileSystem requires "paramiko" to be installed', + }, + "simplecache": {"class": "fsspec.implementations.cached.SimpleCacheFileSystem"}, + "smb": { + "class": "fsspec.implementations.smb.SMBFileSystem", + "err": 'SMB requires "smbprotocol" or "smbprotocol[kerberos]" installed', + }, + "ssh": { + "class": "fsspec.implementations.sftp.SFTPFileSystem", + "err": 'SFTPFileSystem requires "paramiko" to be installed', + }, + "tar": {"class": "fsspec.implementations.tar.TarFileSystem"}, + "tos": { + "class": "tosfs.TosFileSystem", + "err": "Install tosfs to access ByteDance volcano engine Tinder Object Storage", + }, + "tosfs": { + "class": "tosfs.TosFileSystem", + "err": "Install tosfs to access ByteDance volcano engine Tinder Object Storage", + }, + "wandb": {"class": "wandbfs.WandbFS", "err": "Install wandbfs to access wandb"}, + "webdav": { + "class": "webdav4.fsspec.WebdavFileSystem", + "err": "Install webdav4 to access WebDAV", + }, + "webhdfs": { + "class": "fsspec.implementations.webhdfs.WebHDFS", + "err": 'webHDFS access requires "requests" to be installed', + }, + "zip": {"class": "fsspec.implementations.zip.ZipFileSystem"}, +} + +assert list(known_implementations) == sorted(known_implementations), ( + "Not in alphabetical order" +) + + +def get_filesystem_class(protocol): + """Fetch named protocol implementation from the registry + + The dict ``known_implementations`` maps protocol names to the locations + of classes implementing the corresponding file-system. When used for the + first time, appropriate imports will happen and the class will be placed in + the registry. All subsequent calls will fetch directly from the registry. + + Some protocol implementations require additional dependencies, and so the + import may fail. In this case, the string in the "err" field of the + ``known_implementations`` will be given as the error message. + """ + if not protocol: + protocol = default + + if protocol not in registry: + if protocol not in known_implementations: + raise ValueError(f"Protocol not known: {protocol}") + bit = known_implementations[protocol] + try: + register_implementation(protocol, _import_class(bit["class"])) + except ImportError as e: + raise ImportError(bit.get("err")) from e + cls = registry[protocol] + if getattr(cls, "protocol", None) in ("abstract", None): + cls.protocol = protocol + + return cls + + +s3_msg = """Your installed version of s3fs is very old and known to cause +severe performance issues, see also https://github.com/dask/dask/issues/10276 + +To fix, you should specify a lower version bound on s3fs, or +update the current installation. +""" + + +def _import_class(fqp: str): + """Take a fully-qualified path and return the imported class or identifier. + + ``fqp`` is of the form "package.module.klass" or + "package.module:subobject.klass". + + Warnings + -------- + This can import arbitrary modules. Make sure you haven't installed any modules + that may execute malicious code at import time. 
+ """ + if ":" in fqp: + mod, name = fqp.rsplit(":", 1) + else: + mod, name = fqp.rsplit(".", 1) + + is_s3 = mod == "s3fs" + mod = importlib.import_module(mod) + if is_s3 and mod.__version__.split(".") < ["0", "5"]: + warnings.warn(s3_msg) + for part in name.split("."): + mod = getattr(mod, part) + + if not isinstance(mod, type): + raise TypeError(f"{fqp} is not a class") + + return mod + + +def filesystem(protocol, **storage_options): + """Instantiate filesystems for given protocol and arguments + + ``storage_options`` are specific to the protocol being chosen, and are + passed directly to the class. + """ + if protocol == "arrow_hdfs": + warnings.warn( + "The 'arrow_hdfs' protocol has been deprecated and will be " + "removed in the future. Specify it as 'hdfs'.", + DeprecationWarning, + ) + + cls = get_filesystem_class(protocol) + return cls(**storage_options) + + +def available_protocols(): + """Return a list of the implemented protocols. + + Note that any given protocol may require extra packages to be importable. + """ + return list(known_implementations) diff --git a/env/lib/python3.13/site-packages/fsspec/spec.py b/env/lib/python3.13/site-packages/fsspec/spec.py new file mode 100644 index 0000000000000000000000000000000000000000..b67d5c16fcdf09ce6f9e1354727196042cde3c4c --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/spec.py @@ -0,0 +1,2281 @@ +from __future__ import annotations + +import io +import json +import logging +import os +import threading +import warnings +import weakref +from errno import ESPIPE +from glob import has_magic +from hashlib import sha256 +from typing import Any, ClassVar + +from .callbacks import DEFAULT_CALLBACK +from .config import apply_config, conf +from .dircache import DirCache +from .transaction import Transaction +from .utils import ( + _unstrip_protocol, + glob_translate, + isfilelike, + other_paths, + read_block, + stringify_path, + tokenize, +) + +logger = logging.getLogger("fsspec") + + +def make_instance(cls, args, kwargs): + return cls(*args, **kwargs) + + +class _Cached(type): + """ + Metaclass for caching file system instances. + + Notes + ----- + Instances are cached according to + + * The values of the class attributes listed in `_extra_tokenize_attributes` + * The arguments passed to ``__init__``. + + This creates an additional reference to the filesystem, which prevents the + filesystem from being garbage collected when all *user* references go away. + A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also* + be made for a filesystem instance to be garbage collected. + """ + + def __init__(cls, *args, **kwargs): + super().__init__(*args, **kwargs) + # Note: we intentionally create a reference here, to avoid garbage + # collecting instances when all other references are gone. To really + # delete a FileSystem, the cache must be cleared. 
+ if conf.get("weakref_instance_cache"): # pragma: no cover + # debug option for analysing fork/spawn conditions + cls._cache = weakref.WeakValueDictionary() + else: + cls._cache = {} + cls._pid = os.getpid() + + def __call__(cls, *args, **kwargs): + kwargs = apply_config(cls, kwargs) + extra_tokens = tuple( + getattr(cls, attr, None) for attr in cls._extra_tokenize_attributes + ) + strip_tokenize_options = { + k: kwargs.pop(k) for k in cls._strip_tokenize_options if k in kwargs + } + token = tokenize( + cls, cls._pid, threading.get_ident(), *args, *extra_tokens, **kwargs + ) + skip = kwargs.pop("skip_instance_cache", False) + if os.getpid() != cls._pid: + cls._cache.clear() + cls._pid = os.getpid() + if not skip and cls.cachable and token in cls._cache: + cls._latest = token + return cls._cache[token] + else: + obj = super().__call__(*args, **kwargs, **strip_tokenize_options) + # Setting _fs_token here causes some static linters to complain. + obj._fs_token_ = token + obj.storage_args = args + obj.storage_options = kwargs + if obj.async_impl and obj.mirror_sync_methods: + from .asyn import mirror_sync_methods + + mirror_sync_methods(obj) + + if cls.cachable and not skip: + cls._latest = token + cls._cache[token] = obj + return obj + + +class AbstractFileSystem(metaclass=_Cached): + """ + An abstract super-class for pythonic file-systems + + Implementations are expected to be compatible with or, better, subclass + from here. + """ + + cachable = True # this class can be cached, instances reused + _cached = False + blocksize = 2**22 + sep = "/" + protocol: ClassVar[str | tuple[str, ...]] = "abstract" + _latest = None + async_impl = False + mirror_sync_methods = False + root_marker = "" # For some FSs, may require leading '/' or other character + transaction_type = Transaction + + #: Extra *class attributes* that should be considered when hashing. + _extra_tokenize_attributes = () + #: *storage options* that should not be considered when hashing. + _strip_tokenize_options = () + + # Set by _Cached metaclass + storage_args: tuple[Any, ...] + storage_options: dict[str, Any] + + def __init__(self, *args, **storage_options): + """Create and configure file-system instance + + Instances may be cachable, so if similar enough arguments are seen + a new instance is not required. The token attribute exists to allow + implementations to cache instances if they wish. + + A reasonable default should be provided if there are no arguments. + + Subclasses should call this method. + + Parameters + ---------- + use_listings_cache, listings_expiry_time, max_paths: + passed to ``DirCache``, if the implementation supports + directory listing caching. Pass use_listings_cache=False + to disable such caching. + skip_instance_cache: bool + If this is a cachable implementation, pass True here to force + creating a new instance even if a matching instance exists, and prevent + storing this instance. 
+ asynchronous: bool + loop: asyncio-compatible IOLoop or None + """ + if self._cached: + # reusing instance, don't change + return + self._cached = True + self._intrans = False + self._transaction = None + self._invalidated_caches_in_transaction = [] + self.dircache = DirCache(**storage_options) + + if storage_options.pop("add_docs", None): + warnings.warn("add_docs is no longer supported.", FutureWarning) + + if storage_options.pop("add_aliases", None): + warnings.warn("add_aliases has been removed.", FutureWarning) + # This is set in _Cached + self._fs_token_ = None + + @property + def fsid(self): + """Persistent filesystem id that can be used to compare filesystems + across sessions. + """ + raise NotImplementedError + + @property + def _fs_token(self): + return self._fs_token_ + + def __dask_tokenize__(self): + return self._fs_token + + def __hash__(self): + return int(self._fs_token, 16) + + def __eq__(self, other): + return isinstance(other, type(self)) and self._fs_token == other._fs_token + + def __reduce__(self): + return make_instance, (type(self), self.storage_args, self.storage_options) + + @classmethod + def _strip_protocol(cls, path): + """Turn path from fully-qualified to file-system-specific + + May require FS-specific handling, e.g., for relative paths or links. + """ + if isinstance(path, list): + return [cls._strip_protocol(p) for p in path] + path = stringify_path(path) + protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol + for protocol in protos: + if path.startswith(protocol + "://"): + path = path[len(protocol) + 3 :] + elif path.startswith(protocol + "::"): + path = path[len(protocol) + 2 :] + path = path.rstrip("/") + # use of root_marker to make minimum required path, e.g., "/" + return path or cls.root_marker + + def unstrip_protocol(self, name: str) -> str: + """Format FS-specific path to generic, including protocol""" + protos = (self.protocol,) if isinstance(self.protocol, str) else self.protocol + for protocol in protos: + if name.startswith(f"{protocol}://"): + return name + return f"{protos[0]}://{name}" + + @staticmethod + def _get_kwargs_from_urls(path): + """If kwargs can be encoded in the paths, extract them here + + This should happen before instantiation of the class; incoming paths + then should be amended to strip the options in methods. + + Examples may look like an sftp path "sftp://user@host:/my/path", where + the user and host should become kwargs and later get stripped. + """ + # by default, nothing happens + return {} + + @classmethod + def current(cls): + """Return the most recently instantiated FileSystem + + If no instance has been created, then create one with defaults + """ + if cls._latest in cls._cache: + return cls._cache[cls._latest] + return cls() + + @property + def transaction(self): + """A context within which files are committed together upon exit + + Requires the file class to implement `.commit()` and `.discard()` + for the normal and exception cases. + """ + if self._transaction is None: + self._transaction = self.transaction_type(self) + return self._transaction + + def start_transaction(self): + """Begin write transaction for deferring files, non-context version""" + self._intrans = True + self._transaction = self.transaction_type(self) + return self.transaction + + def end_transaction(self): + """Finish write transaction, non-context version""" + self.transaction.complete() + self._transaction = None + # The invalid cache must be cleared after the transaction is completed. 
+ for path in self._invalidated_caches_in_transaction: + self.invalidate_cache(path) + self._invalidated_caches_in_transaction.clear() + + def invalidate_cache(self, path=None): + """ + Discard any cached directory information + + Parameters + ---------- + path: string or None + If None, clear all listings cached else listings at or under given + path. + """ + # Not necessary to implement invalidation mechanism, may have no cache. + # But if have, you should call this method of parent class from your + # subclass to ensure expiring caches after transacations correctly. + # See the implementation of FTPFileSystem in ftp.py + if self._intrans: + self._invalidated_caches_in_transaction.append(path) + + def mkdir(self, path, create_parents=True, **kwargs): + """ + Create directory entry at path + + For systems that don't have true directories, may create an for + this instance only and not touch the real filesystem + + Parameters + ---------- + path: str + location + create_parents: bool + if True, this is equivalent to ``makedirs`` + kwargs: + may be permissions, etc. + """ + pass # not necessary to implement, may not have directories + + def makedirs(self, path, exist_ok=False): + """Recursively make directories + + Creates directory at path and any intervening required directories. + Raises exception if, for instance, the path already exists but is a + file. + + Parameters + ---------- + path: str + leaf directory name + exist_ok: bool (False) + If False, will error if the target already exists + """ + pass # not necessary to implement, may not have directories + + def rmdir(self, path): + """Remove a directory, if empty""" + pass # not necessary to implement, may not have directories + + def ls(self, path, detail=True, **kwargs): + """List objects at path. + + This should include subdirectories and files at that location. The + difference between a file and a directory must be clear when details + are requested. + + The specific keys, or perhaps a FileInfo class, or similar, is TBD, + but must be consistent across implementations. + Must include: + + - full path to the entry (without protocol) + - size of the entry, in bytes. If the value cannot be determined, will + be ``None``. + - type of entry, "file", "directory" or other + + Additional information + may be present, appropriate to the file-system, e.g., generation, + checksum, etc. + + May use refresh=True|False to allow use of self._ls_from_cache to + check for a saved listing and avoid calling the backend. This would be + common where listing may be expensive. + + Parameters + ---------- + path: str + detail: bool + if True, gives a list of dictionaries, where each is the same as + the result of ``info(path)``. If False, gives a list of paths + (str). + kwargs: may have additional backend-specific options, such as version + information + + Returns + ------- + List of strings if detail is False, or list of directory information + dicts if detail is True. + """ + raise NotImplementedError + + def _ls_from_cache(self, path): + """Check cache for listing + + Returns listing, if found (may be empty list for a directly that exists + but contains nothing), None if not in cache. 
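+
+        For example, once ``ls("bucket/dir")`` has populated ``self.dircache``,
+        a later ``_ls_from_cache("bucket/dir/file.txt")`` can answer from the
+        cached parent listing without contacting the backend (the paths here
+        are purely illustrative).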
+ """ + parent = self._parent(path) + try: + return self.dircache[path.rstrip("/")] + except KeyError: + pass + try: + files = [ + f + for f in self.dircache[parent] + if f["name"] == path + or (f["name"] == path.rstrip("/") and f["type"] == "directory") + ] + if len(files) == 0: + # parent dir was listed but did not contain this file + raise FileNotFoundError(path) + return files + except KeyError: + pass + + def walk(self, path, maxdepth=None, topdown=True, on_error="omit", **kwargs): + """Return all files under the given path. + + List all files, recursing into subdirectories; output is iterator-style, + like ``os.walk()``. For a simple list of files, ``find()`` is available. + + When topdown is True, the caller can modify the dirnames list in-place (perhaps + using del or slice assignment), and walk() will + only recurse into the subdirectories whose names remain in dirnames; + this can be used to prune the search, impose a specific order of visiting, + or even to inform walk() about directories the caller creates or renames before + it resumes walk() again. + Modifying dirnames when topdown is False has no effect. (see os.walk) + + Note that the "files" outputted will include anything that is not + a directory, such as links. + + Parameters + ---------- + path: str + Root to recurse into + maxdepth: int + Maximum recursion depth. None means limitless, but not recommended + on link-based file-systems. + topdown: bool (True) + Whether to walk the directory tree from the top downwards or from + the bottom upwards. + on_error: "omit", "raise", a callable + if omit (default), path with exception will simply be empty; + If raise, an underlying exception will be raised; + if callable, it will be called with a single OSError instance as argument + kwargs: passed to ``ls`` + """ + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + path = self._strip_protocol(path) + full_dirs = {} + dirs = {} + files = {} + + detail = kwargs.pop("detail", False) + try: + listing = self.ls(path, detail=True, **kwargs) + except (FileNotFoundError, OSError) as e: + if on_error == "raise": + raise + if callable(on_error): + on_error(e) + return + + for info in listing: + # each info name must be at least [path]/part , but here + # we check also for names like [path]/part/ + pathname = info["name"].rstrip("/") + name = pathname.rsplit("/", 1)[-1] + if info["type"] == "directory" and pathname != path: + # do not include "self" path + full_dirs[name] = pathname + dirs[name] = info + elif pathname == path: + # file-like with same name as give path + files[""] = info + else: + files[name] = info + + if not detail: + dirs = list(dirs) + files = list(files) + + if topdown: + # Yield before recursion if walking top down + yield path, dirs, files + + if maxdepth is not None: + maxdepth -= 1 + if maxdepth < 1: + if not topdown: + yield path, dirs, files + return + + for d in dirs: + yield from self.walk( + full_dirs[d], + maxdepth=maxdepth, + detail=detail, + topdown=topdown, + **kwargs, + ) + + if not topdown: + # Yield after recursion if walking bottom up + yield path, dirs, files + + def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs): + """List all files below path. + + Like posix ``find`` command without conditions + + Parameters + ---------- + path : str + maxdepth: int or None + If not None, the maximum number of levels to descend + withdirs: bool + Whether to include directory paths in the output. 
This is True + when used by glob, but users usually only want files. + kwargs are passed to ``ls``. + """ + # TODO: allow equivalent of -name parameter + path = self._strip_protocol(path) + out = {} + + # Add the root directory if withdirs is requested + # This is needed for posix glob compliance + if withdirs and path != "" and self.isdir(path): + out[path] = self.info(path) + + for _, dirs, files in self.walk(path, maxdepth, detail=True, **kwargs): + if withdirs: + files.update(dirs) + out.update({info["name"]: info for name, info in files.items()}) + if not out and self.isfile(path): + # walk works on directories, but find should also return [path] + # when path happens to be a file + out[path] = {} + names = sorted(out) + if not detail: + return names + else: + return {name: out[name] for name in names} + + def du(self, path, total=True, maxdepth=None, withdirs=False, **kwargs): + """Space used by files and optionally directories within a path + + Directory size does not include the size of its contents. + + Parameters + ---------- + path: str + total: bool + Whether to sum all the file sizes + maxdepth: int or None + Maximum number of directory levels to descend, None for unlimited. + withdirs: bool + Whether to include directory paths in the output. + kwargs: passed to ``find`` + + Returns + ------- + Dict of {path: size} if total=False, or int otherwise, where numbers + refer to bytes used. + """ + sizes = {} + if withdirs and self.isdir(path): + # Include top-level directory in output + info = self.info(path) + sizes[info["name"]] = info["size"] + for f in self.find(path, maxdepth=maxdepth, withdirs=withdirs, **kwargs): + info = self.info(f) + sizes[info["name"]] = info["size"] + if total: + return sum(sizes.values()) + else: + return sizes + + def glob(self, path, maxdepth=None, **kwargs): + """Find files by glob-matching. + + Pattern matching capabilities for finding files that match the given pattern. + + Parameters + ---------- + path: str + The glob pattern to match against + maxdepth: int or None + Maximum depth for ``'**'`` patterns. Applied on the first ``'**'`` found. + Must be at least 1 if provided. 
+ kwargs: + Additional arguments passed to ``find`` (e.g., detail=True) + + Returns + ------- + List of matched paths, or dict of paths and their info if detail=True + + Notes + ----- + Supported patterns: + - '*': Matches any sequence of characters within a single directory level + - ``'**'``: Matches any number of directory levels (must be an entire path component) + - '?': Matches exactly one character + - '[abc]': Matches any character in the set + - '[a-z]': Matches any character in the range + - '[!abc]': Matches any character NOT in the set + + Special behaviors: + - If the path ends with '/', only folders are returned + - Consecutive '*' characters are compressed into a single '*' + - Empty brackets '[]' never match anything + - Negated empty brackets '[!]' match any single character + - Special characters in character classes are escaped properly + + Limitations: + - ``'**'`` must be a complete path component (e.g., ``'a/**/b'``, not ``'a**b'``) + - No brace expansion ('{a,b}.txt') + - No extended glob patterns ('+(pattern)', '!(pattern)') + """ + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + import re + + seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,) + ends_with_sep = path.endswith(seps) # _strip_protocol strips trailing slash + path = self._strip_protocol(path) + append_slash_to_dirname = ends_with_sep or path.endswith( + tuple(sep + "**" for sep in seps) + ) + idx_star = path.find("*") if path.find("*") >= 0 else len(path) + idx_qmark = path.find("?") if path.find("?") >= 0 else len(path) + idx_brace = path.find("[") if path.find("[") >= 0 else len(path) + + min_idx = min(idx_star, idx_qmark, idx_brace) + + detail = kwargs.pop("detail", False) + + if not has_magic(path): + if self.exists(path, **kwargs): + if not detail: + return [path] + else: + return {path: self.info(path, **kwargs)} + else: + if not detail: + return [] # glob of non-existent returns empty + else: + return {} + elif "/" in path[:min_idx]: + min_idx = path[:min_idx].rindex("/") + root = path[: min_idx + 1] + depth = path[min_idx + 1 :].count("/") + 1 + else: + root = "" + depth = path[min_idx + 1 :].count("/") + 1 + + if "**" in path: + if maxdepth is not None: + idx_double_stars = path.find("**") + depth_double_stars = path[idx_double_stars:].count("/") + 1 + depth = depth - depth_double_stars + maxdepth + else: + depth = None + + allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs) + + pattern = glob_translate(path + ("/" if ends_with_sep else "")) + pattern = re.compile(pattern) + + out = { + p: info + for p, info in sorted(allpaths.items()) + if pattern.match( + p + "/" + if append_slash_to_dirname and info["type"] == "directory" + else p + ) + } + + if detail: + return out + else: + return list(out) + + def exists(self, path, **kwargs): + """Is there a file at the given path""" + try: + self.info(path, **kwargs) + return True + except: # noqa: E722 + # any exception allowed bar FileNotFoundError? + return False + + def lexists(self, path, **kwargs): + """If there is a file at the given path (including + broken links)""" + return self.exists(path) + + def info(self, path, **kwargs): + """Give details of entry at path + + Returns a single dictionary, with exactly the same information as ``ls`` + would with ``detail=True``. + + The default implementation calls ls and could be overridden by a + shortcut. kwargs are passed on to ```ls()``. 
+ + Some file systems might not be able to measure the file's size, in + which case, the returned dict will include ``'size': None``. + + Returns + ------- + dict with keys: name (full path in the FS), size (in bytes), type (file, + directory, or something else) and other FS-specific keys. + """ + path = self._strip_protocol(path) + out = self.ls(self._parent(path), detail=True, **kwargs) + out = [o for o in out if o["name"].rstrip("/") == path] + if out: + return out[0] + out = self.ls(path, detail=True, **kwargs) + path = path.rstrip("/") + out1 = [o for o in out if o["name"].rstrip("/") == path] + if len(out1) == 1: + if "size" not in out1[0]: + out1[0]["size"] = None + return out1[0] + elif len(out1) > 1 or out: + return {"name": path, "size": 0, "type": "directory"} + else: + raise FileNotFoundError(path) + + def checksum(self, path): + """Unique value for current version of file + + If the checksum is the same from one moment to another, the contents + are guaranteed to be the same. If the checksum changes, the contents + *might* have changed. + + This should normally be overridden; default will probably capture + creation/modification timestamp (which would be good) or maybe + access timestamp (which would be bad) + """ + return int(tokenize(self.info(path)), 16) + + def size(self, path): + """Size in bytes of file""" + return self.info(path).get("size", None) + + def sizes(self, paths): + """Size in bytes of each file in a list of paths""" + return [self.size(p) for p in paths] + + def isdir(self, path): + """Is this entry directory-like?""" + try: + return self.info(path)["type"] == "directory" + except OSError: + return False + + def isfile(self, path): + """Is this entry file-like?""" + try: + return self.info(path)["type"] == "file" + except: # noqa: E722 + return False + + def read_text(self, path, encoding=None, errors=None, newline=None, **kwargs): + """Get the contents of the file as a string. + + Parameters + ---------- + path: str + URL of file on this filesystems + encoding, errors, newline: same as `open`. + """ + with self.open( + path, + mode="r", + encoding=encoding, + errors=errors, + newline=newline, + **kwargs, + ) as f: + return f.read() + + def write_text( + self, path, value, encoding=None, errors=None, newline=None, **kwargs + ): + """Write the text to the given file. + + An existing file will be overwritten. + + Parameters + ---------- + path: str + URL of file on this filesystems + value: str + Text to write. + encoding, errors, newline: same as `open`. + """ + with self.open( + path, + mode="w", + encoding=encoding, + errors=errors, + newline=newline, + **kwargs, + ) as f: + return f.write(value) + + def cat_file(self, path, start=None, end=None, **kwargs): + """Get the content of a file + + Parameters + ---------- + path: URL of file on this filesystems + start, end: int + Bytes limits of the read. If negative, backwards from end, + like usual python slices. Either can be None for start or + end of file, respectively + kwargs: passed to ``open()``. + """ + # explicitly set buffering off? 
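+        # For example, cat_file(path, start=-16) returns the final 16 bytes and
+        # cat_file(path, start=0, end=4) the first four, mirroring normal
+        # Python slice semantics (paths illustrative).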
+ with self.open(path, "rb", **kwargs) as f: + if start is not None: + if start >= 0: + f.seek(start) + else: + f.seek(max(0, f.size + start)) + if end is not None: + if end < 0: + end = f.size + end + return f.read(end - f.tell()) + return f.read() + + def pipe_file(self, path, value, mode="overwrite", **kwargs): + """Set the bytes of given file""" + if mode == "create" and self.exists(path): + # non-atomic but simple way; or could use "xb" in open(), which is likely + # not as well supported + raise FileExistsError + with self.open(path, "wb", **kwargs) as f: + f.write(value) + + def pipe(self, path, value=None, **kwargs): + """Put value into path + + (counterpart to ``cat``) + + Parameters + ---------- + path: string or dict(str, bytes) + If a string, a single remote location to put ``value`` bytes; if a dict, + a mapping of {path: bytesvalue}. + value: bytes, optional + If using a single path, these are the bytes to put there. Ignored if + ``path`` is a dict + """ + if isinstance(path, str): + self.pipe_file(self._strip_protocol(path), value, **kwargs) + elif isinstance(path, dict): + for k, v in path.items(): + self.pipe_file(self._strip_protocol(k), v, **kwargs) + else: + raise ValueError("path must be str or dict") + + def cat_ranges( + self, paths, starts, ends, max_gap=None, on_error="return", **kwargs + ): + """Get the contents of byte ranges from one or more files + + Parameters + ---------- + paths: list + A list of of filepaths on this filesystems + starts, ends: int or list + Bytes limits of the read. If using a single int, the same value will be + used to read all the specified files. + """ + if max_gap is not None: + raise NotImplementedError + if not isinstance(paths, list): + raise TypeError + if not isinstance(starts, list): + starts = [starts] * len(paths) + if not isinstance(ends, list): + ends = [ends] * len(paths) + if len(starts) != len(paths) or len(ends) != len(paths): + raise ValueError + out = [] + for p, s, e in zip(paths, starts, ends): + try: + out.append(self.cat_file(p, s, e)) + except Exception as e: + if on_error == "return": + out.append(e) + else: + raise + return out + + def cat(self, path, recursive=False, on_error="raise", **kwargs): + """Fetch (potentially multiple) paths' contents + + Parameters + ---------- + recursive: bool + If True, assume the path(s) are directories, and get all the + contained files + on_error : "raise", "omit", "return" + If raise, an underlying exception will be raised (converted to KeyError + if the type is in self.missing_exceptions); if omit, keys with exception + will simply not be included in the output; if "return", all keys are + included in the output, but the value will be bytes or an exception + instance. 
+ kwargs: passed to cat_file + + Returns + ------- + dict of {path: contents} if there are multiple paths + or the path has been otherwise expanded + """ + paths = self.expand_path(path, recursive=recursive, **kwargs) + if ( + len(paths) > 1 + or isinstance(path, list) + or paths[0] != self._strip_protocol(path) + ): + out = {} + for path in paths: + try: + out[path] = self.cat_file(path, **kwargs) + except Exception as e: + if on_error == "raise": + raise + if on_error == "return": + out[path] = e + return out + else: + return self.cat_file(paths[0], **kwargs) + + def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, outfile=None, **kwargs): + """Copy single remote file to local""" + from .implementations.local import LocalFileSystem + + if isfilelike(lpath): + outfile = lpath + elif self.isdir(rpath): + os.makedirs(lpath, exist_ok=True) + return None + + fs = LocalFileSystem(auto_mkdir=True) + fs.makedirs(fs._parent(lpath), exist_ok=True) + + with self.open(rpath, "rb", **kwargs) as f1: + if outfile is None: + outfile = open(lpath, "wb") + + try: + callback.set_size(getattr(f1, "size", None)) + data = True + while data: + data = f1.read(self.blocksize) + segment_len = outfile.write(data) + if segment_len is None: + segment_len = len(data) + callback.relative_update(segment_len) + finally: + if not isfilelike(lpath): + outfile.close() + + def get( + self, + rpath, + lpath, + recursive=False, + callback=DEFAULT_CALLBACK, + maxdepth=None, + **kwargs, + ): + """Copy file(s) to local. + + Copies a specific file or tree of files (if recursive=True). If lpath + ends with a "/", it will be assumed to be a directory, and target files + will go within. Can submit a list of paths, which may be glob-patterns + and will be expanded. + + Calls get_file for each source. 
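+
+        Examples
+        --------
+        Illustrative only; the remote and local paths are hypothetical.
+
+        >>> fs.get("data/*.csv", "local_dir/")  # doctest: +SKIP
+        >>> fs.get("data/", "local_dir/", recursive=True)  # doctest: +SKIP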
+ """ + if isinstance(lpath, list) and isinstance(rpath, list): + # No need to expand paths when both source and destination + # are provided as lists + rpaths = rpath + lpaths = lpath + else: + from .implementations.local import ( + LocalFileSystem, + make_path_posix, + trailing_sep, + ) + + source_is_str = isinstance(rpath, str) + rpaths = self.expand_path( + rpath, recursive=recursive, maxdepth=maxdepth, **kwargs + ) + if source_is_str and (not recursive or maxdepth is not None): + # Non-recursive glob does not copy directories + rpaths = [p for p in rpaths if not (trailing_sep(p) or self.isdir(p))] + if not rpaths: + return + + if isinstance(lpath, str): + lpath = make_path_posix(lpath) + + source_is_file = len(rpaths) == 1 + dest_is_dir = isinstance(lpath, str) and ( + trailing_sep(lpath) or LocalFileSystem().isdir(lpath) + ) + + exists = source_is_str and ( + (has_magic(rpath) and source_is_file) + or (not has_magic(rpath) and dest_is_dir and not trailing_sep(rpath)) + ) + lpaths = other_paths( + rpaths, + lpath, + exists=exists, + flatten=not source_is_str, + ) + + callback.set_size(len(lpaths)) + for lpath, rpath in callback.wrap(zip(lpaths, rpaths)): + with callback.branched(rpath, lpath) as child: + self.get_file(rpath, lpath, callback=child, **kwargs) + + def put_file( + self, lpath, rpath, callback=DEFAULT_CALLBACK, mode="overwrite", **kwargs + ): + """Copy single file to remote""" + if mode == "create" and self.exists(rpath): + raise FileExistsError + if os.path.isdir(lpath): + self.makedirs(rpath, exist_ok=True) + return None + + with open(lpath, "rb") as f1: + size = f1.seek(0, 2) + callback.set_size(size) + f1.seek(0) + + self.mkdirs(self._parent(os.fspath(rpath)), exist_ok=True) + with self.open(rpath, "wb", **kwargs) as f2: + while f1.tell() < size: + data = f1.read(self.blocksize) + segment_len = f2.write(data) + if segment_len is None: + segment_len = len(data) + callback.relative_update(segment_len) + + def put( + self, + lpath, + rpath, + recursive=False, + callback=DEFAULT_CALLBACK, + maxdepth=None, + **kwargs, + ): + """Copy file(s) from local. + + Copies a specific file or tree of files (if recursive=True). If rpath + ends with a "/", it will be assumed to be a directory, and target files + will go within. + + Calls put_file for each source. 
+ """ + if isinstance(lpath, list) and isinstance(rpath, list): + # No need to expand paths when both source and destination + # are provided as lists + rpaths = rpath + lpaths = lpath + else: + from .implementations.local import ( + LocalFileSystem, + make_path_posix, + trailing_sep, + ) + + source_is_str = isinstance(lpath, str) + if source_is_str: + lpath = make_path_posix(lpath) + fs = LocalFileSystem() + lpaths = fs.expand_path( + lpath, recursive=recursive, maxdepth=maxdepth, **kwargs + ) + if source_is_str and (not recursive or maxdepth is not None): + # Non-recursive glob does not copy directories + lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))] + if not lpaths: + return + + source_is_file = len(lpaths) == 1 + dest_is_dir = isinstance(rpath, str) and ( + trailing_sep(rpath) or self.isdir(rpath) + ) + + rpath = ( + self._strip_protocol(rpath) + if isinstance(rpath, str) + else [self._strip_protocol(p) for p in rpath] + ) + exists = source_is_str and ( + (has_magic(lpath) and source_is_file) + or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath)) + ) + rpaths = other_paths( + lpaths, + rpath, + exists=exists, + flatten=not source_is_str, + ) + + callback.set_size(len(rpaths)) + for lpath, rpath in callback.wrap(zip(lpaths, rpaths)): + with callback.branched(lpath, rpath) as child: + self.put_file(lpath, rpath, callback=child, **kwargs) + + def head(self, path, size=1024): + """Get the first ``size`` bytes from file""" + with self.open(path, "rb") as f: + return f.read(size) + + def tail(self, path, size=1024): + """Get the last ``size`` bytes from file""" + with self.open(path, "rb") as f: + f.seek(max(-size, -f.size), 2) + return f.read() + + def cp_file(self, path1, path2, **kwargs): + raise NotImplementedError + + def copy( + self, path1, path2, recursive=False, maxdepth=None, on_error=None, **kwargs + ): + """Copy within two locations in the filesystem + + on_error : "raise", "ignore" + If raise, any not-found exceptions will be raised; if ignore any + not-found exceptions will cause the path to be skipped; defaults to + raise unless recursive is true, where the default is ignore + """ + if on_error is None and recursive: + on_error = "ignore" + elif on_error is None: + on_error = "raise" + + if isinstance(path1, list) and isinstance(path2, list): + # No need to expand paths when both source and destination + # are provided as lists + paths1 = path1 + paths2 = path2 + else: + from .implementations.local import trailing_sep + + source_is_str = isinstance(path1, str) + paths1 = self.expand_path( + path1, recursive=recursive, maxdepth=maxdepth, **kwargs + ) + if source_is_str and (not recursive or maxdepth is not None): + # Non-recursive glob does not copy directories + paths1 = [p for p in paths1 if not (trailing_sep(p) or self.isdir(p))] + if not paths1: + return + + source_is_file = len(paths1) == 1 + dest_is_dir = isinstance(path2, str) and ( + trailing_sep(path2) or self.isdir(path2) + ) + + exists = source_is_str and ( + (has_magic(path1) and source_is_file) + or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1)) + ) + paths2 = other_paths( + paths1, + path2, + exists=exists, + flatten=not source_is_str, + ) + + for p1, p2 in zip(paths1, paths2): + try: + self.cp_file(p1, p2, **kwargs) + except FileNotFoundError: + if on_error == "raise": + raise + + def expand_path(self, path, recursive=False, maxdepth=None, **kwargs): + """Turn one or more globs or directories into a list of all matching paths + to files or directories. 
+ + kwargs are passed to ``glob`` or ``find``, which may in turn call ``ls`` + """ + + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + if isinstance(path, (str, os.PathLike)): + out = self.expand_path([path], recursive, maxdepth, **kwargs) + else: + out = set() + path = [self._strip_protocol(p) for p in path] + for p in path: + if has_magic(p): + bit = set(self.glob(p, maxdepth=maxdepth, **kwargs)) + out |= bit + if recursive: + # glob call above expanded one depth so if maxdepth is defined + # then decrement it in expand_path call below. If it is zero + # after decrementing then avoid expand_path call. + if maxdepth is not None and maxdepth <= 1: + continue + out |= set( + self.expand_path( + list(bit), + recursive=recursive, + maxdepth=maxdepth - 1 if maxdepth is not None else None, + **kwargs, + ) + ) + continue + elif recursive: + rec = set( + self.find( + p, maxdepth=maxdepth, withdirs=True, detail=False, **kwargs + ) + ) + out |= rec + if p not in out and (recursive is False or self.exists(p)): + # should only check once, for the root + out.add(p) + if not out: + raise FileNotFoundError(path) + return sorted(out) + + def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs): + """Move file(s) from one location to another""" + if path1 == path2: + logger.debug("%s mv: The paths are the same, so no files were moved.", self) + else: + # explicitly raise exception to prevent data corruption + self.copy( + path1, path2, recursive=recursive, maxdepth=maxdepth, onerror="raise" + ) + self.rm(path1, recursive=recursive) + + def rm_file(self, path): + """Delete a file""" + self._rm(path) + + def _rm(self, path): + """Delete one file""" + # this is the old name for the method, prefer rm_file + raise NotImplementedError + + def rm(self, path, recursive=False, maxdepth=None): + """Delete files. + + Parameters + ---------- + path: str or list of str + File(s) to delete. + recursive: bool + If file(s) are directories, recursively delete contents and then + also remove the directory + maxdepth: int or None + Depth to pass to walk for finding files to delete, if recursive. + If None, there will be no limit and infinite recursion may be + possible. + """ + path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth) + for p in reversed(path): + self.rm_file(p) + + @classmethod + def _parent(cls, path): + path = cls._strip_protocol(path) + if "/" in path: + parent = path.rsplit("/", 1)[0].lstrip(cls.root_marker) + return cls.root_marker + parent + else: + return cls.root_marker + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + """Return raw bytes-mode file-like from the file-system""" + return AbstractBufferedFile( + self, + path, + mode, + block_size, + autocommit, + cache_options=cache_options, + **kwargs, + ) + + def open( + self, + path, + mode="rb", + block_size=None, + cache_options=None, + compression=None, + **kwargs, + ): + """ + Return a file-like object from the filesystem + + The resultant instance must function correctly in a context ``with`` + block. + + Parameters + ---------- + path: str + Target file + mode: str like 'rb', 'w' + See builtin ``open()`` + Mode "x" (exclusive write) may be implemented by the backend. Even if + it is, whether it is checked up front or on commit, and whether it is + atomic is implementation-dependent. 
+ block_size: int + Some indication of buffering - this is a value in bytes + cache_options : dict, optional + Extra arguments to pass through to the cache. + compression: string or None + If given, open file using compression codec. Can either be a compression + name (a key in ``fsspec.compression.compr``) or "infer" to guess the + compression from the filename suffix. + encoding, errors, newline: passed on to TextIOWrapper for text mode + """ + import io + + path = self._strip_protocol(path) + if "b" not in mode: + mode = mode.replace("t", "") + "b" + + text_kwargs = { + k: kwargs.pop(k) + for k in ["encoding", "errors", "newline"] + if k in kwargs + } + return io.TextIOWrapper( + self.open( + path, + mode, + block_size=block_size, + cache_options=cache_options, + compression=compression, + **kwargs, + ), + **text_kwargs, + ) + else: + ac = kwargs.pop("autocommit", not self._intrans) + f = self._open( + path, + mode=mode, + block_size=block_size, + autocommit=ac, + cache_options=cache_options, + **kwargs, + ) + if compression is not None: + from fsspec.compression import compr + from fsspec.core import get_compression + + compression = get_compression(path, compression) + compress = compr[compression] + f = compress(f, mode=mode[0]) + + if not ac and "r" not in mode: + self.transaction.files.append(f) + return f + + def touch(self, path, truncate=True, **kwargs): + """Create empty file, or update timestamp + + Parameters + ---------- + path: str + file location + truncate: bool + If True, always set file size to 0; if False, update timestamp and + leave file unchanged, if backend allows this + """ + if truncate or not self.exists(path): + with self.open(path, "wb", **kwargs): + pass + else: + raise NotImplementedError # update timestamp, if possible + + def ukey(self, path): + """Hash of file properties, to tell if it has changed""" + return sha256(str(self.info(path)).encode()).hexdigest() + + def read_block(self, fn, offset, length, delimiter=None): + """Read a block of bytes from + + Starting at ``offset`` of the file, read ``length`` bytes. If + ``delimiter`` is set then we ensure that the read starts and stops at + delimiter boundaries that follow the locations ``offset`` and ``offset + + length``. If ``offset`` is zero then we start at zero. The + bytestring returned WILL include the end delimiter string. + + If offset+length is beyond the eof, reads to eof. + + Parameters + ---------- + fn: string + Path to filename + offset: int + Byte offset to start read + length: int + Number of bytes to read. If None, read to end. + delimiter: bytes (optional) + Ensure reading starts and stops at delimiter bytestring + + Examples + -------- + >>> fs.read_block('data/file.csv', 0, 13) # doctest: +SKIP + b'Alice, 100\\nBo' + >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n') # doctest: +SKIP + b'Alice, 100\\nBob, 200\\n' + + Use ``length=None`` to read to the end of the file. + >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n') # doctest: +SKIP + b'Alice, 100\\nBob, 200\\nCharlie, 300' + + See Also + -------- + :func:`fsspec.utils.read_block` + """ + with self.open(fn, "rb") as f: + size = f.size + if length is None: + length = size + if size is not None and offset + length > size: + length = size - offset + return read_block(f, offset, length, delimiter) + + def to_json(self, *, include_password: bool = True) -> str: + """ + JSON representation of this filesystem instance. 
+ + Parameters + ---------- + include_password: bool, default True + Whether to include the password (if any) in the output. + + Returns + ------- + JSON string with keys ``cls`` (the python location of this class), + protocol (text name of this class's protocol, first one in case of + multiple), ``args`` (positional args, usually empty), and all other + keyword arguments as their own keys. + + Warnings + -------- + Serialized filesystems may contain sensitive information which have been + passed to the constructor, such as passwords and tokens. Make sure you + store and send them in a secure environment! + """ + from .json import FilesystemJSONEncoder + + return json.dumps( + self, + cls=type( + "_FilesystemJSONEncoder", + (FilesystemJSONEncoder,), + {"include_password": include_password}, + ), + ) + + @staticmethod + def from_json(blob: str) -> AbstractFileSystem: + """ + Recreate a filesystem instance from JSON representation. + + See ``.to_json()`` for the expected structure of the input. + + Parameters + ---------- + blob: str + + Returns + ------- + file system instance, not necessarily of this particular class. + + Warnings + -------- + This can import arbitrary modules (as determined by the ``cls`` key). + Make sure you haven't installed any modules that may execute malicious code + at import time. + """ + from .json import FilesystemJSONDecoder + + return json.loads(blob, cls=FilesystemJSONDecoder) + + def to_dict(self, *, include_password: bool = True) -> dict[str, Any]: + """ + JSON-serializable dictionary representation of this filesystem instance. + + Parameters + ---------- + include_password: bool, default True + Whether to include the password (if any) in the output. + + Returns + ------- + Dictionary with keys ``cls`` (the python location of this class), + protocol (text name of this class's protocol, first one in case of + multiple), ``args`` (positional args, usually empty), and all other + keyword arguments as their own keys. + + Warnings + -------- + Serialized filesystems may contain sensitive information which have been + passed to the constructor, such as passwords and tokens. Make sure you + store and send them in a secure environment! + """ + from .json import FilesystemJSONEncoder + + json_encoder = FilesystemJSONEncoder() + + cls = type(self) + proto = self.protocol + + storage_options = dict(self.storage_options) + if not include_password: + storage_options.pop("password", None) + + return dict( + cls=f"{cls.__module__}:{cls.__name__}", + protocol=proto[0] if isinstance(proto, (tuple, list)) else proto, + args=json_encoder.make_serializable(self.storage_args), + **json_encoder.make_serializable(storage_options), + ) + + @staticmethod + def from_dict(dct: dict[str, Any]) -> AbstractFileSystem: + """ + Recreate a filesystem instance from dictionary representation. + + See ``.to_dict()`` for the expected structure of the input. + + Parameters + ---------- + dct: Dict[str, Any] + + Returns + ------- + file system instance, not necessarily of this particular class. + + Warnings + -------- + This can import arbitrary modules (as determined by the ``cls`` key). + Make sure you haven't installed any modules that may execute malicious code + at import time. 
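+
+        Examples
+        --------
+        Illustrative round trip through ``to_dict``:
+
+        >>> fs2 = AbstractFileSystem.from_dict(fs.to_dict())  # doctest: +SKIP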
+ """ + from .json import FilesystemJSONDecoder + + json_decoder = FilesystemJSONDecoder() + + dct = dict(dct) # Defensive copy + + cls = FilesystemJSONDecoder.try_resolve_fs_cls(dct) + if cls is None: + raise ValueError("Not a serialized AbstractFileSystem") + + dct.pop("cls", None) + dct.pop("protocol", None) + + return cls( + *json_decoder.unmake_serializable(dct.pop("args", ())), + **json_decoder.unmake_serializable(dct), + ) + + def _get_pyarrow_filesystem(self): + """ + Make a version of the FS instance which will be acceptable to pyarrow + """ + # all instances already also derive from pyarrow + return self + + def get_mapper(self, root="", check=False, create=False, missing_exceptions=None): + """Create key/value store based on this file-system + + Makes a MutableMapping interface to the FS at the given root path. + See ``fsspec.mapping.FSMap`` for further details. + """ + from .mapping import FSMap + + return FSMap( + root, + self, + check=check, + create=create, + missing_exceptions=missing_exceptions, + ) + + @classmethod + def clear_instance_cache(cls): + """ + Clear the cache of filesystem instances. + + Notes + ----- + Unless overridden by setting the ``cachable`` class attribute to False, + the filesystem class stores a reference to newly created instances. This + prevents Python's normal rules around garbage collection from working, + since the instances refcount will not drop to zero until + ``clear_instance_cache`` is called. + """ + cls._cache.clear() + + def created(self, path): + """Return the created timestamp of a file as a datetime.datetime""" + raise NotImplementedError + + def modified(self, path): + """Return the modified timestamp of a file as a datetime.datetime""" + raise NotImplementedError + + def tree( + self, + path: str = "/", + recursion_limit: int = 2, + max_display: int = 25, + display_size: bool = False, + prefix: str = "", + is_last: bool = True, + first: bool = True, + indent_size: int = 4, + ) -> str: + """ + Return a tree-like structure of the filesystem starting from the given path as a string. + + Parameters + ---------- + path: Root path to start traversal from + recursion_limit: Maximum depth of directory traversal + max_display: Maximum number of items to display per directory + display_size: Whether to display file sizes + prefix: Current line prefix for visual tree structure + is_last: Whether current item is last in its level + first: Whether this is the first call (displays root path) + indent_size: Number of spaces by indent + + Returns + ------- + str: A string representing the tree structure. 
+ + Example + ------- + >>> from fsspec import filesystem + + >>> fs = filesystem('ftp', host='test.rebex.net', user='demo', password='password') + >>> tree = fs.tree(display_size=True, recursion_limit=3, indent_size=8, max_display=10) + >>> print(tree) + """ + + def format_bytes(n: int) -> str: + """Format bytes as text.""" + for prefix, k in ( + ("P", 2**50), + ("T", 2**40), + ("G", 2**30), + ("M", 2**20), + ("k", 2**10), + ): + if n >= 0.9 * k: + return f"{n / k:.2f} {prefix}b" + return f"{n}B" + + result = [] + + if first: + result.append(path) + + if recursion_limit: + indent = " " * indent_size + contents = self.ls(path, detail=True) + contents.sort( + key=lambda x: (x.get("type") != "directory", x.get("name", "")) + ) + + if max_display is not None and len(contents) > max_display: + displayed_contents = contents[:max_display] + remaining_count = len(contents) - max_display + else: + displayed_contents = contents + remaining_count = 0 + + for i, item in enumerate(displayed_contents): + is_last_item = (i == len(displayed_contents) - 1) and ( + remaining_count == 0 + ) + + branch = ( + "└" + ("─" * (indent_size - 2)) + if is_last_item + else "├" + ("─" * (indent_size - 2)) + ) + branch += " " + new_prefix = prefix + ( + indent if is_last_item else "│" + " " * (indent_size - 1) + ) + + name = os.path.basename(item.get("name", "")) + + if display_size and item.get("type") == "directory": + sub_contents = self.ls(item.get("name", ""), detail=True) + num_files = sum( + 1 for sub_item in sub_contents if sub_item.get("type") == "file" + ) + num_folders = sum( + 1 + for sub_item in sub_contents + if sub_item.get("type") == "directory" + ) + + if num_files == 0 and num_folders == 0: + size = " (empty folder)" + elif num_files == 0: + size = f" ({num_folders} subfolder{'s' if num_folders > 1 else ''})" + elif num_folders == 0: + size = f" ({num_files} file{'s' if num_files > 1 else ''})" + else: + size = f" ({num_files} file{'s' if num_files > 1 else ''}, {num_folders} subfolder{'s' if num_folders > 1 else ''})" + elif display_size and item.get("type") == "file": + size = f" ({format_bytes(item.get('size', 0))})" + else: + size = "" + + result.append(f"{prefix}{branch}{name}{size}") + + if item.get("type") == "directory" and recursion_limit > 0: + result.append( + self.tree( + path=item.get("name", ""), + recursion_limit=recursion_limit - 1, + max_display=max_display, + display_size=display_size, + prefix=new_prefix, + is_last=is_last_item, + first=False, + indent_size=indent_size, + ) + ) + + if remaining_count > 0: + more_message = f"{remaining_count} more item(s) not displayed." 
+ result.append( + f"{prefix}{'└' + ('─' * (indent_size - 2))} {more_message}" + ) + + return "\n".join(_ for _ in result if _) + + # ------------------------------------------------------------------------ + # Aliases + + def read_bytes(self, path, start=None, end=None, **kwargs): + """Alias of `AbstractFileSystem.cat_file`.""" + return self.cat_file(path, start=start, end=end, **kwargs) + + def write_bytes(self, path, value, **kwargs): + """Alias of `AbstractFileSystem.pipe_file`.""" + self.pipe_file(path, value, **kwargs) + + def makedir(self, path, create_parents=True, **kwargs): + """Alias of `AbstractFileSystem.mkdir`.""" + return self.mkdir(path, create_parents=create_parents, **kwargs) + + def mkdirs(self, path, exist_ok=False): + """Alias of `AbstractFileSystem.makedirs`.""" + return self.makedirs(path, exist_ok=exist_ok) + + def listdir(self, path, detail=True, **kwargs): + """Alias of `AbstractFileSystem.ls`.""" + return self.ls(path, detail=detail, **kwargs) + + def cp(self, path1, path2, **kwargs): + """Alias of `AbstractFileSystem.copy`.""" + return self.copy(path1, path2, **kwargs) + + def move(self, path1, path2, **kwargs): + """Alias of `AbstractFileSystem.mv`.""" + return self.mv(path1, path2, **kwargs) + + def stat(self, path, **kwargs): + """Alias of `AbstractFileSystem.info`.""" + return self.info(path, **kwargs) + + def disk_usage(self, path, total=True, maxdepth=None, **kwargs): + """Alias of `AbstractFileSystem.du`.""" + return self.du(path, total=total, maxdepth=maxdepth, **kwargs) + + def rename(self, path1, path2, **kwargs): + """Alias of `AbstractFileSystem.mv`.""" + return self.mv(path1, path2, **kwargs) + + def delete(self, path, recursive=False, maxdepth=None): + """Alias of `AbstractFileSystem.rm`.""" + return self.rm(path, recursive=recursive, maxdepth=maxdepth) + + def upload(self, lpath, rpath, recursive=False, **kwargs): + """Alias of `AbstractFileSystem.put`.""" + return self.put(lpath, rpath, recursive=recursive, **kwargs) + + def download(self, rpath, lpath, recursive=False, **kwargs): + """Alias of `AbstractFileSystem.get`.""" + return self.get(rpath, lpath, recursive=recursive, **kwargs) + + def sign(self, path, expiration=100, **kwargs): + """Create a signed URL representing the given path + + Some implementations allow temporary URLs to be generated, as a + way of delegating credentials. + + Parameters + ---------- + path : str + The path on the filesystem + expiration : int + Number of seconds to enable the URL for (if supported) + + Returns + ------- + URL : str + The signed URL + + Raises + ------ + NotImplementedError : if method is not implemented for a filesystem + """ + raise NotImplementedError("Sign is not implemented for this filesystem") + + def _isfilestore(self): + # Originally inherited from pyarrow DaskFileSystem. Keeping this + # here for backwards compatibility as long as pyarrow uses its + # legacy fsspec-compatible filesystems and thus accepts fsspec + # filesystems as well + return False + + +class AbstractBufferedFile(io.IOBase): + """Convenient class to derive from to provide buffering + + In the case that the backend does not provide a pythonic file-like object + already, this class contains much of the logic to build one. The only + methods that need to be overridden are ``_upload_chunk``, + ``_initiate_upload`` and ``_fetch_range``. 
+ """ + + DEFAULT_BLOCK_SIZE = 5 * 2**20 + _details = None + + def __init__( + self, + fs, + path, + mode="rb", + block_size="default", + autocommit=True, + cache_type="readahead", + cache_options=None, + size=None, + **kwargs, + ): + """ + Template for files with buffered reading and writing + + Parameters + ---------- + fs: instance of FileSystem + path: str + location in file-system + mode: str + Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file + systems may be read-only, and some may not support append. + block_size: int + Buffer size for reading or writing, 'default' for class default + autocommit: bool + Whether to write to final destination; may only impact what + happens when file is being closed. + cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead" + Caching policy in read mode. See the definitions in ``core``. + cache_options : dict + Additional options passed to the constructor for the cache specified + by `cache_type`. + size: int + If given and in read mode, suppressed having to look up the file size + kwargs: + Gets stored as self.kwargs + """ + from .core import caches + + self.path = path + self.fs = fs + self.mode = mode + self.blocksize = ( + self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size + ) + self.loc = 0 + self.autocommit = autocommit + self.end = None + self.start = None + self.closed = False + + if cache_options is None: + cache_options = {} + + if "trim" in kwargs: + warnings.warn( + "Passing 'trim' to control the cache behavior has been deprecated. " + "Specify it within the 'cache_options' argument instead.", + FutureWarning, + ) + cache_options["trim"] = kwargs.pop("trim") + + self.kwargs = kwargs + + if mode not in {"ab", "rb", "wb", "xb"}: + raise NotImplementedError("File mode not supported") + if mode == "rb": + if size is not None: + self.size = size + else: + self.size = self.details["size"] + self.cache = caches[cache_type]( + self.blocksize, self._fetch_range, self.size, **cache_options + ) + else: + self.buffer = io.BytesIO() + self.offset = None + self.forced = False + self.location = None + + @property + def details(self): + if self._details is None: + self._details = self.fs.info(self.path) + return self._details + + @details.setter + def details(self, value): + self._details = value + self.size = value["size"] + + @property + def full_name(self): + return _unstrip_protocol(self.path, self.fs) + + @property + def closed(self): + # get around this attr being read-only in IOBase + # use getattr here, since this can be called during del + return getattr(self, "_closed", True) + + @closed.setter + def closed(self, c): + self._closed = c + + def __hash__(self): + if "w" in self.mode: + return id(self) + else: + return int(tokenize(self.details), 16) + + def __eq__(self, other): + """Files are equal if they have the same checksum, only in read mode""" + if self is other: + return True + return ( + isinstance(other, type(self)) + and self.mode == "rb" + and other.mode == "rb" + and hash(self) == hash(other) + ) + + def commit(self): + """Move from temp to final destination""" + + def discard(self): + """Throw away temporary file""" + + def info(self): + """File information about this path""" + if self.readable(): + return self.details + else: + raise ValueError("Info not available while writing") + + def tell(self): + """Current file location""" + return self.loc + + def seek(self, loc, whence=0): + """Set current file location + + Parameters + ---------- + loc: int + byte location + whence: {0, 
1, 2} + from start of file, current location or end of file, resp. + """ + loc = int(loc) + if not self.mode == "rb": + raise OSError(ESPIPE, "Seek only available in read mode") + if whence == 0: + nloc = loc + elif whence == 1: + nloc = self.loc + loc + elif whence == 2: + nloc = self.size + loc + else: + raise ValueError(f"invalid whence ({whence}, should be 0, 1 or 2)") + if nloc < 0: + raise ValueError("Seek before start of file") + self.loc = nloc + return self.loc + + def write(self, data): + """ + Write data to buffer. + + Buffer only sent on flush() or if buffer is greater than + or equal to blocksize. + + Parameters + ---------- + data: bytes + Set of bytes to be written. + """ + if not self.writable(): + raise ValueError("File not in write mode") + if self.closed: + raise ValueError("I/O operation on closed file.") + if self.forced: + raise ValueError("This file has been force-flushed, can only close") + out = self.buffer.write(data) + self.loc += out + if self.buffer.tell() >= self.blocksize: + self.flush() + return out + + def flush(self, force=False): + """ + Write buffered data to backend store. + + Writes the current buffer, if it is larger than the block-size, or if + the file is being closed. + + Parameters + ---------- + force: bool + When closing, write the last block even if it is smaller than + blocks are allowed to be. Disallows further writing to this file. + """ + + if self.closed: + raise ValueError("Flush on closed file") + if force and self.forced: + raise ValueError("Force flush cannot be called more than once") + if force: + self.forced = True + + if self.readable(): + # no-op to flush on read-mode + return + + if not force and self.buffer.tell() < self.blocksize: + # Defer write on small block + return + + if self.offset is None: + # Initialize a multipart upload + self.offset = 0 + try: + self._initiate_upload() + except: + self.closed = True + raise + + if self._upload_chunk(final=force) is not False: + self.offset += self.buffer.seek(0, 2) + self.buffer = io.BytesIO() + + def _upload_chunk(self, final=False): + """Write one part of a multi-block file upload + + Parameters + ========== + final: bool + This is the last block, so should complete file, if + self.autocommit is True. + """ + # may not yet have been initialized, may need to call _initialize_upload + + def _initiate_upload(self): + """Create remote file/upload""" + pass + + def _fetch_range(self, start, end): + """Get the specified set of bytes from remote""" + return self.fs.cat_file(self.path, start=start, end=end) + + def read(self, length=-1): + """ + Return data from cache, or fetch pieces as necessary + + Parameters + ---------- + length: int (-1) + Number of bytes to read; if <0, all remaining bytes. 
+ """ + length = -1 if length is None else int(length) + if self.mode != "rb": + raise ValueError("File not in read mode") + if length < 0: + length = self.size - self.loc + if self.closed: + raise ValueError("I/O operation on closed file.") + if length == 0: + # don't even bother calling fetch + return b"" + out = self.cache._fetch(self.loc, self.loc + length) + + logger.debug( + "%s read: %i - %i %s", + self, + self.loc, + self.loc + length, + self.cache._log_stats(), + ) + self.loc += len(out) + return out + + def readinto(self, b): + """mirrors builtin file's readinto method + + https://docs.python.org/3/library/io.html#io.RawIOBase.readinto + """ + out = memoryview(b).cast("B") + data = self.read(out.nbytes) + out[: len(data)] = data + return len(data) + + def readuntil(self, char=b"\n", blocks=None): + """Return data between current position and first occurrence of char + + char is included in the output, except if the end of the tile is + encountered first. + + Parameters + ---------- + char: bytes + Thing to find + blocks: None or int + How much to read in each go. Defaults to file blocksize - which may + mean a new read on every call. + """ + out = [] + while True: + start = self.tell() + part = self.read(blocks or self.blocksize) + if len(part) == 0: + break + found = part.find(char) + if found > -1: + out.append(part[: found + len(char)]) + self.seek(start + found + len(char)) + break + out.append(part) + return b"".join(out) + + def readline(self): + """Read until and including the first occurrence of newline character + + Note that, because of character encoding, this is not necessarily a + true line ending. + """ + return self.readuntil(b"\n") + + def __next__(self): + out = self.readline() + if out: + return out + raise StopIteration + + def __iter__(self): + return self + + def readlines(self): + """Return all data, split by the newline character, including the newline character""" + data = self.read() + lines = data.split(b"\n") + out = [l + b"\n" for l in lines[:-1]] + if data.endswith(b"\n"): + return out + else: + return out + [lines[-1]] + # return list(self) ??? 
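A minimal sketch of the read-side helpers above, using the in-memory backend bundled with fsspec (any backend whose ``open()`` hands back a buffered file-like object behaves the same way; the path ``/demo.csv`` is purely illustrative):

    import fsspec

    fs = fsspec.filesystem("memory")
    with fs.open("/demo.csv", "wb") as f:
        f.write(b"Alice, 100\nBob, 200\nCharlie, 300")

    with fs.open("/demo.csv", "rb") as f:
        first = f.readline()   # b"Alice, 100\n" -- up to and including the first newline
        f.seek(0)              # allowed; files opened for reading are seekable
        block = f.read(20)     # the first 20 bytes, b"Alice, 100\nBob, 200\n"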
+ + def readinto1(self, b): + return self.readinto(b) + + def close(self): + """Close file + + Finalizes writes, discards cache + """ + if getattr(self, "_unclosable", False): + return + if self.closed: + return + try: + if self.mode == "rb": + self.cache = None + else: + if not self.forced: + self.flush(force=True) + + if self.fs is not None: + self.fs.invalidate_cache(self.path) + self.fs.invalidate_cache(self.fs._parent(self.path)) + finally: + self.closed = True + + def readable(self): + """Whether opened for reading""" + return "r" in self.mode and not self.closed + + def seekable(self): + """Whether is seekable (only in read mode)""" + return self.readable() + + def writable(self): + """Whether opened for writing""" + return self.mode in {"wb", "ab", "xb"} and not self.closed + + def __reduce__(self): + if self.mode != "rb": + raise RuntimeError("Pickling a writeable file is not supported") + + return reopen, ( + self.fs, + self.path, + self.mode, + self.blocksize, + self.loc, + self.size, + self.autocommit, + self.cache.name if self.cache else "none", + self.kwargs, + ) + + def __del__(self): + if not self.closed: + self.close() + + def __str__(self): + return f"" + + __repr__ = __str__ + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + +def reopen(fs, path, mode, blocksize, loc, size, autocommit, cache_type, kwargs): + file = fs.open( + path, + mode=mode, + block_size=blocksize, + autocommit=autocommit, + cache_type=cache_type, + size=size, + **kwargs, + ) + if loc > 0: + file.seek(loc) + return file diff --git a/env/lib/python3.13/site-packages/fsspec/transaction.py b/env/lib/python3.13/site-packages/fsspec/transaction.py new file mode 100644 index 0000000000000000000000000000000000000000..77293f63ecc5f611e19d849ef236d53e9c258efc --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/transaction.py @@ -0,0 +1,90 @@ +from collections import deque + + +class Transaction: + """Filesystem transaction write context + + Gathers files for deferred commit or discard, so that several write + operations can be finalized semi-atomically. 
This works by having this + instance as the ``.transaction`` attribute of the given filesystem + """ + + def __init__(self, fs, **kwargs): + """ + Parameters + ---------- + fs: FileSystem instance + """ + self.fs = fs + self.files = deque() + + def __enter__(self): + self.start() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """End transaction and commit, if exit is not due to exception""" + # only commit if there was no exception + self.complete(commit=exc_type is None) + if self.fs: + self.fs._intrans = False + self.fs._transaction = None + self.fs = None + + def start(self): + """Start a transaction on this FileSystem""" + self.files = deque() # clean up after previous failed completions + self.fs._intrans = True + + def complete(self, commit=True): + """Finish transaction: commit or discard all deferred files""" + while self.files: + f = self.files.popleft() + if commit: + f.commit() + else: + f.discard() + self.fs._intrans = False + self.fs._transaction = None + self.fs = None + + +class FileActor: + def __init__(self): + self.files = [] + + def commit(self): + for f in self.files: + f.commit() + self.files.clear() + + def discard(self): + for f in self.files: + f.discard() + self.files.clear() + + def append(self, f): + self.files.append(f) + + +class DaskTransaction(Transaction): + def __init__(self, fs): + """ + Parameters + ---------- + fs: FileSystem instance + """ + import distributed + + super().__init__(fs) + client = distributed.default_client() + self.files = client.submit(FileActor, actor=True).result() + + def complete(self, commit=True): + """Finish transaction: commit or discard all deferred files""" + if commit: + self.files.commit().result() + else: + self.files.discard().result() + self.fs._intrans = False + self.fs = None diff --git a/env/lib/python3.13/site-packages/fsspec/utils.py b/env/lib/python3.13/site-packages/fsspec/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1e5e5e70bc5a5887f3a44674c69345283cc4372b --- /dev/null +++ b/env/lib/python3.13/site-packages/fsspec/utils.py @@ -0,0 +1,738 @@ +from __future__ import annotations + +import contextlib +import logging +import math +import os +import re +import sys +import tempfile +from collections.abc import Callable, Iterable, Iterator, Sequence +from functools import partial +from hashlib import md5 +from importlib.metadata import version +from typing import IO, TYPE_CHECKING, Any, TypeVar +from urllib.parse import urlsplit + +if TYPE_CHECKING: + import pathlib + from typing import TypeGuard + + from fsspec.spec import AbstractFileSystem + + +DEFAULT_BLOCK_SIZE = 5 * 2**20 + +T = TypeVar("T") + + +def infer_storage_options( + urlpath: str, inherit_storage_options: dict[str, Any] | None = None +) -> dict[str, Any]: + """Infer storage options from URL path and merge it with existing storage + options. + + Parameters + ---------- + urlpath: str or unicode + Either local absolute file path or URL (hdfs://namenode:8020/file.csv) + inherit_storage_options: dict (optional) + Its contents will get merged with the inferred information from the + given path + + Returns + ------- + Storage options dict. + + Examples + -------- + >>> infer_storage_options('/mnt/datasets/test.csv') # doctest: +SKIP + {"protocol": "file", "path", "/mnt/datasets/test.csv"} + >>> infer_storage_options( + ... 'hdfs://username:pwd@node:123/mnt/datasets/test.csv?q=1', + ... inherit_storage_options={'extra': 'value'}, + ... 
) # doctest: +SKIP + {"protocol": "hdfs", "username": "username", "password": "pwd", + "host": "node", "port": 123, "path": "/mnt/datasets/test.csv", + "url_query": "q=1", "extra": "value"} + """ + # Handle Windows paths including disk name in this special case + if ( + re.match(r"^[a-zA-Z]:[\\/]", urlpath) + or re.match(r"^[a-zA-Z0-9]+://", urlpath) is None + ): + return {"protocol": "file", "path": urlpath} + + parsed_path = urlsplit(urlpath) + protocol = parsed_path.scheme or "file" + if parsed_path.fragment: + path = "#".join([parsed_path.path, parsed_path.fragment]) + else: + path = parsed_path.path + if protocol == "file": + # Special case parsing file protocol URL on Windows according to: + # https://msdn.microsoft.com/en-us/library/jj710207.aspx + windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path) + if windows_path: + drive, path = windows_path.groups() + path = f"{drive}:{path}" + + if protocol in ["http", "https"]: + # for HTTP, we don't want to parse, as requests will anyway + return {"protocol": protocol, "path": urlpath} + + options: dict[str, Any] = {"protocol": protocol, "path": path} + + if parsed_path.netloc: + # Parse `hostname` from netloc manually because `parsed_path.hostname` + # lowercases the hostname which is not always desirable (e.g. in S3): + # https://github.com/dask/dask/issues/1417 + options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0] + + if protocol in ("s3", "s3a", "gcs", "gs"): + options["path"] = options["host"] + options["path"] + else: + options["host"] = options["host"] + if parsed_path.port: + options["port"] = parsed_path.port + if parsed_path.username: + options["username"] = parsed_path.username + if parsed_path.password: + options["password"] = parsed_path.password + + if parsed_path.query: + options["url_query"] = parsed_path.query + if parsed_path.fragment: + options["url_fragment"] = parsed_path.fragment + + if inherit_storage_options: + update_storage_options(options, inherit_storage_options) + + return options + + +def update_storage_options( + options: dict[str, Any], inherited: dict[str, Any] | None = None +) -> None: + if not inherited: + inherited = {} + collisions = set(options) & set(inherited) + if collisions: + for collision in collisions: + if options.get(collision) != inherited.get(collision): + raise KeyError( + f"Collision between inferred and specified storage " + f"option:\n{collision}" + ) + options.update(inherited) + + +# Compression extensions registered via fsspec.compression.register_compression +compressions: dict[str, str] = {} + + +def infer_compression(filename: str) -> str | None: + """Infer compression, if available, from filename. + + Infer a named compression type, if registered and available, from filename + extension. This includes builtin (gz, bz2, zip) compressions, as well as + optional compressions. See fsspec.compression.register_compression. 
+ """ + extension = os.path.splitext(filename)[-1].strip(".").lower() + if extension in compressions: + return compressions[extension] + return None + + +def build_name_function(max_int: float) -> Callable[[int], str]: + """Returns a function that receives a single integer + and returns it as a string padded by enough zero characters + to align with maximum possible integer + + >>> name_f = build_name_function(57) + + >>> name_f(7) + '07' + >>> name_f(31) + '31' + >>> build_name_function(1000)(42) + '0042' + >>> build_name_function(999)(42) + '042' + >>> build_name_function(0)(0) + '0' + """ + # handle corner cases max_int is 0 or exact power of 10 + max_int += 1e-8 + + pad_length = int(math.ceil(math.log10(max_int))) + + def name_function(i: int) -> str: + return str(i).zfill(pad_length) + + return name_function + + +def seek_delimiter(file: IO[bytes], delimiter: bytes, blocksize: int) -> bool: + r"""Seek current file to file start, file end, or byte after delimiter seq. + + Seeks file to next chunk delimiter, where chunks are defined on file start, + a delimiting sequence, and file end. Use file.tell() to see location afterwards. + Note that file start is a valid split, so must be at offset > 0 to seek for + delimiter. + + Parameters + ---------- + file: a file + delimiter: bytes + a delimiter like ``b'\n'`` or message sentinel, matching file .read() type + blocksize: int + Number of bytes to read from the file at once. + + + Returns + ------- + Returns True if a delimiter was found, False if at file start or end. + + """ + + if file.tell() == 0: + # beginning-of-file, return without seek + return False + + # Interface is for binary IO, with delimiter as bytes, but initialize last + # with result of file.read to preserve compatibility with text IO. + last: bytes | None = None + while True: + current = file.read(blocksize) + if not current: + # end-of-file without delimiter + return False + full = last + current if last else current + try: + if delimiter in full: + i = full.index(delimiter) + file.seek(file.tell() - (len(full) - i) + len(delimiter)) + return True + elif len(current) < blocksize: + # end-of-file without delimiter + return False + except (OSError, ValueError): + pass + last = full[-len(delimiter) :] + + +def read_block( + f: IO[bytes], + offset: int, + length: int | None, + delimiter: bytes | None = None, + split_before: bool = False, +) -> bytes: + """Read a block of bytes from a file + + Parameters + ---------- + f: File + Open file + offset: int + Byte offset to start read + length: int + Number of bytes to read, read through end of file if None + delimiter: bytes (optional) + Ensure reading starts and stops at delimiter bytestring + split_before: bool (optional) + Start/stop read *before* delimiter bytestring. + + + If using the ``delimiter=`` keyword argument we ensure that the read + starts and stops at delimiter boundaries that follow the locations + ``offset`` and ``offset + length``. If ``offset`` is zero then we + start at zero, regardless of delimiter. The bytestring returned WILL + include the terminating delimiter string. 
+ + Examples + -------- + + >>> from io import BytesIO # doctest: +SKIP + >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300') # doctest: +SKIP + >>> read_block(f, 0, 13) # doctest: +SKIP + b'Alice, 100\\nBo' + + >>> read_block(f, 0, 13, delimiter=b'\\n') # doctest: +SKIP + b'Alice, 100\\nBob, 200\\n' + + >>> read_block(f, 10, 10, delimiter=b'\\n') # doctest: +SKIP + b'Bob, 200\\nCharlie, 300' + """ + if delimiter: + f.seek(offset) + found_start_delim = seek_delimiter(f, delimiter, 2**16) + if length is None: + return f.read() + start = f.tell() + length -= start - offset + + f.seek(start + length) + found_end_delim = seek_delimiter(f, delimiter, 2**16) + end = f.tell() + + # Adjust split location to before delimiter if seek found the + # delimiter sequence, not start or end of file. + if found_start_delim and split_before: + start -= len(delimiter) + + if found_end_delim and split_before: + end -= len(delimiter) + + offset = start + length = end - start + + f.seek(offset) + + # TODO: allow length to be None and read to the end of the file? + assert length is not None + b = f.read(length) + return b + + +def tokenize(*args: Any, **kwargs: Any) -> str: + """Deterministic token + + (modified from dask.base) + + >>> tokenize([1, 2, '3']) + '9d71491b50023b06fc76928e6eddb952' + + >>> tokenize('Hello') == tokenize('Hello') + True + """ + if kwargs: + args += (kwargs,) + try: + h = md5(str(args).encode()) + except ValueError: + # FIPS systems: https://github.com/fsspec/filesystem_spec/issues/380 + h = md5(str(args).encode(), usedforsecurity=False) + return h.hexdigest() + + +def stringify_path(filepath: str | os.PathLike[str] | pathlib.Path) -> str: + """Attempt to convert a path-like object to a string. + + Parameters + ---------- + filepath: object to be converted + + Returns + ------- + filepath_str: maybe a string version of the object + + Notes + ----- + Objects supporting the fspath protocol are coerced according to its + __fspath__ method. + + For backwards compatibility with older Python version, pathlib.Path + objects are specially coerced. + + Any other object is passed through unchanged, which includes bytes, + strings, buffers, or anything else that's not even path-like. + """ + if isinstance(filepath, str): + return filepath + elif hasattr(filepath, "__fspath__"): + return filepath.__fspath__() + elif hasattr(filepath, "path"): + return filepath.path + else: + return filepath # type: ignore[return-value] + + +def make_instance( + cls: Callable[..., T], args: Sequence[Any], kwargs: dict[str, Any] +) -> T: + inst = cls(*args, **kwargs) + inst._determine_worker() # type: ignore[attr-defined] + return inst + + +def common_prefix(paths: Iterable[str]) -> str: + """For a list of paths, find the shortest prefix common to all""" + parts = [p.split("/") for p in paths] + lmax = min(len(p) for p in parts) + end = 0 + for i in range(lmax): + end = all(p[i] == parts[0][i] for p in parts) + if not end: + break + i += end + return "/".join(parts[0][:i]) + + +def other_paths( + paths: list[str], + path2: str | list[str], + exists: bool = False, + flatten: bool = False, +) -> list[str]: + """In bulk file operations, construct a new file tree from a list of files + + Parameters + ---------- + paths: list of str + The input file tree + path2: str or list of str + Root to construct the new list in. If this is already a list of str, we just + assert it has the right number of elements. 
+ exists: bool (optional) + For a str destination, it is already exists (and is a dir), files should + end up inside. + flatten: bool (optional) + Whether to flatten the input directory tree structure so that the output files + are in the same directory. + + Returns + ------- + list of str + """ + + if isinstance(path2, str): + path2 = path2.rstrip("/") + + if flatten: + path2 = ["/".join((path2, p.split("/")[-1])) for p in paths] + else: + cp = common_prefix(paths) + if exists: + cp = cp.rsplit("/", 1)[0] + if not cp and all(not s.startswith("/") for s in paths): + path2 = ["/".join([path2, p]) for p in paths] + else: + path2 = [p.replace(cp, path2, 1) for p in paths] + else: + assert len(paths) == len(path2) + return path2 + + +def is_exception(obj: Any) -> bool: + return isinstance(obj, BaseException) + + +def isfilelike(f: Any) -> TypeGuard[IO[bytes]]: + return all(hasattr(f, attr) for attr in ["read", "close", "tell"]) + + +def get_protocol(url: str) -> str: + url = stringify_path(url) + parts = re.split(r"(\:\:|\://)", url, maxsplit=1) + if len(parts) > 1: + return parts[0] + return "file" + + +def get_file_extension(url: str) -> str: + url = stringify_path(url) + ext_parts = url.rsplit(".", 1) + if len(ext_parts) > 1: + return ext_parts[-1] + return "" + + +def can_be_local(path: str) -> bool: + """Can the given URL be used with open_local?""" + from fsspec import get_filesystem_class + + try: + return getattr(get_filesystem_class(get_protocol(path)), "local_file", False) + except (ValueError, ImportError): + # not in registry or import failed + return False + + +def get_package_version_without_import(name: str) -> str | None: + """For given package name, try to find the version without importing it + + Import and package.__version__ is still the backup here, so an import + *might* happen. + + Returns either the version string, or None if the package + or the version was not readily found. 
+ """ + if name in sys.modules: + mod = sys.modules[name] + if hasattr(mod, "__version__"): + return mod.__version__ + try: + return version(name) + except: # noqa: E722 + pass + try: + import importlib + + mod = importlib.import_module(name) + return mod.__version__ + except (ImportError, AttributeError): + return None + + +def setup_logging( + logger: logging.Logger | None = None, + logger_name: str | None = None, + level: str = "DEBUG", + clear: bool = True, +) -> logging.Logger: + if logger is None and logger_name is None: + raise ValueError("Provide either logger object or logger name") + logger = logger or logging.getLogger(logger_name) + handle = logging.StreamHandler() + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s -- %(message)s" + ) + handle.setFormatter(formatter) + if clear: + logger.handlers.clear() + logger.addHandler(handle) + logger.setLevel(level) + return logger + + +def _unstrip_protocol(name: str, fs: AbstractFileSystem) -> str: + return fs.unstrip_protocol(name) + + +def mirror_from( + origin_name: str, methods: Iterable[str] +) -> Callable[[type[T]], type[T]]: + """Mirror attributes and methods from the given + origin_name attribute of the instance to the + decorated class""" + + def origin_getter(method: str, self: Any) -> Any: + origin = getattr(self, origin_name) + return getattr(origin, method) + + def wrapper(cls: type[T]) -> type[T]: + for method in methods: + wrapped_method = partial(origin_getter, method) + setattr(cls, method, property(wrapped_method)) + return cls + + return wrapper + + +@contextlib.contextmanager +def nullcontext(obj: T) -> Iterator[T]: + yield obj + + +def merge_offset_ranges( + paths: list[str], + starts: list[int] | int, + ends: list[int] | int, + max_gap: int = 0, + max_block: int | None = None, + sort: bool = True, +) -> tuple[list[str], list[int], list[int]]: + """Merge adjacent byte-offset ranges when the inter-range + gap is <= `max_gap`, and when the merged byte range does not + exceed `max_block` (if specified). By default, this function + will re-order the input paths and byte ranges to ensure sorted + order. If the user can guarantee that the inputs are already + sorted, passing `sort=False` will skip the re-ordering. + """ + # Check input + if not isinstance(paths, list): + raise TypeError + if not isinstance(starts, list): + starts = [starts] * len(paths) + if not isinstance(ends, list): + ends = [ends] * len(paths) + if len(starts) != len(paths) or len(ends) != len(paths): + raise ValueError + + # Early Return + if len(starts) <= 1: + return paths, starts, ends + + starts = [s or 0 for s in starts] + # Sort by paths and then ranges if `sort=True` + if sort: + paths, starts, ends = ( + list(v) + for v in zip( + *sorted( + zip(paths, starts, ends), + ) + ) + ) + + if paths: + # Loop through the coupled `paths`, `starts`, and + # `ends`, and merge adjacent blocks when appropriate + new_paths = paths[:1] + new_starts = starts[:1] + new_ends = ends[:1] + for i in range(1, len(paths)): + if paths[i] == paths[i - 1] and new_ends[-1] is None: + continue + elif ( + paths[i] != paths[i - 1] + or ((starts[i] - new_ends[-1]) > max_gap) + or (max_block is not None and (ends[i] - new_starts[-1]) > max_block) + ): + # Cannot merge with previous block. 
+ # Add new `paths`, `starts`, and `ends` elements + new_paths.append(paths[i]) + new_starts.append(starts[i]) + new_ends.append(ends[i]) + else: + # Merge with previous block by updating the + # last element of `ends` + new_ends[-1] = ends[i] + return new_paths, new_starts, new_ends + + # `paths` is empty. Just return input lists + return paths, starts, ends + + +def file_size(filelike: IO[bytes]) -> int: + """Find length of any open read-mode file-like""" + pos = filelike.tell() + try: + return filelike.seek(0, 2) + finally: + filelike.seek(pos) + + +@contextlib.contextmanager +def atomic_write(path: str, mode: str = "wb"): + """ + A context manager that opens a temporary file next to `path` and, on exit, + replaces `path` with the temporary file, thereby updating `path` + atomically. + """ + fd, fn = tempfile.mkstemp( + dir=os.path.dirname(path), prefix=os.path.basename(path) + "-" + ) + try: + with open(fd, mode) as fp: + yield fp + except BaseException: + with contextlib.suppress(FileNotFoundError): + os.unlink(fn) + raise + else: + os.replace(fn, path) + + +def _translate(pat, STAR, QUESTION_MARK): + # Copied from: https://github.com/python/cpython/pull/106703. + res: list[str] = [] + add = res.append + i, n = 0, len(pat) + while i < n: + c = pat[i] + i = i + 1 + if c == "*": + # compress consecutive `*` into one + if (not res) or res[-1] is not STAR: + add(STAR) + elif c == "?": + add(QUESTION_MARK) + elif c == "[": + j = i + if j < n and pat[j] == "!": + j = j + 1 + if j < n and pat[j] == "]": + j = j + 1 + while j < n and pat[j] != "]": + j = j + 1 + if j >= n: + add("\\[") + else: + stuff = pat[i:j] + if "-" not in stuff: + stuff = stuff.replace("\\", r"\\") + else: + chunks = [] + k = i + 2 if pat[i] == "!" else i + 1 + while True: + k = pat.find("-", k, j) + if k < 0: + break + chunks.append(pat[i:k]) + i = k + 1 + k = k + 3 + chunk = pat[i:j] + if chunk: + chunks.append(chunk) + else: + chunks[-1] += "-" + # Remove empty ranges -- invalid in RE. + for k in range(len(chunks) - 1, 0, -1): + if chunks[k - 1][-1] > chunks[k][0]: + chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:] + del chunks[k] + # Escape backslashes and hyphens for set difference (--). + # Hyphens that create ranges shouldn't be escaped. + stuff = "-".join( + s.replace("\\", r"\\").replace("-", r"\-") for s in chunks + ) + # Escape set operations (&&, ~~ and ||). + stuff = re.sub(r"([&~|])", r"\\\1", stuff) + i = j + 1 + if not stuff: + # Empty range: never match. + add("(?!)") + elif stuff == "!": + # Negated empty range: match any character. + add(".") + else: + if stuff[0] == "!": + stuff = "^" + stuff[1:] + elif stuff[0] in ("^", "["): + stuff = "\\" + stuff + add(f"[{stuff}]") + else: + add(re.escape(c)) + assert i == n + return res + + +def glob_translate(pat): + # Copied from: https://github.com/python/cpython/pull/106703. + # The keyword parameters' values are fixed to: + # recursive=True, include_hidden=True, seps=None + """Translate a pathname with shell wildcards to a regular expression.""" + if os.path.altsep: + seps = os.path.sep + os.path.altsep + else: + seps = os.path.sep + escaped_seps = "".join(map(re.escape, seps)) + any_sep = f"[{escaped_seps}]" if len(seps) > 1 else escaped_seps + not_sep = f"[^{escaped_seps}]" + one_last_segment = f"{not_sep}+" + one_segment = f"{one_last_segment}{any_sep}" + any_segments = f"(?:.+{any_sep})?" 
+ any_last_segments = ".*" + results = [] + parts = re.split(any_sep, pat) + last_part_idx = len(parts) - 1 + for idx, part in enumerate(parts): + if part == "*": + results.append(one_segment if idx < last_part_idx else one_last_segment) + continue + if part == "**": + results.append(any_segments if idx < last_part_idx else any_last_segments) + continue + elif "**" in part: + raise ValueError( + "Invalid pattern: '**' can only be an entire path component" + ) + if part: + results.extend(_translate(part, f"{not_sep}*", not_sep)) + if idx < last_part_idx: + results.append(any_sep) + res = "".join(results) + return rf"(?s:{res})\Z" diff --git a/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/INSTALLER b/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/INSTALLER new file mode 100644 index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68 --- /dev/null +++ b/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git a/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/METADATA b/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/METADATA new file mode 100644 index 0000000000000000000000000000000000000000..0427686e9ed1fd7a85f5d0dd347d5987bd889196 --- /dev/null +++ b/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/METADATA @@ -0,0 +1,48 @@ +Metadata-Version: 2.1 +Name: git-lfs +Version: 1.6 +Summary: A lightweight Git Large File Storage fetcher +Home-page: https://github.com/liberapay/git-lfs-fetch.py +Author: Changaco +Author-email: changaco@changaco.oy.lc +License: CC0 +Keywords: git lfs +Platform: UNKNOWN +Description-Content-Type: text/x-rst + +A lightweight Git Large File Storage fetcher written in python. + +This module cannot fully replace the official git-lfs client, it only knows how +to download the files, cache them (the same way the official client does), and +place them in a checkout directory. Uploading files is not implemented at all. + +Installation +============ + + pip install git-lfs + +python-git-lfs is compatible with python 2 and 3. + +Usage +===== + +Basic: simply run ``python -m git_lfs`` in a normal Git repository. 
+ +Advanced:: + + python -m git_lfs [-h] [-v] [git_repo] [checkout_dir] + + positional arguments: + git_repo if it's bare you need to provide a checkout_dir + checkout_dir + + optional arguments: + -h, --help show this help message and exit + -v, --verbose + +License +======= + +`CC0 Public Domain Dedication `_ + + diff --git a/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/RECORD b/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/RECORD new file mode 100644 index 0000000000000000000000000000000000000000..1551a84f6a11725d8518cee2f8531317075084d9 --- /dev/null +++ b/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/RECORD @@ -0,0 +1,12 @@ +git_lfs-1.6.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 +git_lfs-1.6.dist-info/METADATA,sha256=R8JeXdM_tbNNzy23Sw992NKPOV3NXjkrUzCcGMqVT6Y,1174 +git_lfs-1.6.dist-info/RECORD,, +git_lfs-1.6.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +git_lfs-1.6.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110 +git_lfs-1.6.dist-info/top_level.txt,sha256=sfhnq_PjKyFts61t6Ts6ssAYa2uj0KaCnjWWpCc2ElI,8 +git_lfs/__init__.py,sha256=wxf1h0HCxlTYd0Ui_ieYgjVFLVCrl9aVpi-cYqta6kw,8688 +git_lfs/__main__.py,sha256=Imn6ZITlrbp9EyuR_zkN8BN8ZMWDHU6cusJ1_RG5k-0,437 +git_lfs/__pycache__/__init__.cpython-313.pyc,, +git_lfs/__pycache__/__main__.cpython-313.pyc,, +git_lfs/__pycache__/utils.cpython-313.pyc,, +git_lfs/utils.py,sha256=4b11S10mPHITmvKpAriLVHvcZyqYZk4oYORpvKawmM0,1216 diff --git a/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/REQUESTED b/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/REQUESTED new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/WHEEL b/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/WHEEL new file mode 100644 index 0000000000000000000000000000000000000000..ef99c6cf3283b50a273ac4c6d009a0aa85597070 --- /dev/null +++ b/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/WHEEL @@ -0,0 +1,6 @@ +Wheel-Version: 1.0 +Generator: bdist_wheel (0.34.2) +Root-Is-Purelib: true +Tag: py2-none-any +Tag: py3-none-any + diff --git a/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/top_level.txt b/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8a2b2e483b41e3c0623a6468e0c9da4c8349c70 --- /dev/null +++ b/env/lib/python3.13/site-packages/git_lfs-1.6.dist-info/top_level.txt @@ -0,0 +1 @@ +git_lfs diff --git a/env/lib/python3.13/site-packages/h11-0.16.0.dist-info/INSTALLER b/env/lib/python3.13/site-packages/h11-0.16.0.dist-info/INSTALLER new file mode 100644 index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68 --- /dev/null +++ b/env/lib/python3.13/site-packages/h11-0.16.0.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git a/env/lib/python3.13/site-packages/h11-0.16.0.dist-info/METADATA b/env/lib/python3.13/site-packages/h11-0.16.0.dist-info/METADATA new file mode 100644 index 0000000000000000000000000000000000000000..8a2f639061cc4a203f7109d8335d28076442c61d --- /dev/null +++ b/env/lib/python3.13/site-packages/h11-0.16.0.dist-info/METADATA @@ -0,0 +1,202 @@ +Metadata-Version: 2.4 +Name: h11 +Version: 0.16.0 +Summary: A pure-Python, bring-your-own-I/O implementation of HTTP/1.1 +Home-page: https://github.com/python-hyper/h11 +Author: Nathaniel J. 
Smith +Author-email: njs@pobox.com +License: MIT +Classifier: Development Status :: 3 - Alpha +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: MIT License +Classifier: Programming Language :: Python :: Implementation :: CPython +Classifier: Programming Language :: Python :: Implementation :: PyPy +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3 :: Only +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 +Classifier: Topic :: Internet :: WWW/HTTP +Classifier: Topic :: System :: Networking +Requires-Python: >=3.8 +License-File: LICENSE.txt +Dynamic: author +Dynamic: author-email +Dynamic: classifier +Dynamic: description +Dynamic: home-page +Dynamic: license +Dynamic: license-file +Dynamic: requires-python +Dynamic: summary + +h11 +=== + +.. image:: https://travis-ci.org/python-hyper/h11.svg?branch=master + :target: https://travis-ci.org/python-hyper/h11 + :alt: Automated test status + +.. image:: https://codecov.io/gh/python-hyper/h11/branch/master/graph/badge.svg + :target: https://codecov.io/gh/python-hyper/h11 + :alt: Test coverage + +.. image:: https://readthedocs.org/projects/h11/badge/?version=latest + :target: http://h11.readthedocs.io/en/latest/?badge=latest + :alt: Documentation Status + +This is a little HTTP/1.1 library written from scratch in Python, +heavily inspired by `hyper-h2 `_. + +It's a "bring-your-own-I/O" library; h11 contains no IO code +whatsoever. This means you can hook h11 up to your favorite network +API, and that could be anything you want: synchronous, threaded, +asynchronous, or your own implementation of `RFC 6214 +`_ -- h11 won't judge you. +(Compare this to the current state of the art, where every time a `new +network API `_ comes along then someone +gets to start over reimplementing the entire HTTP protocol from +scratch.) Cory Benfield made an `excellent blog post describing the +benefits of this approach +`_, or if you like video +then here's his `PyCon 2016 talk on the same theme +`_. + +This also means that h11 is not immediately useful out of the box: +it's a toolkit for building programs that speak HTTP, not something +that could directly replace ``requests`` or ``twisted.web`` or +whatever. But h11 makes it much easier to implement something like +``requests`` or ``twisted.web``. + +At a high level, working with h11 goes like this: + +1) First, create an ``h11.Connection`` object to track the state of a + single HTTP/1.1 connection. + +2) When you read data off the network, pass it to + ``conn.receive_data(...)``; you'll get back a list of objects + representing high-level HTTP "events". + +3) When you want to send a high-level HTTP event, create the + corresponding "event" object and pass it to ``conn.send(...)``; + this will give you back some bytes that you can then push out + through the network. + +For example, a client might instantiate and then send a +``h11.Request`` object, then zero or more ``h11.Data`` objects for the +request body (e.g., if this is a POST), and then a +``h11.EndOfMessage`` to indicate the end of the message. Then the +server would then send back a ``h11.Response``, some ``h11.Data``, and +its own ``h11.EndOfMessage``. If either side violates the protocol, +you'll get a ``h11.ProtocolError`` exception. 
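For example, the sending half of that client exchange can be sketched in a few
lines; the host name below is a placeholder and the actual socket I/O is left to you:

.. code-block:: python

    import h11

    conn = h11.Connection(our_role=h11.CLIENT)

    # Build event objects, then ask the connection to turn them into wire bytes.
    request = h11.Request(
        method="GET",
        target="/",
        headers=[("Host", "example.com"), ("Connection", "close")],
    )
    to_send = conn.send(request)
    to_send += conn.send(h11.EndOfMessage())

    # `to_send` is plain bytes: write it with whatever I/O you like, then feed
    # whatever you read back into conn.receive_data(...) to get response events.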
+ +h11 is suitable for implementing both servers and clients, and has a +pleasantly symmetric API: the events you send as a client are exactly +the ones that you receive as a server and vice-versa. + +`Here's an example of a tiny HTTP client +`_ + +It also has `a fine manual `_. + +FAQ +--- + +*Whyyyyy?* + +I wanted to play with HTTP in `Curio +`__ and `Trio +`__, which at the time didn't have any +HTTP libraries. So I thought, no big deal, Python has, like, a dozen +different implementations of HTTP, surely I can find one that's +reusable. I didn't find one, but I did find Cory's call-to-arms +blog-post. So I figured, well, fine, if I have to implement HTTP from +scratch, at least I can make sure no-one *else* has to ever again. + +*Should I use it?* + +Maybe. You should be aware that it's a very young project. But, it's +feature complete and has an exhaustive test-suite and complete docs, +so the next step is for people to try using it and see how it goes +:-). If you do then please let us know -- if nothing else we'll want +to talk to you before making any incompatible changes! + +*What are the features/limitations?* + +Roughly speaking, it's trying to be a robust, complete, and non-hacky +implementation of the first "chapter" of the HTTP/1.1 spec: `RFC 7230: +HTTP/1.1 Message Syntax and Routing +`_. That is, it mostly focuses on +implementing HTTP at the level of taking bytes on and off the wire, +and the headers related to that, and tries to be anal about spec +conformance. It doesn't know about higher-level concerns like URL +routing, conditional GETs, cross-origin cookie policies, or content +negotiation. But it does know how to take care of framing, +cross-version differences in keep-alive handling, and the "obsolete +line folding" rule, so you can focus your energies on the hard / +interesting parts for your application, and it tries to support the +full specification in the sense that any useful HTTP/1.1 conformant +application should be able to use h11. + +It's pure Python, and has no dependencies outside of the standard +library. + +It has a test suite with 100.0% coverage for both statements and +branches. + +Currently it supports Python 3 (testing on 3.8-3.12) and PyPy 3. +The last Python 2-compatible version was h11 0.11.x. +(Originally it had a Cython wrapper for `http-parser +`_ and a beautiful nested state +machine implemented with ``yield from`` to postprocess the output. But +I had to take these out -- the new *parser* needs fewer lines-of-code +than the old *parser wrapper*, is written in pure Python, uses no +exotic language syntax, and has more features. It's sad, really; that +old state machine was really slick. I just need a few sentences here +to mourn that.) + +I don't know how fast it is. I haven't benchmarked or profiled it yet, +so it's probably got a few pointless hot spots, and I've been trying +to err on the side of simplicity and robustness instead of +micro-optimization. But at the architectural level I tried hard to +avoid fundamentally bad decisions, e.g., I believe that all the +parsing algorithms remain linear-time even in the face of pathological +input like slowloris, and there are no byte-by-byte loops. (I also +believe that it maintains bounded memory usage in the face of +arbitrary/pathological input.) + +The whole library is ~800 lines-of-code. You can read and understand +the whole thing in less than an hour. 
Most of the energy invested in +this so far has been spent on trying to keep things simple by +minimizing special-cases and ad hoc state manipulation; even though it +is now quite small and simple, I'm still annoyed that I haven't +figured out how to make it even smaller and simpler. (Unfortunately, +HTTP does not lend itself to simplicity.) + +The API is ~feature complete and I don't expect the general outlines +to change much, but you can't judge an API's ergonomics until you +actually document and use it, so I'd expect some changes in the +details. + +*How do I try it?* + +.. code-block:: sh + + $ pip install h11 + $ git clone git@github.com:python-hyper/h11 + $ cd h11/examples + $ python basic-client.py + +and go from there. + +*License?* + +MIT + +*Code of conduct?* + +Contributors are requested to follow our `code of conduct +`_ in +all project spaces. diff --git a/env/lib/python3.13/site-packages/h11-0.16.0.dist-info/RECORD b/env/lib/python3.13/site-packages/h11-0.16.0.dist-info/RECORD new file mode 100644 index 0000000000000000000000000000000000000000..dee4864da26e11320bae425074d7c1b64dac1f95 --- /dev/null +++ b/env/lib/python3.13/site-packages/h11-0.16.0.dist-info/RECORD @@ -0,0 +1,29 @@ +h11-0.16.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 +h11-0.16.0.dist-info/METADATA,sha256=KPMmCYrAn8unm48YD5YIfIQf4kViFct7hyqcfVzRnWQ,8348 +h11-0.16.0.dist-info/RECORD,, +h11-0.16.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91 +h11-0.16.0.dist-info/licenses/LICENSE.txt,sha256=N9tbuFkm2yikJ6JYZ_ELEjIAOuob5pzLhRE4rbjm82E,1124 +h11-0.16.0.dist-info/top_level.txt,sha256=F7dC4jl3zeh8TGHEPaWJrMbeuoWbS379Gwdi-Yvdcis,4 +h11/__init__.py,sha256=iO1KzkSO42yZ6ffg-VMgbx_ZVTWGUY00nRYEWn-s3kY,1507 +h11/__pycache__/__init__.cpython-313.pyc,, +h11/__pycache__/_abnf.cpython-313.pyc,, +h11/__pycache__/_connection.cpython-313.pyc,, +h11/__pycache__/_events.cpython-313.pyc,, +h11/__pycache__/_headers.cpython-313.pyc,, +h11/__pycache__/_readers.cpython-313.pyc,, +h11/__pycache__/_receivebuffer.cpython-313.pyc,, +h11/__pycache__/_state.cpython-313.pyc,, +h11/__pycache__/_util.cpython-313.pyc,, +h11/__pycache__/_version.cpython-313.pyc,, +h11/__pycache__/_writers.cpython-313.pyc,, +h11/_abnf.py,sha256=ybixr0xsupnkA6GFAyMubuXF6Tc1lb_hF890NgCsfNc,4815 +h11/_connection.py,sha256=k9YRVf6koZqbttBW36xSWaJpWdZwa-xQVU9AHEo9DuI,26863 +h11/_events.py,sha256=I97aXoal1Wu7dkL548BANBUCkOIbe-x5CioYA9IBY14,11792 +h11/_headers.py,sha256=P7D-lBNxHwdLZPLimmYwrPG-9ZkjElvvJZJdZAgSP-4,10412 +h11/_readers.py,sha256=a4RypORUCC3d0q_kxPuBIM7jTD8iLt5X91TH0FsduN4,8590 +h11/_receivebuffer.py,sha256=xrspsdsNgWFxRfQcTXxR8RrdjRXXTK0Io5cQYWpJ1Ws,5252 +h11/_state.py,sha256=_5LG_BGR8FCcFQeBPH-TMHgm_-B-EUcWCnQof_9XjFE,13231 +h11/_util.py,sha256=LWkkjXyJaFlAy6Lt39w73UStklFT5ovcvo0TkY7RYuk,4888 +h11/_version.py,sha256=GVSsbPSPDcOuF6ptfIiXnVJoaEm3ygXbMnqlr_Giahw,686 +h11/_writers.py,sha256=oFKm6PtjeHfbj4RLX7VB7KDc1gIY53gXG3_HR9ltmTA,5081 +h11/py.typed,sha256=sow9soTwP9T_gEAQSVh7Gb8855h04Nwmhs2We-JRgZM,7 diff --git a/env/lib/python3.13/site-packages/h11-0.16.0.dist-info/WHEEL b/env/lib/python3.13/site-packages/h11-0.16.0.dist-info/WHEEL new file mode 100644 index 0000000000000000000000000000000000000000..1eb3c49d99559863120cfb8433fc8738fba43ba9 --- /dev/null +++ b/env/lib/python3.13/site-packages/h11-0.16.0.dist-info/WHEEL @@ -0,0 +1,5 @@ +Wheel-Version: 1.0 +Generator: setuptools (78.1.0) +Root-Is-Purelib: true +Tag: py3-none-any + diff --git 
a/env/lib/python3.13/site-packages/h11-0.16.0.dist-info/top_level.txt b/env/lib/python3.13/site-packages/h11-0.16.0.dist-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d24def711344ec6f4da2108f7d5c9261eb35f8b --- /dev/null +++ b/env/lib/python3.13/site-packages/h11-0.16.0.dist-info/top_level.txt @@ -0,0 +1 @@ +h11 diff --git a/env/lib/python3.13/site-packages/httpcore/__init__.py b/env/lib/python3.13/site-packages/httpcore/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9a92dc4a440bdf6f259ec1083c89c817eb7b631b --- /dev/null +++ b/env/lib/python3.13/site-packages/httpcore/__init__.py @@ -0,0 +1,141 @@ +from ._api import request, stream +from ._async import ( + AsyncConnectionInterface, + AsyncConnectionPool, + AsyncHTTP2Connection, + AsyncHTTP11Connection, + AsyncHTTPConnection, + AsyncHTTPProxy, + AsyncSOCKSProxy, +) +from ._backends.base import ( + SOCKET_OPTION, + AsyncNetworkBackend, + AsyncNetworkStream, + NetworkBackend, + NetworkStream, +) +from ._backends.mock import AsyncMockBackend, AsyncMockStream, MockBackend, MockStream +from ._backends.sync import SyncBackend +from ._exceptions import ( + ConnectError, + ConnectionNotAvailable, + ConnectTimeout, + LocalProtocolError, + NetworkError, + PoolTimeout, + ProtocolError, + ProxyError, + ReadError, + ReadTimeout, + RemoteProtocolError, + TimeoutException, + UnsupportedProtocol, + WriteError, + WriteTimeout, +) +from ._models import URL, Origin, Proxy, Request, Response +from ._ssl import default_ssl_context +from ._sync import ( + ConnectionInterface, + ConnectionPool, + HTTP2Connection, + HTTP11Connection, + HTTPConnection, + HTTPProxy, + SOCKSProxy, +) + +# The 'httpcore.AnyIOBackend' class is conditional on 'anyio' being installed. +try: + from ._backends.anyio import AnyIOBackend +except ImportError: # pragma: nocover + + class AnyIOBackend: # type: ignore + def __init__(self, *args, **kwargs): # type: ignore + msg = ( + "Attempted to use 'httpcore.AnyIOBackend' but 'anyio' is not installed." + ) + raise RuntimeError(msg) + + +# The 'httpcore.TrioBackend' class is conditional on 'trio' being installed. +try: + from ._backends.trio import TrioBackend +except ImportError: # pragma: nocover + + class TrioBackend: # type: ignore + def __init__(self, *args, **kwargs): # type: ignore + msg = "Attempted to use 'httpcore.TrioBackend' but 'trio' is not installed." 
+ raise RuntimeError(msg) + + +__all__ = [ + # top-level requests + "request", + "stream", + # models + "Origin", + "URL", + "Request", + "Response", + "Proxy", + # async + "AsyncHTTPConnection", + "AsyncConnectionPool", + "AsyncHTTPProxy", + "AsyncHTTP11Connection", + "AsyncHTTP2Connection", + "AsyncConnectionInterface", + "AsyncSOCKSProxy", + # sync + "HTTPConnection", + "ConnectionPool", + "HTTPProxy", + "HTTP11Connection", + "HTTP2Connection", + "ConnectionInterface", + "SOCKSProxy", + # network backends, implementations + "SyncBackend", + "AnyIOBackend", + "TrioBackend", + # network backends, mock implementations + "AsyncMockBackend", + "AsyncMockStream", + "MockBackend", + "MockStream", + # network backends, interface + "AsyncNetworkStream", + "AsyncNetworkBackend", + "NetworkStream", + "NetworkBackend", + # util + "default_ssl_context", + "SOCKET_OPTION", + # exceptions + "ConnectionNotAvailable", + "ProxyError", + "ProtocolError", + "LocalProtocolError", + "RemoteProtocolError", + "UnsupportedProtocol", + "TimeoutException", + "PoolTimeout", + "ConnectTimeout", + "ReadTimeout", + "WriteTimeout", + "NetworkError", + "ConnectError", + "ReadError", + "WriteError", +] + +__version__ = "1.0.9" + + +__locals = locals() +for __name in __all__: + # Exclude SOCKET_OPTION, it causes AttributeError on Python 3.14 + if not __name.startswith(("__", "SOCKET_OPTION")): + setattr(__locals[__name], "__module__", "httpcore") # noqa diff --git a/env/lib/python3.13/site-packages/httpcore/_api.py b/env/lib/python3.13/site-packages/httpcore/_api.py new file mode 100644 index 0000000000000000000000000000000000000000..38b961d10de88bebc98c758d0d1f14af1e7c0370 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpcore/_api.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +import contextlib +import typing + +from ._models import URL, Extensions, HeaderTypes, Response +from ._sync.connection_pool import ConnectionPool + + +def request( + method: bytes | str, + url: URL | bytes | str, + *, + headers: HeaderTypes = None, + content: bytes | typing.Iterator[bytes] | None = None, + extensions: Extensions | None = None, +) -> Response: + """ + Sends an HTTP request, returning the response. + + ``` + response = httpcore.request("GET", "https://www.example.com/") + ``` + + Arguments: + method: The HTTP method for the request. Typically one of `"GET"`, + `"OPTIONS"`, `"HEAD"`, `"POST"`, `"PUT"`, `"PATCH"`, or `"DELETE"`. + url: The URL of the HTTP request. Either as an instance of `httpcore.URL`, + or as str/bytes. + headers: The HTTP request headers. Either as a dictionary of str/bytes, + or as a list of two-tuples of str/bytes. + content: The content of the request body. Either as bytes, + or as a bytes iterator. + extensions: A dictionary of optional extra information included on the request. + Possible keys include `"timeout"`. + + Returns: + An instance of `httpcore.Response`. + """ + with ConnectionPool() as pool: + return pool.request( + method=method, + url=url, + headers=headers, + content=content, + extensions=extensions, + ) + + +@contextlib.contextmanager +def stream( + method: bytes | str, + url: URL | bytes | str, + *, + headers: HeaderTypes = None, + content: bytes | typing.Iterator[bytes] | None = None, + extensions: Extensions | None = None, +) -> typing.Iterator[Response]: + """ + Sends an HTTP request, returning the response within a content manager. + + ``` + with httpcore.stream("GET", "https://www.example.com/") as response: + ... 
+ ``` + + When using the `stream()` function, the body of the response will not be + automatically read. If you want to access the response body you should + either use `content = response.read()`, or `for chunk in response.iter_content()`. + + Arguments: + method: The HTTP method for the request. Typically one of `"GET"`, + `"OPTIONS"`, `"HEAD"`, `"POST"`, `"PUT"`, `"PATCH"`, or `"DELETE"`. + url: The URL of the HTTP request. Either as an instance of `httpcore.URL`, + or as str/bytes. + headers: The HTTP request headers. Either as a dictionary of str/bytes, + or as a list of two-tuples of str/bytes. + content: The content of the request body. Either as bytes, + or as a bytes iterator. + extensions: A dictionary of optional extra information included on the request. + Possible keys include `"timeout"`. + + Returns: + An instance of `httpcore.Response`. + """ + with ConnectionPool() as pool: + with pool.stream( + method=method, + url=url, + headers=headers, + content=content, + extensions=extensions, + ) as response: + yield response diff --git a/env/lib/python3.13/site-packages/httpcore/_exceptions.py b/env/lib/python3.13/site-packages/httpcore/_exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..bc28d44f55bdc4b872951a74780469a3999d9ab4 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpcore/_exceptions.py @@ -0,0 +1,81 @@ +import contextlib +import typing + +ExceptionMapping = typing.Mapping[typing.Type[Exception], typing.Type[Exception]] + + +@contextlib.contextmanager +def map_exceptions(map: ExceptionMapping) -> typing.Iterator[None]: + try: + yield + except Exception as exc: # noqa: PIE786 + for from_exc, to_exc in map.items(): + if isinstance(exc, from_exc): + raise to_exc(exc) from exc + raise # pragma: nocover + + +class ConnectionNotAvailable(Exception): + pass + + +class ProxyError(Exception): + pass + + +class UnsupportedProtocol(Exception): + pass + + +class ProtocolError(Exception): + pass + + +class RemoteProtocolError(ProtocolError): + pass + + +class LocalProtocolError(ProtocolError): + pass + + +# Timeout errors + + +class TimeoutException(Exception): + pass + + +class PoolTimeout(TimeoutException): + pass + + +class ConnectTimeout(TimeoutException): + pass + + +class ReadTimeout(TimeoutException): + pass + + +class WriteTimeout(TimeoutException): + pass + + +# Network errors + + +class NetworkError(Exception): + pass + + +class ConnectError(NetworkError): + pass + + +class ReadError(NetworkError): + pass + + +class WriteError(NetworkError): + pass diff --git a/env/lib/python3.13/site-packages/httpcore/_models.py b/env/lib/python3.13/site-packages/httpcore/_models.py new file mode 100644 index 0000000000000000000000000000000000000000..8a65f13347d6621289a166d08123cbc8e1ad0157 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpcore/_models.py @@ -0,0 +1,516 @@ +from __future__ import annotations + +import base64 +import ssl +import typing +import urllib.parse + +# Functions for typechecking... + + +ByteOrStr = typing.Union[bytes, str] +HeadersAsSequence = typing.Sequence[typing.Tuple[ByteOrStr, ByteOrStr]] +HeadersAsMapping = typing.Mapping[ByteOrStr, ByteOrStr] +HeaderTypes = typing.Union[HeadersAsSequence, HeadersAsMapping, None] + +Extensions = typing.MutableMapping[str, typing.Any] + + +def enforce_bytes(value: bytes | str, *, name: str) -> bytes: + """ + Any arguments that are ultimately represented as bytes can be specified + either as bytes or as strings. 
+ + However we enforce that any string arguments must only contain characters in + the plain ASCII range. chr(0)...chr(127). If you need to use characters + outside that range then be precise, and use a byte-wise argument. + """ + if isinstance(value, str): + try: + return value.encode("ascii") + except UnicodeEncodeError: + raise TypeError(f"{name} strings may not include unicode characters.") + elif isinstance(value, bytes): + return value + + seen_type = type(value).__name__ + raise TypeError(f"{name} must be bytes or str, but got {seen_type}.") + + +def enforce_url(value: URL | bytes | str, *, name: str) -> URL: + """ + Type check for URL parameters. + """ + if isinstance(value, (bytes, str)): + return URL(value) + elif isinstance(value, URL): + return value + + seen_type = type(value).__name__ + raise TypeError(f"{name} must be a URL, bytes, or str, but got {seen_type}.") + + +def enforce_headers( + value: HeadersAsMapping | HeadersAsSequence | None = None, *, name: str +) -> list[tuple[bytes, bytes]]: + """ + Convienence function that ensure all items in request or response headers + are either bytes or strings in the plain ASCII range. + """ + if value is None: + return [] + elif isinstance(value, typing.Mapping): + return [ + ( + enforce_bytes(k, name="header name"), + enforce_bytes(v, name="header value"), + ) + for k, v in value.items() + ] + elif isinstance(value, typing.Sequence): + return [ + ( + enforce_bytes(k, name="header name"), + enforce_bytes(v, name="header value"), + ) + for k, v in value + ] + + seen_type = type(value).__name__ + raise TypeError( + f"{name} must be a mapping or sequence of two-tuples, but got {seen_type}." + ) + + +def enforce_stream( + value: bytes | typing.Iterable[bytes] | typing.AsyncIterable[bytes] | None, + *, + name: str, +) -> typing.Iterable[bytes] | typing.AsyncIterable[bytes]: + if value is None: + return ByteStream(b"") + elif isinstance(value, bytes): + return ByteStream(value) + return value + + +# * https://tools.ietf.org/html/rfc3986#section-3.2.3 +# * https://url.spec.whatwg.org/#url-miscellaneous +# * https://url.spec.whatwg.org/#scheme-state +DEFAULT_PORTS = { + b"ftp": 21, + b"http": 80, + b"https": 443, + b"ws": 80, + b"wss": 443, +} + + +def include_request_headers( + headers: list[tuple[bytes, bytes]], + *, + url: "URL", + content: None | bytes | typing.Iterable[bytes] | typing.AsyncIterable[bytes], +) -> list[tuple[bytes, bytes]]: + headers_set = set(k.lower() for k, v in headers) + + if b"host" not in headers_set: + default_port = DEFAULT_PORTS.get(url.scheme) + if url.port is None or url.port == default_port: + header_value = url.host + else: + header_value = b"%b:%d" % (url.host, url.port) + headers = [(b"Host", header_value)] + headers + + if ( + content is not None + and b"content-length" not in headers_set + and b"transfer-encoding" not in headers_set + ): + if isinstance(content, bytes): + content_length = str(len(content)).encode("ascii") + headers += [(b"Content-Length", content_length)] + else: + headers += [(b"Transfer-Encoding", b"chunked")] # pragma: nocover + + return headers + + +# Interfaces for byte streams... + + +class ByteStream: + """ + A container for non-streaming content, and that supports both sync and async + stream iteration. 
+ """ + + def __init__(self, content: bytes) -> None: + self._content = content + + def __iter__(self) -> typing.Iterator[bytes]: + yield self._content + + async def __aiter__(self) -> typing.AsyncIterator[bytes]: + yield self._content + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} [{len(self._content)} bytes]>" + + +class Origin: + def __init__(self, scheme: bytes, host: bytes, port: int) -> None: + self.scheme = scheme + self.host = host + self.port = port + + def __eq__(self, other: typing.Any) -> bool: + return ( + isinstance(other, Origin) + and self.scheme == other.scheme + and self.host == other.host + and self.port == other.port + ) + + def __str__(self) -> str: + scheme = self.scheme.decode("ascii") + host = self.host.decode("ascii") + port = str(self.port) + return f"{scheme}://{host}:{port}" + + +class URL: + """ + Represents the URL against which an HTTP request may be made. + + The URL may either be specified as a plain string, for convienence: + + ```python + url = httpcore.URL("https://www.example.com/") + ``` + + Or be constructed with explicitily pre-parsed components: + + ```python + url = httpcore.URL(scheme=b'https', host=b'www.example.com', port=None, target=b'/') + ``` + + Using this second more explicit style allows integrations that are using + `httpcore` to pass through URLs that have already been parsed in order to use + libraries such as `rfc-3986` rather than relying on the stdlib. It also ensures + that URL parsing is treated identically at both the networking level and at any + higher layers of abstraction. + + The four components are important here, as they allow the URL to be precisely + specified in a pre-parsed format. They also allow certain types of request to + be created that could not otherwise be expressed. + + For example, an HTTP request to `http://www.example.com/` forwarded via a proxy + at `http://localhost:8080`... + + ```python + # Constructs an HTTP request with a complete URL as the target: + # GET https://www.example.com/ HTTP/1.1 + url = httpcore.URL( + scheme=b'http', + host=b'localhost', + port=8080, + target=b'https://www.example.com/' + ) + request = httpcore.Request( + method="GET", + url=url + ) + ``` + + Another example is constructing an `OPTIONS *` request... + + ```python + # Constructs an 'OPTIONS *' HTTP request: + # OPTIONS * HTTP/1.1 + url = httpcore.URL(scheme=b'https', host=b'www.example.com', target=b'*') + request = httpcore.Request(method="OPTIONS", url=url) + ``` + + This kind of request is not possible to formulate with a URL string, + because the `/` delimiter is always used to demark the target from the + host/port portion of the URL. + + For convenience, string-like arguments may be specified either as strings or + as bytes. However, once a request is being issue over-the-wire, the URL + components are always ultimately required to be a bytewise representation. + + In order to avoid any ambiguity over character encodings, when strings are used + as arguments, they must be strictly limited to the ASCII range `chr(0)`-`chr(127)`. + If you require a bytewise representation that is outside this range you must + handle the character encoding directly, and pass a bytes instance. + """ + + def __init__( + self, + url: bytes | str = "", + *, + scheme: bytes | str = b"", + host: bytes | str = b"", + port: int | None = None, + target: bytes | str = b"", + ) -> None: + """ + Parameters: + url: The complete URL as a string or bytes. + scheme: The URL scheme as a string or bytes. 
+ Typically either `"http"` or `"https"`. + host: The URL host as a string or bytes. Such as `"www.example.com"`. + port: The port to connect to. Either an integer or `None`. + target: The target of the HTTP request. Such as `"/items?search=red"`. + """ + if url: + parsed = urllib.parse.urlparse(enforce_bytes(url, name="url")) + self.scheme = parsed.scheme + self.host = parsed.hostname or b"" + self.port = parsed.port + self.target = (parsed.path or b"/") + ( + b"?" + parsed.query if parsed.query else b"" + ) + else: + self.scheme = enforce_bytes(scheme, name="scheme") + self.host = enforce_bytes(host, name="host") + self.port = port + self.target = enforce_bytes(target, name="target") + + @property + def origin(self) -> Origin: + default_port = { + b"http": 80, + b"https": 443, + b"ws": 80, + b"wss": 443, + b"socks5": 1080, + b"socks5h": 1080, + }[self.scheme] + return Origin( + scheme=self.scheme, host=self.host, port=self.port or default_port + ) + + def __eq__(self, other: typing.Any) -> bool: + return ( + isinstance(other, URL) + and other.scheme == self.scheme + and other.host == self.host + and other.port == self.port + and other.target == self.target + ) + + def __bytes__(self) -> bytes: + if self.port is None: + return b"%b://%b%b" % (self.scheme, self.host, self.target) + return b"%b://%b:%d%b" % (self.scheme, self.host, self.port, self.target) + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}(scheme={self.scheme!r}, " + f"host={self.host!r}, port={self.port!r}, target={self.target!r})" + ) + + +class Request: + """ + An HTTP request. + """ + + def __init__( + self, + method: bytes | str, + url: URL | bytes | str, + *, + headers: HeaderTypes = None, + content: bytes + | typing.Iterable[bytes] + | typing.AsyncIterable[bytes] + | None = None, + extensions: Extensions | None = None, + ) -> None: + """ + Parameters: + method: The HTTP request method, either as a string or bytes. + For example: `GET`. + url: The request URL, either as a `URL` instance, or as a string or bytes. + For example: `"https://www.example.com".` + headers: The HTTP request headers. + content: The content of the request body. + extensions: A dictionary of optional extra information included on + the request. Possible keys include `"timeout"`, and `"trace"`. + """ + self.method: bytes = enforce_bytes(method, name="method") + self.url: URL = enforce_url(url, name="url") + self.headers: list[tuple[bytes, bytes]] = enforce_headers( + headers, name="headers" + ) + self.stream: typing.Iterable[bytes] | typing.AsyncIterable[bytes] = ( + enforce_stream(content, name="content") + ) + self.extensions = {} if extensions is None else extensions + + if "target" in self.extensions: + self.url = URL( + scheme=self.url.scheme, + host=self.url.host, + port=self.url.port, + target=self.extensions["target"], + ) + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} [{self.method!r}]>" + + +class Response: + """ + An HTTP response. + """ + + def __init__( + self, + status: int, + *, + headers: HeaderTypes = None, + content: bytes + | typing.Iterable[bytes] + | typing.AsyncIterable[bytes] + | None = None, + extensions: Extensions | None = None, + ) -> None: + """ + Parameters: + status: The HTTP status code of the response. For example `200`. + headers: The HTTP response headers. + content: The content of the response body. + extensions: A dictionary of optional extra information included on + the responseself.Possible keys include `"http_version"`, + `"reason_phrase"`, and `"network_stream"`. 
+ """ + self.status: int = status + self.headers: list[tuple[bytes, bytes]] = enforce_headers( + headers, name="headers" + ) + self.stream: typing.Iterable[bytes] | typing.AsyncIterable[bytes] = ( + enforce_stream(content, name="content") + ) + self.extensions = {} if extensions is None else extensions + + self._stream_consumed = False + + @property + def content(self) -> bytes: + if not hasattr(self, "_content"): + if isinstance(self.stream, typing.Iterable): + raise RuntimeError( + "Attempted to access 'response.content' on a streaming response. " + "Call 'response.read()' first." + ) + else: + raise RuntimeError( + "Attempted to access 'response.content' on a streaming response. " + "Call 'await response.aread()' first." + ) + return self._content + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} [{self.status}]>" + + # Sync interface... + + def read(self) -> bytes: + if not isinstance(self.stream, typing.Iterable): # pragma: nocover + raise RuntimeError( + "Attempted to read an asynchronous response using 'response.read()'. " + "You should use 'await response.aread()' instead." + ) + if not hasattr(self, "_content"): + self._content = b"".join([part for part in self.iter_stream()]) + return self._content + + def iter_stream(self) -> typing.Iterator[bytes]: + if not isinstance(self.stream, typing.Iterable): # pragma: nocover + raise RuntimeError( + "Attempted to stream an asynchronous response using 'for ... in " + "response.iter_stream()'. " + "You should use 'async for ... in response.aiter_stream()' instead." + ) + if self._stream_consumed: + raise RuntimeError( + "Attempted to call 'for ... in response.iter_stream()' more than once." + ) + self._stream_consumed = True + for chunk in self.stream: + yield chunk + + def close(self) -> None: + if not isinstance(self.stream, typing.Iterable): # pragma: nocover + raise RuntimeError( + "Attempted to close an asynchronous response using 'response.close()'. " + "You should use 'await response.aclose()' instead." + ) + if hasattr(self.stream, "close"): + self.stream.close() + + # Async interface... + + async def aread(self) -> bytes: + if not isinstance(self.stream, typing.AsyncIterable): # pragma: nocover + raise RuntimeError( + "Attempted to read an synchronous response using " + "'await response.aread()'. " + "You should use 'response.read()' instead." + ) + if not hasattr(self, "_content"): + self._content = b"".join([part async for part in self.aiter_stream()]) + return self._content + + async def aiter_stream(self) -> typing.AsyncIterator[bytes]: + if not isinstance(self.stream, typing.AsyncIterable): # pragma: nocover + raise RuntimeError( + "Attempted to stream an synchronous response using 'async for ... in " + "response.aiter_stream()'. " + "You should use 'for ... in response.iter_stream()' instead." + ) + if self._stream_consumed: + raise RuntimeError( + "Attempted to call 'async for ... in response.aiter_stream()' " + "more than once." + ) + self._stream_consumed = True + async for chunk in self.stream: + yield chunk + + async def aclose(self) -> None: + if not isinstance(self.stream, typing.AsyncIterable): # pragma: nocover + raise RuntimeError( + "Attempted to close a synchronous response using " + "'await response.aclose()'. " + "You should use 'response.close()' instead." 
+ ) + if hasattr(self.stream, "aclose"): + await self.stream.aclose() + + +class Proxy: + def __init__( + self, + url: URL | bytes | str, + auth: tuple[bytes | str, bytes | str] | None = None, + headers: HeadersAsMapping | HeadersAsSequence | None = None, + ssl_context: ssl.SSLContext | None = None, + ): + self.url = enforce_url(url, name="url") + self.headers = enforce_headers(headers, name="headers") + self.ssl_context = ssl_context + + if auth is not None: + username = enforce_bytes(auth[0], name="auth") + password = enforce_bytes(auth[1], name="auth") + userpass = username + b":" + password + authorization = b"Basic " + base64.b64encode(userpass) + self.auth: tuple[bytes, bytes] | None = (username, password) + self.headers = [(b"Proxy-Authorization", authorization)] + self.headers + else: + self.auth = None diff --git a/env/lib/python3.13/site-packages/httpcore/_ssl.py b/env/lib/python3.13/site-packages/httpcore/_ssl.py new file mode 100644 index 0000000000000000000000000000000000000000..c99c5a67945b8a3a3544d481e979c791ab45fe23 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpcore/_ssl.py @@ -0,0 +1,9 @@ +import ssl + +import certifi + + +def default_ssl_context() -> ssl.SSLContext: + context = ssl.create_default_context() + context.load_verify_locations(certifi.where()) + return context diff --git a/env/lib/python3.13/site-packages/httpcore/_synchronization.py b/env/lib/python3.13/site-packages/httpcore/_synchronization.py new file mode 100644 index 0000000000000000000000000000000000000000..2ecc9e9c363e2f16c4f934cf41cf871826d6a495 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpcore/_synchronization.py @@ -0,0 +1,318 @@ +from __future__ import annotations + +import threading +import types + +from ._exceptions import ExceptionMapping, PoolTimeout, map_exceptions + +# Our async synchronization primatives use either 'anyio' or 'trio' depending +# on if they're running under asyncio or trio. + +try: + import trio +except (ImportError, NotImplementedError): # pragma: nocover + trio = None # type: ignore + +try: + import anyio +except ImportError: # pragma: nocover + anyio = None # type: ignore + + +def current_async_library() -> str: + # Determine if we're running under trio or asyncio. + # See https://sniffio.readthedocs.io/en/latest/ + try: + import sniffio + except ImportError: # pragma: nocover + environment = "asyncio" + else: + environment = sniffio.current_async_library() + + if environment not in ("asyncio", "trio"): # pragma: nocover + raise RuntimeError("Running under an unsupported async environment.") + + if environment == "asyncio" and anyio is None: # pragma: nocover + raise RuntimeError( + "Running with asyncio requires installation of 'httpcore[asyncio]'." + ) + + if environment == "trio" and trio is None: # pragma: nocover + raise RuntimeError( + "Running with trio requires installation of 'httpcore[trio]'." + ) + + return environment + + +class AsyncLock: + """ + This is a standard lock. + + In the sync case `Lock` provides thread locking. + In the async case `AsyncLock` provides async locking. + """ + + def __init__(self) -> None: + self._backend = "" + + def setup(self) -> None: + """ + Detect if we're running under 'asyncio' or 'trio' and create + a lock with the correct implementation. 
+ """ + self._backend = current_async_library() + if self._backend == "trio": + self._trio_lock = trio.Lock() + elif self._backend == "asyncio": + self._anyio_lock = anyio.Lock() + + async def __aenter__(self) -> AsyncLock: + if not self._backend: + self.setup() + + if self._backend == "trio": + await self._trio_lock.acquire() + elif self._backend == "asyncio": + await self._anyio_lock.acquire() + + return self + + async def __aexit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + if self._backend == "trio": + self._trio_lock.release() + elif self._backend == "asyncio": + self._anyio_lock.release() + + +class AsyncThreadLock: + """ + This is a threading-only lock for no-I/O contexts. + + In the sync case `ThreadLock` provides thread locking. + In the async case `AsyncThreadLock` is a no-op. + """ + + def __enter__(self) -> AsyncThreadLock: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + pass + + +class AsyncEvent: + def __init__(self) -> None: + self._backend = "" + + def setup(self) -> None: + """ + Detect if we're running under 'asyncio' or 'trio' and create + a lock with the correct implementation. + """ + self._backend = current_async_library() + if self._backend == "trio": + self._trio_event = trio.Event() + elif self._backend == "asyncio": + self._anyio_event = anyio.Event() + + def set(self) -> None: + if not self._backend: + self.setup() + + if self._backend == "trio": + self._trio_event.set() + elif self._backend == "asyncio": + self._anyio_event.set() + + async def wait(self, timeout: float | None = None) -> None: + if not self._backend: + self.setup() + + if self._backend == "trio": + trio_exc_map: ExceptionMapping = {trio.TooSlowError: PoolTimeout} + timeout_or_inf = float("inf") if timeout is None else timeout + with map_exceptions(trio_exc_map): + with trio.fail_after(timeout_or_inf): + await self._trio_event.wait() + elif self._backend == "asyncio": + anyio_exc_map: ExceptionMapping = {TimeoutError: PoolTimeout} + with map_exceptions(anyio_exc_map): + with anyio.fail_after(timeout): + await self._anyio_event.wait() + + +class AsyncSemaphore: + def __init__(self, bound: int) -> None: + self._bound = bound + self._backend = "" + + def setup(self) -> None: + """ + Detect if we're running under 'asyncio' or 'trio' and create + a semaphore with the correct implementation. + """ + self._backend = current_async_library() + if self._backend == "trio": + self._trio_semaphore = trio.Semaphore( + initial_value=self._bound, max_value=self._bound + ) + elif self._backend == "asyncio": + self._anyio_semaphore = anyio.Semaphore( + initial_value=self._bound, max_value=self._bound + ) + + async def acquire(self) -> None: + if not self._backend: + self.setup() + + if self._backend == "trio": + await self._trio_semaphore.acquire() + elif self._backend == "asyncio": + await self._anyio_semaphore.acquire() + + async def release(self) -> None: + if self._backend == "trio": + self._trio_semaphore.release() + elif self._backend == "asyncio": + self._anyio_semaphore.release() + + +class AsyncShieldCancellation: + # For certain portions of our codebase where we're dealing with + # closing connections during exception handling we want to shield + # the operation from being cancelled. + # + # with AsyncShieldCancellation(): + # ... 
# clean-up operations, shielded from cancellation. + + def __init__(self) -> None: + """ + Detect if we're running under 'asyncio' or 'trio' and create + a shielded scope with the correct implementation. + """ + self._backend = current_async_library() + + if self._backend == "trio": + self._trio_shield = trio.CancelScope(shield=True) + elif self._backend == "asyncio": + self._anyio_shield = anyio.CancelScope(shield=True) + + def __enter__(self) -> AsyncShieldCancellation: + if self._backend == "trio": + self._trio_shield.__enter__() + elif self._backend == "asyncio": + self._anyio_shield.__enter__() + return self + + def __exit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + if self._backend == "trio": + self._trio_shield.__exit__(exc_type, exc_value, traceback) + elif self._backend == "asyncio": + self._anyio_shield.__exit__(exc_type, exc_value, traceback) + + +# Our thread-based synchronization primitives... + + +class Lock: + """ + This is a standard lock. + + In the sync case `Lock` provides thread locking. + In the async case `AsyncLock` provides async locking. + """ + + def __init__(self) -> None: + self._lock = threading.Lock() + + def __enter__(self) -> Lock: + self._lock.acquire() + return self + + def __exit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + self._lock.release() + + +class ThreadLock: + """ + This is a threading-only lock for no-I/O contexts. + + In the sync case `ThreadLock` provides thread locking. + In the async case `AsyncThreadLock` is a no-op. + """ + + def __init__(self) -> None: + self._lock = threading.Lock() + + def __enter__(self) -> ThreadLock: + self._lock.acquire() + return self + + def __exit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + self._lock.release() + + +class Event: + def __init__(self) -> None: + self._event = threading.Event() + + def set(self) -> None: + self._event.set() + + def wait(self, timeout: float | None = None) -> None: + if timeout == float("inf"): # pragma: no cover + timeout = None + if not self._event.wait(timeout=timeout): + raise PoolTimeout() # pragma: nocover + + +class Semaphore: + def __init__(self, bound: int) -> None: + self._semaphore = threading.Semaphore(value=bound) + + def acquire(self) -> None: + self._semaphore.acquire() + + def release(self) -> None: + self._semaphore.release() + + +class ShieldCancellation: + # Thread-synchronous codebases don't support cancellation semantics. + # We have this class because we need to mirror the async and sync + # cases within our package, but it's just a no-op. 
+ def __enter__(self) -> ShieldCancellation: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + pass diff --git a/env/lib/python3.13/site-packages/httpcore/_trace.py b/env/lib/python3.13/site-packages/httpcore/_trace.py new file mode 100644 index 0000000000000000000000000000000000000000..5f1cd7c47829ce17dbcf651ab56b4ffdce04a485 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpcore/_trace.py @@ -0,0 +1,107 @@ +from __future__ import annotations + +import inspect +import logging +import types +import typing + +from ._models import Request + + +class Trace: + def __init__( + self, + name: str, + logger: logging.Logger, + request: Request | None = None, + kwargs: dict[str, typing.Any] | None = None, + ) -> None: + self.name = name + self.logger = logger + self.trace_extension = ( + None if request is None else request.extensions.get("trace") + ) + self.debug = self.logger.isEnabledFor(logging.DEBUG) + self.kwargs = kwargs or {} + self.return_value: typing.Any = None + self.should_trace = self.debug or self.trace_extension is not None + self.prefix = self.logger.name.split(".")[-1] + + def trace(self, name: str, info: dict[str, typing.Any]) -> None: + if self.trace_extension is not None: + prefix_and_name = f"{self.prefix}.{name}" + ret = self.trace_extension(prefix_and_name, info) + if inspect.iscoroutine(ret): # pragma: no cover + raise TypeError( + "If you are using a synchronous interface, " + "the callback of the `trace` extension should " + "be a normal function instead of an asynchronous function." + ) + + if self.debug: + if not info or "return_value" in info and info["return_value"] is None: + message = name + else: + args = " ".join([f"{key}={value!r}" for key, value in info.items()]) + message = f"{name} {args}" + self.logger.debug(message) + + def __enter__(self) -> Trace: + if self.should_trace: + info = self.kwargs + self.trace(f"{self.name}.started", info) + return self + + def __exit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + if self.should_trace: + if exc_value is None: + info = {"return_value": self.return_value} + self.trace(f"{self.name}.complete", info) + else: + info = {"exception": exc_value} + self.trace(f"{self.name}.failed", info) + + async def atrace(self, name: str, info: dict[str, typing.Any]) -> None: + if self.trace_extension is not None: + prefix_and_name = f"{self.prefix}.{name}" + coro = self.trace_extension(prefix_and_name, info) + if not inspect.iscoroutine(coro): # pragma: no cover + raise TypeError( + "If you're using an asynchronous interface, " + "the callback of the `trace` extension should " + "be an asynchronous function rather than a normal function." 
+ ) + await coro + + if self.debug: + if not info or "return_value" in info and info["return_value"] is None: + message = name + else: + args = " ".join([f"{key}={value!r}" for key, value in info.items()]) + message = f"{name} {args}" + self.logger.debug(message) + + async def __aenter__(self) -> Trace: + if self.should_trace: + info = self.kwargs + await self.atrace(f"{self.name}.started", info) + return self + + async def __aexit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + if self.should_trace: + if exc_value is None: + info = {"return_value": self.return_value} + await self.atrace(f"{self.name}.complete", info) + else: + info = {"exception": exc_value} + await self.atrace(f"{self.name}.failed", info) diff --git a/env/lib/python3.13/site-packages/httpcore/_utils.py b/env/lib/python3.13/site-packages/httpcore/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c44ff93cb2f572afc6e679308024b744b65c3b0a --- /dev/null +++ b/env/lib/python3.13/site-packages/httpcore/_utils.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import select +import socket +import sys + + +def is_socket_readable(sock: socket.socket | None) -> bool: + """ + Return whether a socket, as identifed by its file descriptor, is readable. + "A socket is readable" means that the read buffer isn't empty, i.e. that calling + .recv() on it would immediately return some data. + """ + # NOTE: we want check for readability without actually attempting to read, because + # we don't want to block forever if it's not readable. + + # In the case that the socket no longer exists, or cannot return a file + # descriptor, we treat it as being readable, as if it the next read operation + # on it is ready to return the terminating `b""`. + sock_fd = None if sock is None else sock.fileno() + if sock_fd is None or sock_fd < 0: # pragma: nocover + return True + + # The implementation below was stolen from: + # https://github.com/python-trio/trio/blob/20ee2b1b7376db637435d80e266212a35837ddcc/trio/_socket.py#L471-L478 + # See also: https://github.com/encode/httpcore/pull/193#issuecomment-703129316 + + # Use select.select on Windows, and when poll is unavailable and select.poll + # everywhere else. (E.g. When eventlet is in use. 
See #327) + if ( + sys.platform == "win32" or getattr(select, "poll", None) is None + ): # pragma: nocover + rready, _, _ = select.select([sock_fd], [], [], 0) + return bool(rready) + p = select.poll() + p.register(sock_fd, select.POLLIN) + return bool(p.poll(0)) diff --git a/env/lib/python3.13/site-packages/httpcore/py.typed b/env/lib/python3.13/site-packages/httpcore/py.typed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/env/lib/python3.13/site-packages/httpx-0.28.1.dist-info/INSTALLER b/env/lib/python3.13/site-packages/httpx-0.28.1.dist-info/INSTALLER new file mode 100644 index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx-0.28.1.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git a/env/lib/python3.13/site-packages/httpx-0.28.1.dist-info/METADATA b/env/lib/python3.13/site-packages/httpx-0.28.1.dist-info/METADATA new file mode 100644 index 0000000000000000000000000000000000000000..b0d2b196385e98259971519793447c1fd7a9a643 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx-0.28.1.dist-info/METADATA @@ -0,0 +1,203 @@ +Metadata-Version: 2.3 +Name: httpx +Version: 0.28.1 +Summary: The next generation HTTP client. +Project-URL: Changelog, https://github.com/encode/httpx/blob/master/CHANGELOG.md +Project-URL: Documentation, https://www.python-httpx.org +Project-URL: Homepage, https://github.com/encode/httpx +Project-URL: Source, https://github.com/encode/httpx +Author-email: Tom Christie +License: BSD-3-Clause +Classifier: Development Status :: 4 - Beta +Classifier: Environment :: Web Environment +Classifier: Framework :: AsyncIO +Classifier: Framework :: Trio +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: BSD License +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3 :: Only +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 +Classifier: Topic :: Internet :: WWW/HTTP +Requires-Python: >=3.8 +Requires-Dist: anyio +Requires-Dist: certifi +Requires-Dist: httpcore==1.* +Requires-Dist: idna +Provides-Extra: brotli +Requires-Dist: brotli; (platform_python_implementation == 'CPython') and extra == 'brotli' +Requires-Dist: brotlicffi; (platform_python_implementation != 'CPython') and extra == 'brotli' +Provides-Extra: cli +Requires-Dist: click==8.*; extra == 'cli' +Requires-Dist: pygments==2.*; extra == 'cli' +Requires-Dist: rich<14,>=10; extra == 'cli' +Provides-Extra: http2 +Requires-Dist: h2<5,>=3; extra == 'http2' +Provides-Extra: socks +Requires-Dist: socksio==1.*; extra == 'socks' +Provides-Extra: zstd +Requires-Dist: zstandard>=0.18.0; extra == 'zstd' +Description-Content-Type: text/markdown + +

+[HTTPX logo]
+
+*HTTPX - A next-generation HTTP client for Python.*
+
+[badges: Test Suite | Package version]
+ +HTTPX is a fully featured HTTP client library for Python 3. It includes **an integrated command line client**, has support for both **HTTP/1.1 and HTTP/2**, and provides both **sync and async APIs**. + +--- + +Install HTTPX using pip: + +```shell +$ pip install httpx +``` + +Now, let's get started: + +```pycon +>>> import httpx +>>> r = httpx.get('https://www.example.org/') +>>> r + +>>> r.status_code +200 +>>> r.headers['content-type'] +'text/html; charset=UTF-8' +>>> r.text +'\n\n\nExample Domain...' +``` + +Or, using the command-line client. + +```shell +$ pip install 'httpx[cli]' # The command line client is an optional dependency. +``` + +Which now allows us to use HTTPX directly from the command-line... + +

+[image: `httpx --help`]
+
+Sending a request...
+
+[image: `httpx http://httpbin.org/json`]
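+The quickstart above shows only the synchronous API. As a minimal sketch (the URL here is just a
+placeholder, not part of the original README), the asynchronous API mirrors the synchronous one via
+`AsyncClient`:
+
+```python
+import asyncio
+
+import httpx
+
+
+async def main() -> None:
+    # The async client is used as a context manager, and requests are awaited.
+    async with httpx.AsyncClient() as client:
+        r = await client.get("https://www.example.org/")
+        print(r.status_code, r.headers["content-type"])
+
+
+asyncio.run(main())
+```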
+ +## Features + +HTTPX builds on the well-established usability of `requests`, and gives you: + +* A broadly [requests-compatible API](https://www.python-httpx.org/compatibility/). +* An integrated command-line client. +* HTTP/1.1 [and HTTP/2 support](https://www.python-httpx.org/http2/). +* Standard synchronous interface, but with [async support if you need it](https://www.python-httpx.org/async/). +* Ability to make requests directly to [WSGI applications](https://www.python-httpx.org/advanced/transports/#wsgi-transport) or [ASGI applications](https://www.python-httpx.org/advanced/transports/#asgi-transport). +* Strict timeouts everywhere. +* Fully type annotated. +* 100% test coverage. + +Plus all the standard features of `requests`... + +* International Domains and URLs +* Keep-Alive & Connection Pooling +* Sessions with Cookie Persistence +* Browser-style SSL Verification +* Basic/Digest Authentication +* Elegant Key/Value Cookies +* Automatic Decompression +* Automatic Content Decoding +* Unicode Response Bodies +* Multipart File Uploads +* HTTP(S) Proxy Support +* Connection Timeouts +* Streaming Downloads +* .netrc Support +* Chunked Requests + +## Installation + +Install with pip: + +```shell +$ pip install httpx +``` + +Or, to include the optional HTTP/2 support, use: + +```shell +$ pip install httpx[http2] +``` + +HTTPX requires Python 3.8+. + +## Documentation + +Project documentation is available at [https://www.python-httpx.org/](https://www.python-httpx.org/). + +For a run-through of all the basics, head over to the [QuickStart](https://www.python-httpx.org/quickstart/). + +For more advanced topics, see the [Advanced Usage](https://www.python-httpx.org/advanced/) section, the [async support](https://www.python-httpx.org/async/) section, or the [HTTP/2](https://www.python-httpx.org/http2/) section. + +The [Developer Interface](https://www.python-httpx.org/api/) provides a comprehensive API reference. + +To find out about tools that integrate with HTTPX, see [Third Party Packages](https://www.python-httpx.org/third_party_packages/). + +## Contribute + +If you want to contribute with HTTPX check out the [Contributing Guide](https://www.python-httpx.org/contributing/) to learn how to start. + +## Dependencies + +The HTTPX project relies on these excellent libraries: + +* `httpcore` - The underlying transport implementation for `httpx`. + * `h11` - HTTP/1.1 support. +* `certifi` - SSL certificates. +* `idna` - Internationalized domain name support. +* `sniffio` - Async library autodetection. + +As well as these optional installs: + +* `h2` - HTTP/2 support. *(Optional, with `httpx[http2]`)* +* `socksio` - SOCKS proxy support. *(Optional, with `httpx[socks]`)* +* `rich` - Rich terminal support. *(Optional, with `httpx[cli]`)* +* `click` - Command line client support. *(Optional, with `httpx[cli]`)* +* `brotli` or `brotlicffi` - Decoding for "brotli" compressed responses. *(Optional, with `httpx[brotli]`)* +* `zstandard` - Decoding for "zstd" compressed responses. *(Optional, with `httpx[zstd]`)* + +A huge amount of credit is due to `requests` for the API layout that +much of this work follows, as well as to `urllib3` for plenty of design +inspiration around the lower-level networking details. + +--- + +

+*HTTPX is BSD licensed code. Designed & crafted with care.*
+
+*— 🦋 —*
+ +## Release Information + +### Fixed + +* Reintroduced supposedly-private `URLTypes` shortcut. (#2673) + + +--- + +[Full changelog](https://github.com/encode/httpx/blob/master/CHANGELOG.md) diff --git a/env/lib/python3.13/site-packages/httpx-0.28.1.dist-info/RECORD b/env/lib/python3.13/site-packages/httpx-0.28.1.dist-info/RECORD new file mode 100644 index 0000000000000000000000000000000000000000..215a1ac419f441d85c5e303930b7c7563f5f4951 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx-0.28.1.dist-info/RECORD @@ -0,0 +1,54 @@ +../../../bin/httpx,sha256=Rdk5LW602TkuZHquotWRLLc4TnaaXhrPLmhrYBsr8js,253 +httpx-0.28.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 +httpx-0.28.1.dist-info/METADATA,sha256=_rubD48-gNV8gZnDBPNcQzboWB0dGNeYPJJ2a4J5OyU,7052 +httpx-0.28.1.dist-info/RECORD,, +httpx-0.28.1.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87 +httpx-0.28.1.dist-info/entry_points.txt,sha256=2lVkdQmxLA1pNMgSN2eV89o90HCZezhmNwsy6ryKDSA,37 +httpx-0.28.1.dist-info/licenses/LICENSE.md,sha256=TsWdVE8StfU5o6cW_TIaxYzNgDC0ZSIfLIgCAM3yjY0,1508 +httpx/__init__.py,sha256=CsaZe6yZj0rHg6322AWKWHGTMVr9txgEfD5P3_Rrz60,2171 +httpx/__pycache__/__init__.cpython-313.pyc,, +httpx/__pycache__/__version__.cpython-313.pyc,, +httpx/__pycache__/_api.cpython-313.pyc,, +httpx/__pycache__/_auth.cpython-313.pyc,, +httpx/__pycache__/_client.cpython-313.pyc,, +httpx/__pycache__/_config.cpython-313.pyc,, +httpx/__pycache__/_content.cpython-313.pyc,, +httpx/__pycache__/_decoders.cpython-313.pyc,, +httpx/__pycache__/_exceptions.cpython-313.pyc,, +httpx/__pycache__/_main.cpython-313.pyc,, +httpx/__pycache__/_models.cpython-313.pyc,, +httpx/__pycache__/_multipart.cpython-313.pyc,, +httpx/__pycache__/_status_codes.cpython-313.pyc,, +httpx/__pycache__/_types.cpython-313.pyc,, +httpx/__pycache__/_urlparse.cpython-313.pyc,, +httpx/__pycache__/_urls.cpython-313.pyc,, +httpx/__pycache__/_utils.cpython-313.pyc,, +httpx/__version__.py,sha256=LoUyYeOXTieGzuP_64UL0wxdtxjuu_QbOvE7NOg-IqU,108 +httpx/_api.py,sha256=r_Zgs4jIpcPJLqK5dbbSayqo_iVMKFaxZCd-oOHxLEs,11743 +httpx/_auth.py,sha256=Yr3QwaUSK17rGYx-7j-FdicFIzz4Y9FFV-1F4-7RXX4,11891 +httpx/_client.py,sha256=xD-UG67-WMkeltAAOeGGj-cZ2RRTAm19sWRxlFY7_40,65714 +httpx/_config.py,sha256=pPp2U-wicfcKsF-KYRE1LYdt3e6ERGeIoXZ8Gjo3LWc,8547 +httpx/_content.py,sha256=LGGzrJTR3OvN4Mb1GVVNLXkXJH-6oKlwAttO9p5w_yg,8161 +httpx/_decoders.py,sha256=p0dX8I0NEHexs3UGp4SsZutiMhsXrrWl6-GnqVb0iKM,12041 +httpx/_exceptions.py,sha256=bxW7fxzgVMAdNTbwT0Vnq04gJDW1_gI_GFiQPuMyjL0,8527 +httpx/_main.py,sha256=Cg9GMabiTT_swaDfUgIRitSwxLRMSwUDOm7LdSGqlA4,15626 +httpx/_models.py,sha256=4__Guyv1gLxuZChwim8kfQNiIOcJ9acreFOSurvZfms,44700 +httpx/_multipart.py,sha256=KOHEZZl6oohg9mPaKyyu345qq1rJLg35TUG3YAzXB3Y,9843 +httpx/_status_codes.py,sha256=DYn-2ufBgMeXy5s8x3_TB7wjAuAAMewTakPrm5rXEsc,5639 +httpx/_transports/__init__.py,sha256=GbUoBSAOp7z-l-9j5YhMhR3DMIcn6FVLhj072O3Nnno,275 +httpx/_transports/__pycache__/__init__.cpython-313.pyc,, +httpx/_transports/__pycache__/asgi.cpython-313.pyc,, +httpx/_transports/__pycache__/base.cpython-313.pyc,, +httpx/_transports/__pycache__/default.cpython-313.pyc,, +httpx/_transports/__pycache__/mock.cpython-313.pyc,, +httpx/_transports/__pycache__/wsgi.cpython-313.pyc,, +httpx/_transports/asgi.py,sha256=HRfiDYMPt4wQH2gFgHZg4c-i3sblo6bL5GTqcET-xz8,5501 +httpx/_transports/base.py,sha256=kZS_VMbViYfF570pogUCJ1bulz-ybfL51Pqs9yktebU,2523 +httpx/_transports/default.py,sha256=AzeaRUyVwCccTyyNJexDf0n1dFfzzydpdIQgvw7PLnk,13983 
+httpx/_transports/mock.py,sha256=PTo0d567RITXxGrki6kN7_67wwAxfwiMDcuXJiZCjEo,1232 +httpx/_transports/wsgi.py,sha256=NcPX3Xap_EwCFZWO_OaSyQNuInCYx1QMNbO8GAei6jY,4825 +httpx/_types.py,sha256=Jyh41GQq7AOev8IOWKDAg7zCbvHAfufmW5g_PiTtErY,2965 +httpx/_urlparse.py,sha256=ZAmH47ONfkxrrj-PPYhGeiHjb6AjKCS-ANWIN4OL_KY,18546 +httpx/_urls.py,sha256=dX99VR1DSOHpgo9Aq7PzYO4FKdxqKjwyNp8grf8dHN0,21550 +httpx/_utils.py,sha256=_TVeqAKvxJkKHdz7dFeb4s0LZqQXgeFkXSgfiHBK_1o,8285 +httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 diff --git a/env/lib/python3.13/site-packages/httpx-0.28.1.dist-info/WHEEL b/env/lib/python3.13/site-packages/httpx-0.28.1.dist-info/WHEEL new file mode 100644 index 0000000000000000000000000000000000000000..21aaa72961a8af71c17d2cb3b76d5f7f567100e4 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx-0.28.1.dist-info/WHEEL @@ -0,0 +1,4 @@ +Wheel-Version: 1.0 +Generator: hatchling 1.26.3 +Root-Is-Purelib: true +Tag: py3-none-any diff --git a/env/lib/python3.13/site-packages/httpx-0.28.1.dist-info/entry_points.txt b/env/lib/python3.13/site-packages/httpx-0.28.1.dist-info/entry_points.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ae96007f7d725813fd02dc1d06d3834ee1939e4 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx-0.28.1.dist-info/entry_points.txt @@ -0,0 +1,2 @@ +[console_scripts] +httpx = httpx:main diff --git a/env/lib/python3.13/site-packages/httpx/__init__.py b/env/lib/python3.13/site-packages/httpx/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e9addde071f81758baf350c4ab6bde2556340131 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/__init__.py @@ -0,0 +1,105 @@ +from .__version__ import __description__, __title__, __version__ +from ._api import * +from ._auth import * +from ._client import * +from ._config import * +from ._content import * +from ._exceptions import * +from ._models import * +from ._status_codes import * +from ._transports import * +from ._types import * +from ._urls import * + +try: + from ._main import main +except ImportError: # pragma: no cover + + def main() -> None: # type: ignore + import sys + + print( + "The httpx command line client could not run because the required " + "dependencies were not installed.\nMake sure you've installed " + "everything with: pip install 'httpx[cli]'" + ) + sys.exit(1) + + +__all__ = [ + "__description__", + "__title__", + "__version__", + "ASGITransport", + "AsyncBaseTransport", + "AsyncByteStream", + "AsyncClient", + "AsyncHTTPTransport", + "Auth", + "BaseTransport", + "BasicAuth", + "ByteStream", + "Client", + "CloseError", + "codes", + "ConnectError", + "ConnectTimeout", + "CookieConflict", + "Cookies", + "create_ssl_context", + "DecodingError", + "delete", + "DigestAuth", + "get", + "head", + "Headers", + "HTTPError", + "HTTPStatusError", + "HTTPTransport", + "InvalidURL", + "Limits", + "LocalProtocolError", + "main", + "MockTransport", + "NetRCAuth", + "NetworkError", + "options", + "patch", + "PoolTimeout", + "post", + "ProtocolError", + "Proxy", + "ProxyError", + "put", + "QueryParams", + "ReadError", + "ReadTimeout", + "RemoteProtocolError", + "request", + "Request", + "RequestError", + "RequestNotRead", + "Response", + "ResponseNotRead", + "stream", + "StreamClosed", + "StreamConsumed", + "StreamError", + "SyncByteStream", + "Timeout", + "TimeoutException", + "TooManyRedirects", + "TransportError", + "UnsupportedProtocol", + "URL", + "USE_CLIENT_DEFAULT", + "WriteError", + "WriteTimeout", + "WSGITransport", +] + 
+ +__locals = locals() +for __name in __all__: + if not __name.startswith("__"): + setattr(__locals[__name], "__module__", "httpx") # noqa diff --git a/env/lib/python3.13/site-packages/httpx/__version__.py b/env/lib/python3.13/site-packages/httpx/__version__.py new file mode 100644 index 0000000000000000000000000000000000000000..801bfacf671017cfbebf1ac26ec385daa02ed260 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/__version__.py @@ -0,0 +1,3 @@ +__title__ = "httpx" +__description__ = "A next generation HTTP client, for Python 3." +__version__ = "0.28.1" diff --git a/env/lib/python3.13/site-packages/httpx/_api.py b/env/lib/python3.13/site-packages/httpx/_api.py new file mode 100644 index 0000000000000000000000000000000000000000..c3cda1ecda8629edbdca2e3bc04bc51dba5e1430 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/_api.py @@ -0,0 +1,438 @@ +from __future__ import annotations + +import typing +from contextlib import contextmanager + +from ._client import Client +from ._config import DEFAULT_TIMEOUT_CONFIG +from ._models import Response +from ._types import ( + AuthTypes, + CookieTypes, + HeaderTypes, + ProxyTypes, + QueryParamTypes, + RequestContent, + RequestData, + RequestFiles, + TimeoutTypes, +) +from ._urls import URL + +if typing.TYPE_CHECKING: + import ssl # pragma: no cover + + +__all__ = [ + "delete", + "get", + "head", + "options", + "patch", + "post", + "put", + "request", + "stream", +] + + +def request( + method: str, + url: URL | str, + *, + params: QueryParamTypes | None = None, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | None = None, + proxy: ProxyTypes | None = None, + timeout: TimeoutTypes = DEFAULT_TIMEOUT_CONFIG, + follow_redirects: bool = False, + verify: ssl.SSLContext | str | bool = True, + trust_env: bool = True, +) -> Response: + """ + Sends an HTTP request. + + **Parameters:** + + * **method** - HTTP method for the new `Request` object: `GET`, `OPTIONS`, + `HEAD`, `POST`, `PUT`, `PATCH`, or `DELETE`. + * **url** - URL for the new `Request` object. + * **params** - *(optional)* Query parameters to include in the URL, as a + string, dictionary, or sequence of two-tuples. + * **content** - *(optional)* Binary content to include in the body of the + request, as bytes or a byte iterator. + * **data** - *(optional)* Form data to include in the body of the request, + as a dictionary. + * **files** - *(optional)* A dictionary of upload files to include in the + body of the request. + * **json** - *(optional)* A JSON serializable object to include in the body + of the request. + * **headers** - *(optional)* Dictionary of HTTP headers to include in the + request. + * **cookies** - *(optional)* Dictionary of Cookie items to include in the + request. + * **auth** - *(optional)* An authentication class to use when sending the + request. + * **proxy** - *(optional)* A proxy URL where all the traffic should be routed. + * **timeout** - *(optional)* The timeout configuration to use when sending + the request. + * **follow_redirects** - *(optional)* Enables or disables HTTP redirects. + * **verify** - *(optional)* Either `True` to use an SSL context with the + default CA bundle, `False` to disable verification, or an instance of + `ssl.SSLContext` to use a custom context. 
+ * **trust_env** - *(optional)* Enables or disables usage of environment + variables for configuration. + + **Returns:** `Response` + + Usage: + + ``` + >>> import httpx + >>> response = httpx.request('GET', 'https://httpbin.org/get') + >>> response + + ``` + """ + with Client( + cookies=cookies, + proxy=proxy, + verify=verify, + timeout=timeout, + trust_env=trust_env, + ) as client: + return client.request( + method=method, + url=url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + auth=auth, + follow_redirects=follow_redirects, + ) + + +@contextmanager +def stream( + method: str, + url: URL | str, + *, + params: QueryParamTypes | None = None, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | None = None, + proxy: ProxyTypes | None = None, + timeout: TimeoutTypes = DEFAULT_TIMEOUT_CONFIG, + follow_redirects: bool = False, + verify: ssl.SSLContext | str | bool = True, + trust_env: bool = True, +) -> typing.Iterator[Response]: + """ + Alternative to `httpx.request()` that streams the response body + instead of loading it into memory at once. + + **Parameters**: See `httpx.request`. + + See also: [Streaming Responses][0] + + [0]: /quickstart#streaming-responses + """ + with Client( + cookies=cookies, + proxy=proxy, + verify=verify, + timeout=timeout, + trust_env=trust_env, + ) as client: + with client.stream( + method=method, + url=url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + auth=auth, + follow_redirects=follow_redirects, + ) as response: + yield response + + +def get( + url: URL | str, + *, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | None = None, + proxy: ProxyTypes | None = None, + follow_redirects: bool = False, + verify: ssl.SSLContext | str | bool = True, + timeout: TimeoutTypes = DEFAULT_TIMEOUT_CONFIG, + trust_env: bool = True, +) -> Response: + """ + Sends a `GET` request. + + **Parameters**: See `httpx.request`. + + Note that the `data`, `files`, `json` and `content` parameters are not available + on this function, as `GET` requests should not include a request body. + """ + return request( + "GET", + url, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + proxy=proxy, + follow_redirects=follow_redirects, + verify=verify, + timeout=timeout, + trust_env=trust_env, + ) + + +def options( + url: URL | str, + *, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | None = None, + proxy: ProxyTypes | None = None, + follow_redirects: bool = False, + verify: ssl.SSLContext | str | bool = True, + timeout: TimeoutTypes = DEFAULT_TIMEOUT_CONFIG, + trust_env: bool = True, +) -> Response: + """ + Sends an `OPTIONS` request. + + **Parameters**: See `httpx.request`. + + Note that the `data`, `files`, `json` and `content` parameters are not available + on this function, as `OPTIONS` requests should not include a request body. 
+ """ + return request( + "OPTIONS", + url, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + proxy=proxy, + follow_redirects=follow_redirects, + verify=verify, + timeout=timeout, + trust_env=trust_env, + ) + + +def head( + url: URL | str, + *, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | None = None, + proxy: ProxyTypes | None = None, + follow_redirects: bool = False, + verify: ssl.SSLContext | str | bool = True, + timeout: TimeoutTypes = DEFAULT_TIMEOUT_CONFIG, + trust_env: bool = True, +) -> Response: + """ + Sends a `HEAD` request. + + **Parameters**: See `httpx.request`. + + Note that the `data`, `files`, `json` and `content` parameters are not available + on this function, as `HEAD` requests should not include a request body. + """ + return request( + "HEAD", + url, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + proxy=proxy, + follow_redirects=follow_redirects, + verify=verify, + timeout=timeout, + trust_env=trust_env, + ) + + +def post( + url: URL | str, + *, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | None = None, + proxy: ProxyTypes | None = None, + follow_redirects: bool = False, + verify: ssl.SSLContext | str | bool = True, + timeout: TimeoutTypes = DEFAULT_TIMEOUT_CONFIG, + trust_env: bool = True, +) -> Response: + """ + Sends a `POST` request. + + **Parameters**: See `httpx.request`. + """ + return request( + "POST", + url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + proxy=proxy, + follow_redirects=follow_redirects, + verify=verify, + timeout=timeout, + trust_env=trust_env, + ) + + +def put( + url: URL | str, + *, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | None = None, + proxy: ProxyTypes | None = None, + follow_redirects: bool = False, + verify: ssl.SSLContext | str | bool = True, + timeout: TimeoutTypes = DEFAULT_TIMEOUT_CONFIG, + trust_env: bool = True, +) -> Response: + """ + Sends a `PUT` request. + + **Parameters**: See `httpx.request`. + """ + return request( + "PUT", + url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + proxy=proxy, + follow_redirects=follow_redirects, + verify=verify, + timeout=timeout, + trust_env=trust_env, + ) + + +def patch( + url: URL | str, + *, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | None = None, + proxy: ProxyTypes | None = None, + follow_redirects: bool = False, + verify: ssl.SSLContext | str | bool = True, + timeout: TimeoutTypes = DEFAULT_TIMEOUT_CONFIG, + trust_env: bool = True, +) -> Response: + """ + Sends a `PATCH` request. + + **Parameters**: See `httpx.request`. 
+ """ + return request( + "PATCH", + url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + proxy=proxy, + follow_redirects=follow_redirects, + verify=verify, + timeout=timeout, + trust_env=trust_env, + ) + + +def delete( + url: URL | str, + *, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | None = None, + proxy: ProxyTypes | None = None, + follow_redirects: bool = False, + timeout: TimeoutTypes = DEFAULT_TIMEOUT_CONFIG, + verify: ssl.SSLContext | str | bool = True, + trust_env: bool = True, +) -> Response: + """ + Sends a `DELETE` request. + + **Parameters**: See `httpx.request`. + + Note that the `data`, `files`, `json` and `content` parameters are not available + on this function, as `DELETE` requests should not include a request body. + """ + return request( + "DELETE", + url, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + proxy=proxy, + follow_redirects=follow_redirects, + verify=verify, + timeout=timeout, + trust_env=trust_env, + ) diff --git a/env/lib/python3.13/site-packages/httpx/_auth.py b/env/lib/python3.13/site-packages/httpx/_auth.py new file mode 100644 index 0000000000000000000000000000000000000000..b03971ab4b311d60790dc22ca24d9966426ec0a4 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/_auth.py @@ -0,0 +1,348 @@ +from __future__ import annotations + +import hashlib +import os +import re +import time +import typing +from base64 import b64encode +from urllib.request import parse_http_list + +from ._exceptions import ProtocolError +from ._models import Cookies, Request, Response +from ._utils import to_bytes, to_str, unquote + +if typing.TYPE_CHECKING: # pragma: no cover + from hashlib import _Hash + + +__all__ = ["Auth", "BasicAuth", "DigestAuth", "NetRCAuth"] + + +class Auth: + """ + Base class for all authentication schemes. + + To implement a custom authentication scheme, subclass `Auth` and override + the `.auth_flow()` method. + + If the authentication scheme does I/O such as disk access or network calls, or uses + synchronization primitives such as locks, you should override `.sync_auth_flow()` + and/or `.async_auth_flow()` instead of `.auth_flow()` to provide specialized + implementations that will be used by `Client` and `AsyncClient` respectively. + """ + + requires_request_body = False + requires_response_body = False + + def auth_flow(self, request: Request) -> typing.Generator[Request, Response, None]: + """ + Execute the authentication flow. + + To dispatch a request, `yield` it: + + ``` + yield request + ``` + + The client will `.send()` the response back into the flow generator. You can + access it like so: + + ``` + response = yield request + ``` + + A `return` (or reaching the end of the generator) will result in the + client returning the last response obtained from the server. + + You can dispatch as many requests as is necessary. + """ + yield request + + def sync_auth_flow( + self, request: Request + ) -> typing.Generator[Request, Response, None]: + """ + Execute the authentication flow synchronously. + + By default, this defers to `.auth_flow()`. You should override this method + when the authentication scheme does I/O and/or uses concurrency primitives. 
+ """ + if self.requires_request_body: + request.read() + + flow = self.auth_flow(request) + request = next(flow) + + while True: + response = yield request + if self.requires_response_body: + response.read() + + try: + request = flow.send(response) + except StopIteration: + break + + async def async_auth_flow( + self, request: Request + ) -> typing.AsyncGenerator[Request, Response]: + """ + Execute the authentication flow asynchronously. + + By default, this defers to `.auth_flow()`. You should override this method + when the authentication scheme does I/O and/or uses concurrency primitives. + """ + if self.requires_request_body: + await request.aread() + + flow = self.auth_flow(request) + request = next(flow) + + while True: + response = yield request + if self.requires_response_body: + await response.aread() + + try: + request = flow.send(response) + except StopIteration: + break + + +class FunctionAuth(Auth): + """ + Allows the 'auth' argument to be passed as a simple callable function, + that takes the request, and returns a new, modified request. + """ + + def __init__(self, func: typing.Callable[[Request], Request]) -> None: + self._func = func + + def auth_flow(self, request: Request) -> typing.Generator[Request, Response, None]: + yield self._func(request) + + +class BasicAuth(Auth): + """ + Allows the 'auth' argument to be passed as a (username, password) pair, + and uses HTTP Basic authentication. + """ + + def __init__(self, username: str | bytes, password: str | bytes) -> None: + self._auth_header = self._build_auth_header(username, password) + + def auth_flow(self, request: Request) -> typing.Generator[Request, Response, None]: + request.headers["Authorization"] = self._auth_header + yield request + + def _build_auth_header(self, username: str | bytes, password: str | bytes) -> str: + userpass = b":".join((to_bytes(username), to_bytes(password))) + token = b64encode(userpass).decode() + return f"Basic {token}" + + +class NetRCAuth(Auth): + """ + Use a 'netrc' file to lookup basic auth credentials based on the url host. + """ + + def __init__(self, file: str | None = None) -> None: + # Lazily import 'netrc'. + # There's no need for us to load this module unless 'NetRCAuth' is being used. + import netrc + + self._netrc_info = netrc.netrc(file) + + def auth_flow(self, request: Request) -> typing.Generator[Request, Response, None]: + auth_info = self._netrc_info.authenticators(request.url.host) + if auth_info is None or not auth_info[2]: + # The netrc file did not have authentication credentials for this host. + yield request + else: + # Build a basic auth header with credentials from the netrc file. 
+ request.headers["Authorization"] = self._build_auth_header( + username=auth_info[0], password=auth_info[2] + ) + yield request + + def _build_auth_header(self, username: str | bytes, password: str | bytes) -> str: + userpass = b":".join((to_bytes(username), to_bytes(password))) + token = b64encode(userpass).decode() + return f"Basic {token}" + + +class DigestAuth(Auth): + _ALGORITHM_TO_HASH_FUNCTION: dict[str, typing.Callable[[bytes], _Hash]] = { + "MD5": hashlib.md5, + "MD5-SESS": hashlib.md5, + "SHA": hashlib.sha1, + "SHA-SESS": hashlib.sha1, + "SHA-256": hashlib.sha256, + "SHA-256-SESS": hashlib.sha256, + "SHA-512": hashlib.sha512, + "SHA-512-SESS": hashlib.sha512, + } + + def __init__(self, username: str | bytes, password: str | bytes) -> None: + self._username = to_bytes(username) + self._password = to_bytes(password) + self._last_challenge: _DigestAuthChallenge | None = None + self._nonce_count = 1 + + def auth_flow(self, request: Request) -> typing.Generator[Request, Response, None]: + if self._last_challenge: + request.headers["Authorization"] = self._build_auth_header( + request, self._last_challenge + ) + + response = yield request + + if response.status_code != 401 or "www-authenticate" not in response.headers: + # If the response is not a 401 then we don't + # need to build an authenticated request. + return + + for auth_header in response.headers.get_list("www-authenticate"): + if auth_header.lower().startswith("digest "): + break + else: + # If the response does not include a 'WWW-Authenticate: Digest ...' + # header, then we don't need to build an authenticated request. + return + + self._last_challenge = self._parse_challenge(request, response, auth_header) + self._nonce_count = 1 + + request.headers["Authorization"] = self._build_auth_header( + request, self._last_challenge + ) + if response.cookies: + Cookies(response.cookies).set_cookie_header(request=request) + yield request + + def _parse_challenge( + self, request: Request, response: Response, auth_header: str + ) -> _DigestAuthChallenge: + """ + Returns a challenge from a Digest WWW-Authenticate header. + These take the form of: + `Digest realm="realm@host.com",qop="auth,auth-int",nonce="abc",opaque="xyz"` + """ + scheme, _, fields = auth_header.partition(" ") + + # This method should only ever have been called with a Digest auth header. 
+ assert scheme.lower() == "digest" + + header_dict: dict[str, str] = {} + for field in parse_http_list(fields): + key, value = field.strip().split("=", 1) + header_dict[key] = unquote(value) + + try: + realm = header_dict["realm"].encode() + nonce = header_dict["nonce"].encode() + algorithm = header_dict.get("algorithm", "MD5") + opaque = header_dict["opaque"].encode() if "opaque" in header_dict else None + qop = header_dict["qop"].encode() if "qop" in header_dict else None + return _DigestAuthChallenge( + realm=realm, nonce=nonce, algorithm=algorithm, opaque=opaque, qop=qop + ) + except KeyError as exc: + message = "Malformed Digest WWW-Authenticate header" + raise ProtocolError(message, request=request) from exc + + def _build_auth_header( + self, request: Request, challenge: _DigestAuthChallenge + ) -> str: + hash_func = self._ALGORITHM_TO_HASH_FUNCTION[challenge.algorithm.upper()] + + def digest(data: bytes) -> bytes: + return hash_func(data).hexdigest().encode() + + A1 = b":".join((self._username, challenge.realm, self._password)) + + path = request.url.raw_path + A2 = b":".join((request.method.encode(), path)) + # TODO: implement auth-int + HA2 = digest(A2) + + nc_value = b"%08x" % self._nonce_count + cnonce = self._get_client_nonce(self._nonce_count, challenge.nonce) + self._nonce_count += 1 + + HA1 = digest(A1) + if challenge.algorithm.lower().endswith("-sess"): + HA1 = digest(b":".join((HA1, challenge.nonce, cnonce))) + + qop = self._resolve_qop(challenge.qop, request=request) + if qop is None: + # Following RFC 2069 + digest_data = [HA1, challenge.nonce, HA2] + else: + # Following RFC 2617/7616 + digest_data = [HA1, challenge.nonce, nc_value, cnonce, qop, HA2] + + format_args = { + "username": self._username, + "realm": challenge.realm, + "nonce": challenge.nonce, + "uri": path, + "response": digest(b":".join(digest_data)), + "algorithm": challenge.algorithm.encode(), + } + if challenge.opaque: + format_args["opaque"] = challenge.opaque + if qop: + format_args["qop"] = b"auth" + format_args["nc"] = nc_value + format_args["cnonce"] = cnonce + + return "Digest " + self._get_header_value(format_args) + + def _get_client_nonce(self, nonce_count: int, nonce: bytes) -> bytes: + s = str(nonce_count).encode() + s += nonce + s += time.ctime().encode() + s += os.urandom(8) + + return hashlib.sha1(s).hexdigest()[:16].encode() + + def _get_header_value(self, header_fields: dict[str, bytes]) -> str: + NON_QUOTED_FIELDS = ("algorithm", "qop", "nc") + QUOTED_TEMPLATE = '{}="{}"' + NON_QUOTED_TEMPLATE = "{}={}" + + header_value = "" + for i, (field, value) in enumerate(header_fields.items()): + if i > 0: + header_value += ", " + template = ( + QUOTED_TEMPLATE + if field not in NON_QUOTED_FIELDS + else NON_QUOTED_TEMPLATE + ) + header_value += template.format(field, to_str(value)) + + return header_value + + def _resolve_qop(self, qop: bytes | None, request: Request) -> bytes | None: + if qop is None: + return None + qops = re.split(b", ?", qop) + if b"auth" in qops: + return b"auth" + + if qops == [b"auth-int"]: + raise NotImplementedError("Digest auth-int support is not yet implemented") + + message = f'Unexpected qop value "{qop!r}" in digest auth' + raise ProtocolError(message, request=request) + + +class _DigestAuthChallenge(typing.NamedTuple): + realm: bytes + nonce: bytes + algorithm: str + opaque: bytes | None + qop: bytes | None diff --git a/env/lib/python3.13/site-packages/httpx/_client.py b/env/lib/python3.13/site-packages/httpx/_client.py new file mode 100644 index 
0000000000000000000000000000000000000000..2249231f8c3b912c731ff160344d3672e2f11738 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/_client.py @@ -0,0 +1,2019 @@ +from __future__ import annotations + +import datetime +import enum +import logging +import time +import typing +import warnings +from contextlib import asynccontextmanager, contextmanager +from types import TracebackType + +from .__version__ import __version__ +from ._auth import Auth, BasicAuth, FunctionAuth +from ._config import ( + DEFAULT_LIMITS, + DEFAULT_MAX_REDIRECTS, + DEFAULT_TIMEOUT_CONFIG, + Limits, + Proxy, + Timeout, +) +from ._decoders import SUPPORTED_DECODERS +from ._exceptions import ( + InvalidURL, + RemoteProtocolError, + TooManyRedirects, + request_context, +) +from ._models import Cookies, Headers, Request, Response +from ._status_codes import codes +from ._transports.base import AsyncBaseTransport, BaseTransport +from ._transports.default import AsyncHTTPTransport, HTTPTransport +from ._types import ( + AsyncByteStream, + AuthTypes, + CertTypes, + CookieTypes, + HeaderTypes, + ProxyTypes, + QueryParamTypes, + RequestContent, + RequestData, + RequestExtensions, + RequestFiles, + SyncByteStream, + TimeoutTypes, +) +from ._urls import URL, QueryParams +from ._utils import URLPattern, get_environment_proxies + +if typing.TYPE_CHECKING: + import ssl # pragma: no cover + +__all__ = ["USE_CLIENT_DEFAULT", "AsyncClient", "Client"] + +# The type annotation for @classmethod and context managers here follows PEP 484 +# https://www.python.org/dev/peps/pep-0484/#annotating-instance-and-class-methods +T = typing.TypeVar("T", bound="Client") +U = typing.TypeVar("U", bound="AsyncClient") + + +def _is_https_redirect(url: URL, location: URL) -> bool: + """ + Return 'True' if 'location' is a HTTPS upgrade of 'url' + """ + if url.host != location.host: + return False + + return ( + url.scheme == "http" + and _port_or_default(url) == 80 + and location.scheme == "https" + and _port_or_default(location) == 443 + ) + + +def _port_or_default(url: URL) -> int | None: + if url.port is not None: + return url.port + return {"http": 80, "https": 443}.get(url.scheme) + + +def _same_origin(url: URL, other: URL) -> bool: + """ + Return 'True' if the given URLs share the same origin. + """ + return ( + url.scheme == other.scheme + and url.host == other.host + and _port_or_default(url) == _port_or_default(other) + ) + + +class UseClientDefault: + """ + For some parameters such as `auth=...` and `timeout=...` we need to be able + to indicate the default "unset" state, in a way that is distinctly different + to using `None`. + + The default "unset" state indicates that whatever default is set on the + client should be used. This is different to setting `None`, which + explicitly disables the parameter, possibly overriding a client default. + + For example we use `timeout=USE_CLIENT_DEFAULT` in the `request()` signature. + Omitting the `timeout` parameter will send a request using whatever default + timeout has been configured on the client. Including `timeout=None` will + ensure no timeout is used. + + Note that user code shouldn't need to use the `USE_CLIENT_DEFAULT` constant, + but it is used internally when a parameter is not included. 
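The distinction drawn above between "unset" and `None` is visible at the call site: omitting `timeout` falls back to whatever the client was configured with, while passing `timeout=None` disables the timeout for that request. A small sketch; the timeout values are illustrative:

```python
import httpx

client = httpx.Client(timeout=5.0)

# Left as USE_CLIENT_DEFAULT internally, so the client's 5 second timeout applies.
r1 = client.get("https://example.org")

# Explicitly disables timeouts for this one request.
r2 = client.get("https://example.org", timeout=None)

# Overrides the client default for this one request.
r3 = client.get("https://example.org", timeout=httpx.Timeout(1.0))
```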
+ """ + + +USE_CLIENT_DEFAULT = UseClientDefault() + + +logger = logging.getLogger("httpx") + +USER_AGENT = f"python-httpx/{__version__}" +ACCEPT_ENCODING = ", ".join( + [key for key in SUPPORTED_DECODERS.keys() if key != "identity"] +) + + +class ClientState(enum.Enum): + # UNOPENED: + # The client has been instantiated, but has not been used to send a request, + # or been opened by entering the context of a `with` block. + UNOPENED = 1 + # OPENED: + # The client has either sent a request, or is within a `with` block. + OPENED = 2 + # CLOSED: + # The client has either exited the `with` block, or `close()` has + # been called explicitly. + CLOSED = 3 + + +class BoundSyncStream(SyncByteStream): + """ + A byte stream that is bound to a given response instance, and that + ensures the `response.elapsed` is set once the response is closed. + """ + + def __init__( + self, stream: SyncByteStream, response: Response, start: float + ) -> None: + self._stream = stream + self._response = response + self._start = start + + def __iter__(self) -> typing.Iterator[bytes]: + for chunk in self._stream: + yield chunk + + def close(self) -> None: + elapsed = time.perf_counter() - self._start + self._response.elapsed = datetime.timedelta(seconds=elapsed) + self._stream.close() + + +class BoundAsyncStream(AsyncByteStream): + """ + An async byte stream that is bound to a given response instance, and that + ensures the `response.elapsed` is set once the response is closed. + """ + + def __init__( + self, stream: AsyncByteStream, response: Response, start: float + ) -> None: + self._stream = stream + self._response = response + self._start = start + + async def __aiter__(self) -> typing.AsyncIterator[bytes]: + async for chunk in self._stream: + yield chunk + + async def aclose(self) -> None: + elapsed = time.perf_counter() - self._start + self._response.elapsed = datetime.timedelta(seconds=elapsed) + await self._stream.aclose() + + +EventHook = typing.Callable[..., typing.Any] + + +class BaseClient: + def __init__( + self, + *, + auth: AuthTypes | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + timeout: TimeoutTypes = DEFAULT_TIMEOUT_CONFIG, + follow_redirects: bool = False, + max_redirects: int = DEFAULT_MAX_REDIRECTS, + event_hooks: None | (typing.Mapping[str, list[EventHook]]) = None, + base_url: URL | str = "", + trust_env: bool = True, + default_encoding: str | typing.Callable[[bytes], str] = "utf-8", + ) -> None: + event_hooks = {} if event_hooks is None else event_hooks + + self._base_url = self._enforce_trailing_slash(URL(base_url)) + + self._auth = self._build_auth(auth) + self._params = QueryParams(params) + self.headers = Headers(headers) + self._cookies = Cookies(cookies) + self._timeout = Timeout(timeout) + self.follow_redirects = follow_redirects + self.max_redirects = max_redirects + self._event_hooks = { + "request": list(event_hooks.get("request", [])), + "response": list(event_hooks.get("response", [])), + } + self._trust_env = trust_env + self._default_encoding = default_encoding + self._state = ClientState.UNOPENED + + @property + def is_closed(self) -> bool: + """ + Check if the client being closed + """ + return self._state == ClientState.CLOSED + + @property + def trust_env(self) -> bool: + return self._trust_env + + def _enforce_trailing_slash(self, url: URL) -> URL: + if url.raw_path.endswith(b"/"): + return url + return url.copy_with(raw_path=url.raw_path + b"/") + + def _get_proxy_map( + self, proxy: 
ProxyTypes | None, allow_env_proxies: bool + ) -> dict[str, Proxy | None]: + if proxy is None: + if allow_env_proxies: + return { + key: None if url is None else Proxy(url=url) + for key, url in get_environment_proxies().items() + } + return {} + else: + proxy = Proxy(url=proxy) if isinstance(proxy, (str, URL)) else proxy + return {"all://": proxy} + + @property + def timeout(self) -> Timeout: + return self._timeout + + @timeout.setter + def timeout(self, timeout: TimeoutTypes) -> None: + self._timeout = Timeout(timeout) + + @property + def event_hooks(self) -> dict[str, list[EventHook]]: + return self._event_hooks + + @event_hooks.setter + def event_hooks(self, event_hooks: dict[str, list[EventHook]]) -> None: + self._event_hooks = { + "request": list(event_hooks.get("request", [])), + "response": list(event_hooks.get("response", [])), + } + + @property + def auth(self) -> Auth | None: + """ + Authentication class used when none is passed at the request-level. + + See also [Authentication][0]. + + [0]: /quickstart/#authentication + """ + return self._auth + + @auth.setter + def auth(self, auth: AuthTypes) -> None: + self._auth = self._build_auth(auth) + + @property + def base_url(self) -> URL: + """ + Base URL to use when sending requests with relative URLs. + """ + return self._base_url + + @base_url.setter + def base_url(self, url: URL | str) -> None: + self._base_url = self._enforce_trailing_slash(URL(url)) + + @property + def headers(self) -> Headers: + """ + HTTP headers to include when sending requests. + """ + return self._headers + + @headers.setter + def headers(self, headers: HeaderTypes) -> None: + client_headers = Headers( + { + b"Accept": b"*/*", + b"Accept-Encoding": ACCEPT_ENCODING.encode("ascii"), + b"Connection": b"keep-alive", + b"User-Agent": USER_AGENT.encode("ascii"), + } + ) + client_headers.update(headers) + self._headers = client_headers + + @property + def cookies(self) -> Cookies: + """ + Cookie values to include when sending requests. + """ + return self._cookies + + @cookies.setter + def cookies(self, cookies: CookieTypes) -> None: + self._cookies = Cookies(cookies) + + @property + def params(self) -> QueryParams: + """ + Query parameters to include in the URL when sending requests. + """ + return self._params + + @params.setter + def params(self, params: QueryParamTypes) -> None: + self._params = QueryParams(params) + + def build_request( + self, + method: str, + url: URL | str, + *, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Request: + """ + Build and return a request instance. + + * The `params`, `headers` and `cookies` arguments + are merged with any values set on the client. + * The `url` argument is merged with any `base_url` set on the client. 
+ + See also: [Request instances][0] + + [0]: /advanced/clients/#request-instances + """ + url = self._merge_url(url) + headers = self._merge_headers(headers) + cookies = self._merge_cookies(cookies) + params = self._merge_queryparams(params) + extensions = {} if extensions is None else extensions + if "timeout" not in extensions: + timeout = ( + self.timeout + if isinstance(timeout, UseClientDefault) + else Timeout(timeout) + ) + extensions = dict(**extensions, timeout=timeout.as_dict()) + return Request( + method, + url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + cookies=cookies, + extensions=extensions, + ) + + def _merge_url(self, url: URL | str) -> URL: + """ + Merge a URL argument together with any 'base_url' on the client, + to create the URL used for the outgoing request. + """ + merge_url = URL(url) + if merge_url.is_relative_url: + # To merge URLs we always append to the base URL. To get this + # behaviour correct we always ensure the base URL ends in a '/' + # separator, and strip any leading '/' from the merge URL. + # + # So, eg... + # + # >>> client = Client(base_url="https://www.example.com/subpath") + # >>> client.base_url + # URL('https://www.example.com/subpath/') + # >>> client.build_request("GET", "/path").url + # URL('https://www.example.com/subpath/path') + merge_raw_path = self.base_url.raw_path + merge_url.raw_path.lstrip(b"/") + return self.base_url.copy_with(raw_path=merge_raw_path) + return merge_url + + def _merge_cookies(self, cookies: CookieTypes | None = None) -> CookieTypes | None: + """ + Merge a cookies argument together with any cookies on the client, + to create the cookies used for the outgoing request. + """ + if cookies or self.cookies: + merged_cookies = Cookies(self.cookies) + merged_cookies.update(cookies) + return merged_cookies + return cookies + + def _merge_headers(self, headers: HeaderTypes | None = None) -> HeaderTypes | None: + """ + Merge a headers argument together with any headers on the client, + to create the headers used for the outgoing request. + """ + merged_headers = Headers(self.headers) + merged_headers.update(headers) + return merged_headers + + def _merge_queryparams( + self, params: QueryParamTypes | None = None + ) -> QueryParamTypes | None: + """ + Merge a queryparams argument together with any queryparams on the client, + to create the queryparams used for the outgoing request. 
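The `_merge_url()` comment above spells out the rule: the base URL is normalised to end in `/`, the request path has any leading `/` stripped, and the two are joined. A quick sketch of the observable behaviour via `build_request()`:

```python
import httpx

client = httpx.Client(base_url="https://www.example.com/subpath")

# The trailing slash is enforced on the base URL...
print(client.base_url)                             # https://www.example.com/subpath/

# ...and relative request URLs are appended beneath it.
print(client.build_request("GET", "/path").url)    # https://www.example.com/subpath/path
```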
+ """ + if params or self.params: + merged_queryparams = QueryParams(self.params) + return merged_queryparams.merge(params) + return params + + def _build_auth(self, auth: AuthTypes | None) -> Auth | None: + if auth is None: + return None + elif isinstance(auth, tuple): + return BasicAuth(username=auth[0], password=auth[1]) + elif isinstance(auth, Auth): + return auth + elif callable(auth): + return FunctionAuth(func=auth) + else: + raise TypeError(f'Invalid "auth" argument: {auth!r}') + + def _build_request_auth( + self, + request: Request, + auth: AuthTypes | UseClientDefault | None = USE_CLIENT_DEFAULT, + ) -> Auth: + auth = ( + self._auth if isinstance(auth, UseClientDefault) else self._build_auth(auth) + ) + + if auth is not None: + return auth + + username, password = request.url.username, request.url.password + if username or password: + return BasicAuth(username=username, password=password) + + return Auth() + + def _build_redirect_request(self, request: Request, response: Response) -> Request: + """ + Given a request and a redirect response, return a new request that + should be used to effect the redirect. + """ + method = self._redirect_method(request, response) + url = self._redirect_url(request, response) + headers = self._redirect_headers(request, url, method) + stream = self._redirect_stream(request, method) + cookies = Cookies(self.cookies) + return Request( + method=method, + url=url, + headers=headers, + cookies=cookies, + stream=stream, + extensions=request.extensions, + ) + + def _redirect_method(self, request: Request, response: Response) -> str: + """ + When being redirected we may want to change the method of the request + based on certain specs or browser behavior. + """ + method = request.method + + # https://tools.ietf.org/html/rfc7231#section-6.4.4 + if response.status_code == codes.SEE_OTHER and method != "HEAD": + method = "GET" + + # Do what the browsers do, despite standards... + # Turn 302s into GETs. + if response.status_code == codes.FOUND and method != "HEAD": + method = "GET" + + # If a POST is responded to with a 301, turn it into a GET. + # This bizarre behaviour is explained in 'requests' issue 1704. + if response.status_code == codes.MOVED_PERMANENTLY and method == "POST": + method = "GET" + + return method + + def _redirect_url(self, request: Request, response: Response) -> URL: + """ + Return the URL for the redirect to follow. + """ + location = response.headers["Location"] + + try: + url = URL(location) + except InvalidURL as exc: + raise RemoteProtocolError( + f"Invalid URL in location header: {exc}.", request=request + ) from None + + # Handle malformed 'Location' headers that are "absolute" form, have no host. + # See: https://github.com/encode/httpx/issues/771 + if url.scheme and not url.host: + url = url.copy_with(host=request.url.host) + + # Facilitate relative 'Location' headers, as allowed by RFC 7231. + # (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource') + if url.is_relative_url: + url = request.url.join(url) + + # Attach previous fragment if needed (RFC 7231 7.1.2) + if request.url.fragment and not url.fragment: + url = url.copy_with(fragment=request.url.fragment) + + return url + + def _redirect_headers(self, request: Request, url: URL, method: str) -> Headers: + """ + Return the headers that should be used for the redirect request. 
+ """ + headers = Headers(request.headers) + + if not _same_origin(url, request.url): + if not _is_https_redirect(request.url, url): + # Strip Authorization headers when responses are redirected + # away from the origin. (Except for direct HTTP to HTTPS redirects.) + headers.pop("Authorization", None) + + # Update the Host header. + headers["Host"] = url.netloc.decode("ascii") + + if method != request.method and method == "GET": + # If we've switch to a 'GET' request, then strip any headers which + # are only relevant to the request body. + headers.pop("Content-Length", None) + headers.pop("Transfer-Encoding", None) + + # We should use the client cookie store to determine any cookie header, + # rather than whatever was on the original outgoing request. + headers.pop("Cookie", None) + + return headers + + def _redirect_stream( + self, request: Request, method: str + ) -> SyncByteStream | AsyncByteStream | None: + """ + Return the body that should be used for the redirect request. + """ + if method != request.method and method == "GET": + return None + + return request.stream + + def _set_timeout(self, request: Request) -> None: + if "timeout" not in request.extensions: + timeout = ( + self.timeout + if isinstance(self.timeout, UseClientDefault) + else Timeout(self.timeout) + ) + request.extensions = dict(**request.extensions, timeout=timeout.as_dict()) + + +class Client(BaseClient): + """ + An HTTP client, with connection pooling, HTTP/2, redirects, cookie persistence, etc. + + It can be shared between threads. + + Usage: + + ```python + >>> client = httpx.Client() + >>> response = client.get('https://example.org') + ``` + + **Parameters:** + + * **auth** - *(optional)* An authentication class to use when sending + requests. + * **params** - *(optional)* Query parameters to include in request URLs, as + a string, dictionary, or sequence of two-tuples. + * **headers** - *(optional)* Dictionary of HTTP headers to include when + sending requests. + * **cookies** - *(optional)* Dictionary of Cookie items to include when + sending requests. + * **verify** - *(optional)* Either `True` to use an SSL context with the + default CA bundle, `False` to disable verification, or an instance of + `ssl.SSLContext` to use a custom context. + * **http2** - *(optional)* A boolean indicating if HTTP/2 support should be + enabled. Defaults to `False`. + * **proxy** - *(optional)* A proxy URL where all the traffic should be routed. + * **timeout** - *(optional)* The timeout configuration to use when sending + requests. + * **limits** - *(optional)* The limits configuration to use. + * **max_redirects** - *(optional)* The maximum number of redirect responses + that should be followed. + * **base_url** - *(optional)* A URL to use as the base when building + request URLs. + * **transport** - *(optional)* A transport class to use for sending requests + over the network. + * **trust_env** - *(optional)* Enables or disables usage of environment + variables for configuration. + * **default_encoding** - *(optional)* The default encoding to use for decoding + response text, if no charset information is included in a response Content-Type + header. Set to a callable for automatic character set detection. Default: "utf-8". 
+ """ + + def __init__( + self, + *, + auth: AuthTypes | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + verify: ssl.SSLContext | str | bool = True, + cert: CertTypes | None = None, + trust_env: bool = True, + http1: bool = True, + http2: bool = False, + proxy: ProxyTypes | None = None, + mounts: None | (typing.Mapping[str, BaseTransport | None]) = None, + timeout: TimeoutTypes = DEFAULT_TIMEOUT_CONFIG, + follow_redirects: bool = False, + limits: Limits = DEFAULT_LIMITS, + max_redirects: int = DEFAULT_MAX_REDIRECTS, + event_hooks: None | (typing.Mapping[str, list[EventHook]]) = None, + base_url: URL | str = "", + transport: BaseTransport | None = None, + default_encoding: str | typing.Callable[[bytes], str] = "utf-8", + ) -> None: + super().__init__( + auth=auth, + params=params, + headers=headers, + cookies=cookies, + timeout=timeout, + follow_redirects=follow_redirects, + max_redirects=max_redirects, + event_hooks=event_hooks, + base_url=base_url, + trust_env=trust_env, + default_encoding=default_encoding, + ) + + if http2: + try: + import h2 # noqa + except ImportError: # pragma: no cover + raise ImportError( + "Using http2=True, but the 'h2' package is not installed. " + "Make sure to install httpx using `pip install httpx[http2]`." + ) from None + + allow_env_proxies = trust_env and transport is None + proxy_map = self._get_proxy_map(proxy, allow_env_proxies) + + self._transport = self._init_transport( + verify=verify, + cert=cert, + trust_env=trust_env, + http1=http1, + http2=http2, + limits=limits, + transport=transport, + ) + self._mounts: dict[URLPattern, BaseTransport | None] = { + URLPattern(key): None + if proxy is None + else self._init_proxy_transport( + proxy, + verify=verify, + cert=cert, + trust_env=trust_env, + http1=http1, + http2=http2, + limits=limits, + ) + for key, proxy in proxy_map.items() + } + if mounts is not None: + self._mounts.update( + {URLPattern(key): transport for key, transport in mounts.items()} + ) + + self._mounts = dict(sorted(self._mounts.items())) + + def _init_transport( + self, + verify: ssl.SSLContext | str | bool = True, + cert: CertTypes | None = None, + trust_env: bool = True, + http1: bool = True, + http2: bool = False, + limits: Limits = DEFAULT_LIMITS, + transport: BaseTransport | None = None, + ) -> BaseTransport: + if transport is not None: + return transport + + return HTTPTransport( + verify=verify, + cert=cert, + trust_env=trust_env, + http1=http1, + http2=http2, + limits=limits, + ) + + def _init_proxy_transport( + self, + proxy: Proxy, + verify: ssl.SSLContext | str | bool = True, + cert: CertTypes | None = None, + trust_env: bool = True, + http1: bool = True, + http2: bool = False, + limits: Limits = DEFAULT_LIMITS, + ) -> BaseTransport: + return HTTPTransport( + verify=verify, + cert=cert, + trust_env=trust_env, + http1=http1, + http2=http2, + limits=limits, + proxy=proxy, + ) + + def _transport_for_url(self, url: URL) -> BaseTransport: + """ + Returns the transport instance that should be used for a given URL. + This will either be the standard connection pool, or a proxy. 
+ """ + for pattern, transport in self._mounts.items(): + if pattern.matches(url): + return self._transport if transport is None else transport + + return self._transport + + def request( + self, + method: str, + url: URL | str, + *, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault | None = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Build and send a request. + + Equivalent to: + + ```python + request = client.build_request(...) + response = client.send(request, ...) + ``` + + See `Client.build_request()`, `Client.send()` and + [Merging of configuration][0] for how the various parameters + are merged with client-level configuration. + + [0]: /advanced/clients/#merging-of-configuration + """ + if cookies is not None: + message = ( + "Setting per-request cookies=<...> is being deprecated, because " + "the expected behaviour on cookie persistence is ambiguous. Set " + "cookies directly on the client instance instead." + ) + warnings.warn(message, DeprecationWarning, stacklevel=2) + + request = self.build_request( + method=method, + url=url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + cookies=cookies, + timeout=timeout, + extensions=extensions, + ) + return self.send(request, auth=auth, follow_redirects=follow_redirects) + + @contextmanager + def stream( + self, + method: str, + url: URL | str, + *, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault | None = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> typing.Iterator[Response]: + """ + Alternative to `httpx.request()` that streams the response body + instead of loading it into memory at once. + + **Parameters**: See `httpx.request`. + + See also: [Streaming Responses][0] + + [0]: /quickstart#streaming-responses + """ + request = self.build_request( + method=method, + url=url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + cookies=cookies, + timeout=timeout, + extensions=extensions, + ) + response = self.send( + request=request, + auth=auth, + follow_redirects=follow_redirects, + stream=True, + ) + try: + yield response + finally: + response.close() + + def send( + self, + request: Request, + *, + stream: bool = False, + auth: AuthTypes | UseClientDefault | None = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + ) -> Response: + """ + Send a request. + + The request is sent as-is, unmodified. + + Typically you'll want to build one with `Client.build_request()` + so that any client-level configuration is merged into the request, + but passing an explicit `httpx.Request()` is supported as well. 
+ + See also: [Request instances][0] + + [0]: /advanced/clients/#request-instances + """ + if self._state == ClientState.CLOSED: + raise RuntimeError("Cannot send a request, as the client has been closed.") + + self._state = ClientState.OPENED + follow_redirects = ( + self.follow_redirects + if isinstance(follow_redirects, UseClientDefault) + else follow_redirects + ) + + self._set_timeout(request) + + auth = self._build_request_auth(request, auth) + + response = self._send_handling_auth( + request, + auth=auth, + follow_redirects=follow_redirects, + history=[], + ) + try: + if not stream: + response.read() + + return response + + except BaseException as exc: + response.close() + raise exc + + def _send_handling_auth( + self, + request: Request, + auth: Auth, + follow_redirects: bool, + history: list[Response], + ) -> Response: + auth_flow = auth.sync_auth_flow(request) + try: + request = next(auth_flow) + + while True: + response = self._send_handling_redirects( + request, + follow_redirects=follow_redirects, + history=history, + ) + try: + try: + next_request = auth_flow.send(response) + except StopIteration: + return response + + response.history = list(history) + response.read() + request = next_request + history.append(response) + + except BaseException as exc: + response.close() + raise exc + finally: + auth_flow.close() + + def _send_handling_redirects( + self, + request: Request, + follow_redirects: bool, + history: list[Response], + ) -> Response: + while True: + if len(history) > self.max_redirects: + raise TooManyRedirects( + "Exceeded maximum allowed redirects.", request=request + ) + + for hook in self._event_hooks["request"]: + hook(request) + + response = self._send_single_request(request) + try: + for hook in self._event_hooks["response"]: + hook(response) + response.history = list(history) + + if not response.has_redirect_location: + return response + + request = self._build_redirect_request(request, response) + history = history + [response] + + if follow_redirects: + response.read() + else: + response.next_request = request + return response + + except BaseException as exc: + response.close() + raise exc + + def _send_single_request(self, request: Request) -> Response: + """ + Sends a single request, without handling any redirections. + """ + transport = self._transport_for_url(request.url) + start = time.perf_counter() + + if not isinstance(request.stream, SyncByteStream): + raise RuntimeError( + "Attempted to send an async request with a sync Client instance." + ) + + with request_context(request=request): + response = transport.handle_request(request) + + assert isinstance(response.stream, SyncByteStream) + + response.request = request + response.stream = BoundSyncStream( + response.stream, response=response, start=start + ) + self.cookies.extract_cookies(response) + response.default_encoding = self._default_encoding + + logger.info( + 'HTTP Request: %s %s "%s %d %s"', + request.method, + request.url, + response.http_version, + response.status_code, + response.reason_phrase, + ) + + return response + + def get( + self, + url: URL | str, + *, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault | None = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Send a `GET` request. 
+ + **Parameters**: See `httpx.request`. + """ + return self.request( + "GET", + url, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + follow_redirects=follow_redirects, + timeout=timeout, + extensions=extensions, + ) + + def options( + self, + url: URL | str, + *, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Send an `OPTIONS` request. + + **Parameters**: See `httpx.request`. + """ + return self.request( + "OPTIONS", + url, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + follow_redirects=follow_redirects, + timeout=timeout, + extensions=extensions, + ) + + def head( + self, + url: URL | str, + *, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Send a `HEAD` request. + + **Parameters**: See `httpx.request`. + """ + return self.request( + "HEAD", + url, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + follow_redirects=follow_redirects, + timeout=timeout, + extensions=extensions, + ) + + def post( + self, + url: URL | str, + *, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Send a `POST` request. + + **Parameters**: See `httpx.request`. + """ + return self.request( + "POST", + url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + follow_redirects=follow_redirects, + timeout=timeout, + extensions=extensions, + ) + + def put( + self, + url: URL | str, + *, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Send a `PUT` request. + + **Parameters**: See `httpx.request`. 
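For the body-carrying client methods above, `files=` produces a multipart upload and can be combined with extra `data=` form fields. A hedged sketch; the field names and payload are illustrative:

```python
import httpx

with httpx.Client() as client:
    # Multipart upload: `files=` takes (filename, content, content-type) tuples.
    files = {"report": ("report.txt", b"hello world", "text/plain")}
    response = client.post(
        "https://httpbin.org/post",
        files=files,
        data={"kind": "demo"},   # extra form field sent alongside the file
    )
    print(response.json()["files"]["report"])   # 'hello world'
```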
+ """ + return self.request( + "PUT", + url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + follow_redirects=follow_redirects, + timeout=timeout, + extensions=extensions, + ) + + def patch( + self, + url: URL | str, + *, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Send a `PATCH` request. + + **Parameters**: See `httpx.request`. + """ + return self.request( + "PATCH", + url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + follow_redirects=follow_redirects, + timeout=timeout, + extensions=extensions, + ) + + def delete( + self, + url: URL | str, + *, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Send a `DELETE` request. + + **Parameters**: See `httpx.request`. + """ + return self.request( + "DELETE", + url, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + follow_redirects=follow_redirects, + timeout=timeout, + extensions=extensions, + ) + + def close(self) -> None: + """ + Close transport and proxies. + """ + if self._state != ClientState.CLOSED: + self._state = ClientState.CLOSED + + self._transport.close() + for transport in self._mounts.values(): + if transport is not None: + transport.close() + + def __enter__(self: T) -> T: + if self._state != ClientState.UNOPENED: + msg = { + ClientState.OPENED: "Cannot open a client instance more than once.", + ClientState.CLOSED: ( + "Cannot reopen a client instance, once it has been closed." + ), + }[self._state] + raise RuntimeError(msg) + + self._state = ClientState.OPENED + + self._transport.__enter__() + for transport in self._mounts.values(): + if transport is not None: + transport.__enter__() + return self + + def __exit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: TracebackType | None = None, + ) -> None: + self._state = ClientState.CLOSED + + self._transport.__exit__(exc_type, exc_value, traceback) + for transport in self._mounts.values(): + if transport is not None: + transport.__exit__(exc_type, exc_value, traceback) + + +class AsyncClient(BaseClient): + """ + An asynchronous HTTP client, with connection pooling, HTTP/2, redirects, + cookie persistence, etc. + + It can be shared between tasks. + + Usage: + + ```python + >>> async with httpx.AsyncClient() as client: + >>> response = await client.get('https://example.org') + ``` + + **Parameters:** + + * **auth** - *(optional)* An authentication class to use when sending + requests. + * **params** - *(optional)* Query parameters to include in request URLs, as + a string, dictionary, or sequence of two-tuples. 
+ * **headers** - *(optional)* Dictionary of HTTP headers to include when + sending requests. + * **cookies** - *(optional)* Dictionary of Cookie items to include when + sending requests. + * **verify** - *(optional)* Either `True` to use an SSL context with the + default CA bundle, `False` to disable verification, or an instance of + `ssl.SSLContext` to use a custom context. + * **http2** - *(optional)* A boolean indicating if HTTP/2 support should be + enabled. Defaults to `False`. + * **proxy** - *(optional)* A proxy URL where all the traffic should be routed. + * **timeout** - *(optional)* The timeout configuration to use when sending + requests. + * **limits** - *(optional)* The limits configuration to use. + * **max_redirects** - *(optional)* The maximum number of redirect responses + that should be followed. + * **base_url** - *(optional)* A URL to use as the base when building + request URLs. + * **transport** - *(optional)* A transport class to use for sending requests + over the network. + * **trust_env** - *(optional)* Enables or disables usage of environment + variables for configuration. + * **default_encoding** - *(optional)* The default encoding to use for decoding + response text, if no charset information is included in a response Content-Type + header. Set to a callable for automatic character set detection. Default: "utf-8". + """ + + def __init__( + self, + *, + auth: AuthTypes | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + verify: ssl.SSLContext | str | bool = True, + cert: CertTypes | None = None, + http1: bool = True, + http2: bool = False, + proxy: ProxyTypes | None = None, + mounts: None | (typing.Mapping[str, AsyncBaseTransport | None]) = None, + timeout: TimeoutTypes = DEFAULT_TIMEOUT_CONFIG, + follow_redirects: bool = False, + limits: Limits = DEFAULT_LIMITS, + max_redirects: int = DEFAULT_MAX_REDIRECTS, + event_hooks: None | (typing.Mapping[str, list[EventHook]]) = None, + base_url: URL | str = "", + transport: AsyncBaseTransport | None = None, + trust_env: bool = True, + default_encoding: str | typing.Callable[[bytes], str] = "utf-8", + ) -> None: + super().__init__( + auth=auth, + params=params, + headers=headers, + cookies=cookies, + timeout=timeout, + follow_redirects=follow_redirects, + max_redirects=max_redirects, + event_hooks=event_hooks, + base_url=base_url, + trust_env=trust_env, + default_encoding=default_encoding, + ) + + if http2: + try: + import h2 # noqa + except ImportError: # pragma: no cover + raise ImportError( + "Using http2=True, but the 'h2' package is not installed. " + "Make sure to install httpx using `pip install httpx[http2]`." 
+ ) from None + + allow_env_proxies = trust_env and transport is None + proxy_map = self._get_proxy_map(proxy, allow_env_proxies) + + self._transport = self._init_transport( + verify=verify, + cert=cert, + trust_env=trust_env, + http1=http1, + http2=http2, + limits=limits, + transport=transport, + ) + + self._mounts: dict[URLPattern, AsyncBaseTransport | None] = { + URLPattern(key): None + if proxy is None + else self._init_proxy_transport( + proxy, + verify=verify, + cert=cert, + trust_env=trust_env, + http1=http1, + http2=http2, + limits=limits, + ) + for key, proxy in proxy_map.items() + } + if mounts is not None: + self._mounts.update( + {URLPattern(key): transport for key, transport in mounts.items()} + ) + self._mounts = dict(sorted(self._mounts.items())) + + def _init_transport( + self, + verify: ssl.SSLContext | str | bool = True, + cert: CertTypes | None = None, + trust_env: bool = True, + http1: bool = True, + http2: bool = False, + limits: Limits = DEFAULT_LIMITS, + transport: AsyncBaseTransport | None = None, + ) -> AsyncBaseTransport: + if transport is not None: + return transport + + return AsyncHTTPTransport( + verify=verify, + cert=cert, + trust_env=trust_env, + http1=http1, + http2=http2, + limits=limits, + ) + + def _init_proxy_transport( + self, + proxy: Proxy, + verify: ssl.SSLContext | str | bool = True, + cert: CertTypes | None = None, + trust_env: bool = True, + http1: bool = True, + http2: bool = False, + limits: Limits = DEFAULT_LIMITS, + ) -> AsyncBaseTransport: + return AsyncHTTPTransport( + verify=verify, + cert=cert, + trust_env=trust_env, + http1=http1, + http2=http2, + limits=limits, + proxy=proxy, + ) + + def _transport_for_url(self, url: URL) -> AsyncBaseTransport: + """ + Returns the transport instance that should be used for a given URL. + This will either be the standard connection pool, or a proxy. + """ + for pattern, transport in self._mounts.items(): + if pattern.matches(url): + return self._transport if transport is None else transport + + return self._transport + + async def request( + self, + method: str, + url: URL | str, + *, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault | None = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Build and send a request. + + Equivalent to: + + ```python + request = client.build_request(...) + response = await client.send(request, ...) + ``` + + See `AsyncClient.build_request()`, `AsyncClient.send()` + and [Merging of configuration][0] for how the various parameters + are merged with client-level configuration. + + [0]: /advanced/clients/#merging-of-configuration + """ + + if cookies is not None: # pragma: no cover + message = ( + "Setting per-request cookies=<...> is being deprecated, because " + "the expected behaviour on cookie persistence is ambiguous. Set " + "cookies directly on the client instance instead." 
+ ) + warnings.warn(message, DeprecationWarning, stacklevel=2) + + request = self.build_request( + method=method, + url=url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + cookies=cookies, + timeout=timeout, + extensions=extensions, + ) + return await self.send(request, auth=auth, follow_redirects=follow_redirects) + + @asynccontextmanager + async def stream( + self, + method: str, + url: URL | str, + *, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault | None = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> typing.AsyncIterator[Response]: + """ + Alternative to `httpx.request()` that streams the response body + instead of loading it into memory at once. + + **Parameters**: See `httpx.request`. + + See also: [Streaming Responses][0] + + [0]: /quickstart#streaming-responses + """ + request = self.build_request( + method=method, + url=url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + cookies=cookies, + timeout=timeout, + extensions=extensions, + ) + response = await self.send( + request=request, + auth=auth, + follow_redirects=follow_redirects, + stream=True, + ) + try: + yield response + finally: + await response.aclose() + + async def send( + self, + request: Request, + *, + stream: bool = False, + auth: AuthTypes | UseClientDefault | None = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + ) -> Response: + """ + Send a request. + + The request is sent as-is, unmodified. + + Typically you'll want to build one with `AsyncClient.build_request()` + so that any client-level configuration is merged into the request, + but passing an explicit `httpx.Request()` is supported as well. 
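The async client mirrors the sync API, so the same send/stream split applies under `await`. A hedged sketch of streaming a body incrementally with `AsyncClient.stream()`; the URL is illustrative:

```python
import asyncio
import httpx

async def main() -> None:
    async with httpx.AsyncClient() as client:
        # stream() yields the response before the body is read;
        # aiter_bytes() then consumes it incrementally.
        async with client.stream("GET", "https://httpbin.org/bytes/1024") as response:
            received = 0
            async for chunk in response.aiter_bytes():
                received += len(chunk)
    print(received)   # 1024

asyncio.run(main())
```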
+ + See also: [Request instances][0] + + [0]: /advanced/clients/#request-instances + """ + if self._state == ClientState.CLOSED: + raise RuntimeError("Cannot send a request, as the client has been closed.") + + self._state = ClientState.OPENED + follow_redirects = ( + self.follow_redirects + if isinstance(follow_redirects, UseClientDefault) + else follow_redirects + ) + + self._set_timeout(request) + + auth = self._build_request_auth(request, auth) + + response = await self._send_handling_auth( + request, + auth=auth, + follow_redirects=follow_redirects, + history=[], + ) + try: + if not stream: + await response.aread() + + return response + + except BaseException as exc: + await response.aclose() + raise exc + + async def _send_handling_auth( + self, + request: Request, + auth: Auth, + follow_redirects: bool, + history: list[Response], + ) -> Response: + auth_flow = auth.async_auth_flow(request) + try: + request = await auth_flow.__anext__() + + while True: + response = await self._send_handling_redirects( + request, + follow_redirects=follow_redirects, + history=history, + ) + try: + try: + next_request = await auth_flow.asend(response) + except StopAsyncIteration: + return response + + response.history = list(history) + await response.aread() + request = next_request + history.append(response) + + except BaseException as exc: + await response.aclose() + raise exc + finally: + await auth_flow.aclose() + + async def _send_handling_redirects( + self, + request: Request, + follow_redirects: bool, + history: list[Response], + ) -> Response: + while True: + if len(history) > self.max_redirects: + raise TooManyRedirects( + "Exceeded maximum allowed redirects.", request=request + ) + + for hook in self._event_hooks["request"]: + await hook(request) + + response = await self._send_single_request(request) + try: + for hook in self._event_hooks["response"]: + await hook(response) + + response.history = list(history) + + if not response.has_redirect_location: + return response + + request = self._build_redirect_request(request, response) + history = history + [response] + + if follow_redirects: + await response.aread() + else: + response.next_request = request + return response + + except BaseException as exc: + await response.aclose() + raise exc + + async def _send_single_request(self, request: Request) -> Response: + """ + Sends a single request, without handling any redirections. + """ + transport = self._transport_for_url(request.url) + start = time.perf_counter() + + if not isinstance(request.stream, AsyncByteStream): + raise RuntimeError( + "Attempted to send an sync request with an AsyncClient instance." 
+ ) + + with request_context(request=request): + response = await transport.handle_async_request(request) + + assert isinstance(response.stream, AsyncByteStream) + response.request = request + response.stream = BoundAsyncStream( + response.stream, response=response, start=start + ) + self.cookies.extract_cookies(response) + response.default_encoding = self._default_encoding + + logger.info( + 'HTTP Request: %s %s "%s %d %s"', + request.method, + request.url, + response.http_version, + response.status_code, + response.reason_phrase, + ) + + return response + + async def get( + self, + url: URL | str, + *, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault | None = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Send a `GET` request. + + **Parameters**: See `httpx.request`. + """ + return await self.request( + "GET", + url, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + follow_redirects=follow_redirects, + timeout=timeout, + extensions=extensions, + ) + + async def options( + self, + url: URL | str, + *, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Send an `OPTIONS` request. + + **Parameters**: See `httpx.request`. + """ + return await self.request( + "OPTIONS", + url, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + follow_redirects=follow_redirects, + timeout=timeout, + extensions=extensions, + ) + + async def head( + self, + url: URL | str, + *, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Send a `HEAD` request. + + **Parameters**: See `httpx.request`. + """ + return await self.request( + "HEAD", + url, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + follow_redirects=follow_redirects, + timeout=timeout, + extensions=extensions, + ) + + async def post( + self, + url: URL | str, + *, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Send a `POST` request. + + **Parameters**: See `httpx.request`. 
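+
+        A minimal sketch, assuming an open `AsyncClient` instance named
+        `client` (the URL and payload below are placeholders):
+
+        ```
+        response = await client.post(
+            "https://www.example.com/upload",
+            json={"key": "value"},
+        )
+        ```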
+ """ + return await self.request( + "POST", + url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + follow_redirects=follow_redirects, + timeout=timeout, + extensions=extensions, + ) + + async def put( + self, + url: URL | str, + *, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Send a `PUT` request. + + **Parameters**: See `httpx.request`. + """ + return await self.request( + "PUT", + url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + follow_redirects=follow_redirects, + timeout=timeout, + extensions=extensions, + ) + + async def patch( + self, + url: URL | str, + *, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Send a `PATCH` request. + + **Parameters**: See `httpx.request`. + """ + return await self.request( + "PATCH", + url, + content=content, + data=data, + files=files, + json=json, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + follow_redirects=follow_redirects, + timeout=timeout, + extensions=extensions, + ) + + async def delete( + self, + url: URL | str, + *, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT, + follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT, + timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + extensions: RequestExtensions | None = None, + ) -> Response: + """ + Send a `DELETE` request. + + **Parameters**: See `httpx.request`. + """ + return await self.request( + "DELETE", + url, + params=params, + headers=headers, + cookies=cookies, + auth=auth, + follow_redirects=follow_redirects, + timeout=timeout, + extensions=extensions, + ) + + async def aclose(self) -> None: + """ + Close transport and proxies. + """ + if self._state != ClientState.CLOSED: + self._state = ClientState.CLOSED + + await self._transport.aclose() + for proxy in self._mounts.values(): + if proxy is not None: + await proxy.aclose() + + async def __aenter__(self: U) -> U: + if self._state != ClientState.UNOPENED: + msg = { + ClientState.OPENED: "Cannot open a client instance more than once.", + ClientState.CLOSED: ( + "Cannot reopen a client instance, once it has been closed." 
+ ), + }[self._state] + raise RuntimeError(msg) + + self._state = ClientState.OPENED + + await self._transport.__aenter__() + for proxy in self._mounts.values(): + if proxy is not None: + await proxy.__aenter__() + return self + + async def __aexit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: TracebackType | None = None, + ) -> None: + self._state = ClientState.CLOSED + + await self._transport.__aexit__(exc_type, exc_value, traceback) + for proxy in self._mounts.values(): + if proxy is not None: + await proxy.__aexit__(exc_type, exc_value, traceback) diff --git a/env/lib/python3.13/site-packages/httpx/_config.py b/env/lib/python3.13/site-packages/httpx/_config.py new file mode 100644 index 0000000000000000000000000000000000000000..467a6c90ae269babe3af7963d9d7c78b9f012268 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/_config.py @@ -0,0 +1,248 @@ +from __future__ import annotations + +import os +import typing + +from ._models import Headers +from ._types import CertTypes, HeaderTypes, TimeoutTypes +from ._urls import URL + +if typing.TYPE_CHECKING: + import ssl # pragma: no cover + +__all__ = ["Limits", "Proxy", "Timeout", "create_ssl_context"] + + +class UnsetType: + pass # pragma: no cover + + +UNSET = UnsetType() + + +def create_ssl_context( + verify: ssl.SSLContext | str | bool = True, + cert: CertTypes | None = None, + trust_env: bool = True, +) -> ssl.SSLContext: + import ssl + import warnings + + import certifi + + if verify is True: + if trust_env and os.environ.get("SSL_CERT_FILE"): # pragma: nocover + ctx = ssl.create_default_context(cafile=os.environ["SSL_CERT_FILE"]) + elif trust_env and os.environ.get("SSL_CERT_DIR"): # pragma: nocover + ctx = ssl.create_default_context(capath=os.environ["SSL_CERT_DIR"]) + else: + # Default case... + ctx = ssl.create_default_context(cafile=certifi.where()) + elif verify is False: + ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + elif isinstance(verify, str): # pragma: nocover + message = ( + "`verify=` is deprecated. " + "Use `verify=ssl.create_default_context(cafile=...)` " + "or `verify=ssl.create_default_context(capath=...)` instead." + ) + warnings.warn(message, DeprecationWarning) + if os.path.isdir(verify): + return ssl.create_default_context(capath=verify) + return ssl.create_default_context(cafile=verify) + else: + ctx = verify + + if cert: # pragma: nocover + message = ( + "`cert=...` is deprecated. Use `verify=` instead," + "with `.load_cert_chain()` to configure the certificate chain." + ) + warnings.warn(message, DeprecationWarning) + if isinstance(cert, str): + ctx.load_cert_chain(cert) + else: + ctx.load_cert_chain(*cert) + + return ctx + + +class Timeout: + """ + Timeout configuration. + + **Usage**: + + Timeout(None) # No timeouts. + Timeout(5.0) # 5s timeout on all operations. + Timeout(None, connect=5.0) # 5s timeout on connect, no other timeouts. + Timeout(5.0, connect=10.0) # 10s timeout on connect. 5s timeout elsewhere. + Timeout(5.0, pool=None) # No timeout on acquiring connection from pool. + # 5s timeout elsewhere. + """ + + def __init__( + self, + timeout: TimeoutTypes | UnsetType = UNSET, + *, + connect: None | float | UnsetType = UNSET, + read: None | float | UnsetType = UNSET, + write: None | float | UnsetType = UNSET, + pool: None | float | UnsetType = UNSET, + ) -> None: + if isinstance(timeout, Timeout): + # Passed as a single explicit Timeout. 
+ assert connect is UNSET + assert read is UNSET + assert write is UNSET + assert pool is UNSET + self.connect = timeout.connect # type: typing.Optional[float] + self.read = timeout.read # type: typing.Optional[float] + self.write = timeout.write # type: typing.Optional[float] + self.pool = timeout.pool # type: typing.Optional[float] + elif isinstance(timeout, tuple): + # Passed as a tuple. + self.connect = timeout[0] + self.read = timeout[1] + self.write = None if len(timeout) < 3 else timeout[2] + self.pool = None if len(timeout) < 4 else timeout[3] + elif not ( + isinstance(connect, UnsetType) + or isinstance(read, UnsetType) + or isinstance(write, UnsetType) + or isinstance(pool, UnsetType) + ): + self.connect = connect + self.read = read + self.write = write + self.pool = pool + else: + if isinstance(timeout, UnsetType): + raise ValueError( + "httpx.Timeout must either include a default, or set all " + "four parameters explicitly." + ) + self.connect = timeout if isinstance(connect, UnsetType) else connect + self.read = timeout if isinstance(read, UnsetType) else read + self.write = timeout if isinstance(write, UnsetType) else write + self.pool = timeout if isinstance(pool, UnsetType) else pool + + def as_dict(self) -> dict[str, float | None]: + return { + "connect": self.connect, + "read": self.read, + "write": self.write, + "pool": self.pool, + } + + def __eq__(self, other: typing.Any) -> bool: + return ( + isinstance(other, self.__class__) + and self.connect == other.connect + and self.read == other.read + and self.write == other.write + and self.pool == other.pool + ) + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + if len({self.connect, self.read, self.write, self.pool}) == 1: + return f"{class_name}(timeout={self.connect})" + return ( + f"{class_name}(connect={self.connect}, " + f"read={self.read}, write={self.write}, pool={self.pool})" + ) + + +class Limits: + """ + Configuration for limits to various client behaviors. + + **Parameters:** + + * **max_connections** - The maximum number of concurrent connections that may be + established. + * **max_keepalive_connections** - Allow the connection pool to maintain + keep-alive connections below this point. Should be less than or equal + to `max_connections`. + * **keepalive_expiry** - Time limit on idle keep-alive connections in seconds. 
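+
+    **Usage** (illustrative sketch only; the values shown are examples, and a
+    client accepting a `limits=...` argument is assumed):
+
+        limits = Limits(max_connections=100, max_keepalive_connections=20)
+        client = httpx.Client(limits=limits)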
+ """ + + def __init__( + self, + *, + max_connections: int | None = None, + max_keepalive_connections: int | None = None, + keepalive_expiry: float | None = 5.0, + ) -> None: + self.max_connections = max_connections + self.max_keepalive_connections = max_keepalive_connections + self.keepalive_expiry = keepalive_expiry + + def __eq__(self, other: typing.Any) -> bool: + return ( + isinstance(other, self.__class__) + and self.max_connections == other.max_connections + and self.max_keepalive_connections == other.max_keepalive_connections + and self.keepalive_expiry == other.keepalive_expiry + ) + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + return ( + f"{class_name}(max_connections={self.max_connections}, " + f"max_keepalive_connections={self.max_keepalive_connections}, " + f"keepalive_expiry={self.keepalive_expiry})" + ) + + +class Proxy: + def __init__( + self, + url: URL | str, + *, + ssl_context: ssl.SSLContext | None = None, + auth: tuple[str, str] | None = None, + headers: HeaderTypes | None = None, + ) -> None: + url = URL(url) + headers = Headers(headers) + + if url.scheme not in ("http", "https", "socks5", "socks5h"): + raise ValueError(f"Unknown scheme for proxy URL {url!r}") + + if url.username or url.password: + # Remove any auth credentials from the URL. + auth = (url.username, url.password) + url = url.copy_with(username=None, password=None) + + self.url = url + self.auth = auth + self.headers = headers + self.ssl_context = ssl_context + + @property + def raw_auth(self) -> tuple[bytes, bytes] | None: + # The proxy authentication as raw bytes. + return ( + None + if self.auth is None + else (self.auth[0].encode("utf-8"), self.auth[1].encode("utf-8")) + ) + + def __repr__(self) -> str: + # The authentication is represented with the password component masked. + auth = (self.auth[0], "********") if self.auth else None + + # Build a nice concise representation. 
+ url_str = f"{str(self.url)!r}" + auth_str = f", auth={auth!r}" if auth else "" + headers_str = f", headers={dict(self.headers)!r}" if self.headers else "" + return f"Proxy({url_str}{auth_str}{headers_str})" + + +DEFAULT_TIMEOUT_CONFIG = Timeout(timeout=5.0) +DEFAULT_LIMITS = Limits(max_connections=100, max_keepalive_connections=20) +DEFAULT_MAX_REDIRECTS = 20 diff --git a/env/lib/python3.13/site-packages/httpx/_content.py b/env/lib/python3.13/site-packages/httpx/_content.py new file mode 100644 index 0000000000000000000000000000000000000000..6f479a0885f723b7395843d41164a87041820776 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/_content.py @@ -0,0 +1,240 @@ +from __future__ import annotations + +import inspect +import warnings +from json import dumps as json_dumps +from typing import ( + Any, + AsyncIterable, + AsyncIterator, + Iterable, + Iterator, + Mapping, +) +from urllib.parse import urlencode + +from ._exceptions import StreamClosed, StreamConsumed +from ._multipart import MultipartStream +from ._types import ( + AsyncByteStream, + RequestContent, + RequestData, + RequestFiles, + ResponseContent, + SyncByteStream, +) +from ._utils import peek_filelike_length, primitive_value_to_str + +__all__ = ["ByteStream"] + + +class ByteStream(AsyncByteStream, SyncByteStream): + def __init__(self, stream: bytes) -> None: + self._stream = stream + + def __iter__(self) -> Iterator[bytes]: + yield self._stream + + async def __aiter__(self) -> AsyncIterator[bytes]: + yield self._stream + + +class IteratorByteStream(SyncByteStream): + CHUNK_SIZE = 65_536 + + def __init__(self, stream: Iterable[bytes]) -> None: + self._stream = stream + self._is_stream_consumed = False + self._is_generator = inspect.isgenerator(stream) + + def __iter__(self) -> Iterator[bytes]: + if self._is_stream_consumed and self._is_generator: + raise StreamConsumed() + + self._is_stream_consumed = True + if hasattr(self._stream, "read"): + # File-like interfaces should use 'read' directly. + chunk = self._stream.read(self.CHUNK_SIZE) + while chunk: + yield chunk + chunk = self._stream.read(self.CHUNK_SIZE) + else: + # Otherwise iterate. + for part in self._stream: + yield part + + +class AsyncIteratorByteStream(AsyncByteStream): + CHUNK_SIZE = 65_536 + + def __init__(self, stream: AsyncIterable[bytes]) -> None: + self._stream = stream + self._is_stream_consumed = False + self._is_generator = inspect.isasyncgen(stream) + + async def __aiter__(self) -> AsyncIterator[bytes]: + if self._is_stream_consumed and self._is_generator: + raise StreamConsumed() + + self._is_stream_consumed = True + if hasattr(self._stream, "aread"): + # File-like interfaces should use 'aread' directly. + chunk = await self._stream.aread(self.CHUNK_SIZE) + while chunk: + yield chunk + chunk = await self._stream.aread(self.CHUNK_SIZE) + else: + # Otherwise iterate. + async for part in self._stream: + yield part + + +class UnattachedStream(AsyncByteStream, SyncByteStream): + """ + If a request or response is serialized using pickle, then it is no longer + attached to a stream for I/O purposes. Any stream operations should result + in `httpx.StreamClosed`. 
+ """ + + def __iter__(self) -> Iterator[bytes]: + raise StreamClosed() + + async def __aiter__(self) -> AsyncIterator[bytes]: + raise StreamClosed() + yield b"" # pragma: no cover + + +def encode_content( + content: str | bytes | Iterable[bytes] | AsyncIterable[bytes], +) -> tuple[dict[str, str], SyncByteStream | AsyncByteStream]: + if isinstance(content, (bytes, str)): + body = content.encode("utf-8") if isinstance(content, str) else content + content_length = len(body) + headers = {"Content-Length": str(content_length)} if body else {} + return headers, ByteStream(body) + + elif isinstance(content, Iterable) and not isinstance(content, dict): + # `not isinstance(content, dict)` is a bit oddly specific, but it + # catches a case that's easy for users to make in error, and would + # otherwise pass through here, like any other bytes-iterable, + # because `dict` happens to be iterable. See issue #2491. + content_length_or_none = peek_filelike_length(content) + + if content_length_or_none is None: + headers = {"Transfer-Encoding": "chunked"} + else: + headers = {"Content-Length": str(content_length_or_none)} + return headers, IteratorByteStream(content) # type: ignore + + elif isinstance(content, AsyncIterable): + headers = {"Transfer-Encoding": "chunked"} + return headers, AsyncIteratorByteStream(content) + + raise TypeError(f"Unexpected type for 'content', {type(content)!r}") + + +def encode_urlencoded_data( + data: RequestData, +) -> tuple[dict[str, str], ByteStream]: + plain_data = [] + for key, value in data.items(): + if isinstance(value, (list, tuple)): + plain_data.extend([(key, primitive_value_to_str(item)) for item in value]) + else: + plain_data.append((key, primitive_value_to_str(value))) + body = urlencode(plain_data, doseq=True).encode("utf-8") + content_length = str(len(body)) + content_type = "application/x-www-form-urlencoded" + headers = {"Content-Length": content_length, "Content-Type": content_type} + return headers, ByteStream(body) + + +def encode_multipart_data( + data: RequestData, files: RequestFiles, boundary: bytes | None +) -> tuple[dict[str, str], MultipartStream]: + multipart = MultipartStream(data=data, files=files, boundary=boundary) + headers = multipart.get_headers() + return headers, multipart + + +def encode_text(text: str) -> tuple[dict[str, str], ByteStream]: + body = text.encode("utf-8") + content_length = str(len(body)) + content_type = "text/plain; charset=utf-8" + headers = {"Content-Length": content_length, "Content-Type": content_type} + return headers, ByteStream(body) + + +def encode_html(html: str) -> tuple[dict[str, str], ByteStream]: + body = html.encode("utf-8") + content_length = str(len(body)) + content_type = "text/html; charset=utf-8" + headers = {"Content-Length": content_length, "Content-Type": content_type} + return headers, ByteStream(body) + + +def encode_json(json: Any) -> tuple[dict[str, str], ByteStream]: + body = json_dumps( + json, ensure_ascii=False, separators=(",", ":"), allow_nan=False + ).encode("utf-8") + content_length = str(len(body)) + content_type = "application/json" + headers = {"Content-Length": content_length, "Content-Type": content_type} + return headers, ByteStream(body) + + +def encode_request( + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: Any | None = None, + boundary: bytes | None = None, +) -> tuple[dict[str, str], SyncByteStream | AsyncByteStream]: + """ + Handles encoding the given `content`, `data`, `files`, and `json`, + returning 
a two-tuple of (, ). + """ + if data is not None and not isinstance(data, Mapping): + # We prefer to separate `content=` + # for raw request content, and `data=
` for url encoded or + # multipart form content. + # + # However for compat with requests, we *do* still support + # `data=` usages. We deal with that case here, treating it + # as if `content=<...>` had been supplied instead. + message = "Use 'content=<...>' to upload raw bytes/text content." + warnings.warn(message, DeprecationWarning, stacklevel=2) + return encode_content(data) + + if content is not None: + return encode_content(content) + elif files: + return encode_multipart_data(data or {}, files, boundary) + elif data: + return encode_urlencoded_data(data) + elif json is not None: + return encode_json(json) + + return {}, ByteStream(b"") + + +def encode_response( + content: ResponseContent | None = None, + text: str | None = None, + html: str | None = None, + json: Any | None = None, +) -> tuple[dict[str, str], SyncByteStream | AsyncByteStream]: + """ + Handles encoding the given `content`, returning a two-tuple of + (, ). + """ + if content is not None: + return encode_content(content) + elif text is not None: + return encode_text(text) + elif html is not None: + return encode_html(html) + elif json is not None: + return encode_json(json) + + return {}, ByteStream(b"") diff --git a/env/lib/python3.13/site-packages/httpx/_decoders.py b/env/lib/python3.13/site-packages/httpx/_decoders.py new file mode 100644 index 0000000000000000000000000000000000000000..899dfada878e1181fca6d3c75a79526a076abb9e --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/_decoders.py @@ -0,0 +1,393 @@ +""" +Handlers for Content-Encoding. + +See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding +""" + +from __future__ import annotations + +import codecs +import io +import typing +import zlib + +from ._exceptions import DecodingError + +# Brotli support is optional +try: + # The C bindings in `brotli` are recommended for CPython. + import brotli +except ImportError: # pragma: no cover + try: + # The CFFI bindings in `brotlicffi` are recommended for PyPy + # and other environments. + import brotlicffi as brotli + except ImportError: + brotli = None + + +# Zstandard support is optional +try: + import zstandard +except ImportError: # pragma: no cover + zstandard = None # type: ignore + + +class ContentDecoder: + def decode(self, data: bytes) -> bytes: + raise NotImplementedError() # pragma: no cover + + def flush(self) -> bytes: + raise NotImplementedError() # pragma: no cover + + +class IdentityDecoder(ContentDecoder): + """ + Handle unencoded data. + """ + + def decode(self, data: bytes) -> bytes: + return data + + def flush(self) -> bytes: + return b"" + + +class DeflateDecoder(ContentDecoder): + """ + Handle 'deflate' decoding. + + See: https://stackoverflow.com/questions/1838699 + """ + + def __init__(self) -> None: + self.first_attempt = True + self.decompressor = zlib.decompressobj() + + def decode(self, data: bytes) -> bytes: + was_first_attempt = self.first_attempt + self.first_attempt = False + try: + return self.decompressor.decompress(data) + except zlib.error as exc: + if was_first_attempt: + self.decompressor = zlib.decompressobj(-zlib.MAX_WBITS) + return self.decode(data) + raise DecodingError(str(exc)) from exc + + def flush(self) -> bytes: + try: + return self.decompressor.flush() + except zlib.error as exc: # pragma: no cover + raise DecodingError(str(exc)) from exc + + +class GZipDecoder(ContentDecoder): + """ + Handle 'gzip' decoding. 
+ + See: https://stackoverflow.com/questions/1838699 + """ + + def __init__(self) -> None: + self.decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16) + + def decode(self, data: bytes) -> bytes: + try: + return self.decompressor.decompress(data) + except zlib.error as exc: + raise DecodingError(str(exc)) from exc + + def flush(self) -> bytes: + try: + return self.decompressor.flush() + except zlib.error as exc: # pragma: no cover + raise DecodingError(str(exc)) from exc + + +class BrotliDecoder(ContentDecoder): + """ + Handle 'brotli' decoding. + + Requires `pip install brotlipy`. See: https://brotlipy.readthedocs.io/ + or `pip install brotli`. See https://github.com/google/brotli + Supports both 'brotlipy' and 'Brotli' packages since they share an import + name. The top branches are for 'brotlipy' and bottom branches for 'Brotli' + """ + + def __init__(self) -> None: + if brotli is None: # pragma: no cover + raise ImportError( + "Using 'BrotliDecoder', but neither of the 'brotlicffi' or 'brotli' " + "packages have been installed. " + "Make sure to install httpx using `pip install httpx[brotli]`." + ) from None + + self.decompressor = brotli.Decompressor() + self.seen_data = False + self._decompress: typing.Callable[[bytes], bytes] + if hasattr(self.decompressor, "decompress"): + # The 'brotlicffi' package. + self._decompress = self.decompressor.decompress # pragma: no cover + else: + # The 'brotli' package. + self._decompress = self.decompressor.process # pragma: no cover + + def decode(self, data: bytes) -> bytes: + if not data: + return b"" + self.seen_data = True + try: + return self._decompress(data) + except brotli.error as exc: + raise DecodingError(str(exc)) from exc + + def flush(self) -> bytes: + if not self.seen_data: + return b"" + try: + if hasattr(self.decompressor, "finish"): + # Only available in the 'brotlicffi' package. + + # As the decompressor decompresses eagerly, this + # will never actually emit any data. However, it will potentially throw + # errors if a truncated or damaged data stream has been used. + self.decompressor.finish() # pragma: no cover + return b"" + except brotli.error as exc: # pragma: no cover + raise DecodingError(str(exc)) from exc + + +class ZStandardDecoder(ContentDecoder): + """ + Handle 'zstd' RFC 8878 decoding. + + Requires `pip install zstandard`. + Can be installed as a dependency of httpx using `pip install httpx[zstd]`. + """ + + # inspired by the ZstdDecoder implementation in urllib3 + def __init__(self) -> None: + if zstandard is None: # pragma: no cover + raise ImportError( + "Using 'ZStandardDecoder', ..." + "Make sure to install httpx using `pip install httpx[zstd]`." 
+ ) from None + + self.decompressor = zstandard.ZstdDecompressor().decompressobj() + self.seen_data = False + + def decode(self, data: bytes) -> bytes: + assert zstandard is not None + self.seen_data = True + output = io.BytesIO() + try: + output.write(self.decompressor.decompress(data)) + while self.decompressor.eof and self.decompressor.unused_data: + unused_data = self.decompressor.unused_data + self.decompressor = zstandard.ZstdDecompressor().decompressobj() + output.write(self.decompressor.decompress(unused_data)) + except zstandard.ZstdError as exc: + raise DecodingError(str(exc)) from exc + return output.getvalue() + + def flush(self) -> bytes: + if not self.seen_data: + return b"" + ret = self.decompressor.flush() # note: this is a no-op + if not self.decompressor.eof: + raise DecodingError("Zstandard data is incomplete") # pragma: no cover + return bytes(ret) + + +class MultiDecoder(ContentDecoder): + """ + Handle the case where multiple encodings have been applied. + """ + + def __init__(self, children: typing.Sequence[ContentDecoder]) -> None: + """ + 'children' should be a sequence of decoders in the order in which + each was applied. + """ + # Note that we reverse the order for decoding. + self.children = list(reversed(children)) + + def decode(self, data: bytes) -> bytes: + for child in self.children: + data = child.decode(data) + return data + + def flush(self) -> bytes: + data = b"" + for child in self.children: + data = child.decode(data) + child.flush() + return data + + +class ByteChunker: + """ + Handles returning byte content in fixed-size chunks. + """ + + def __init__(self, chunk_size: int | None = None) -> None: + self._buffer = io.BytesIO() + self._chunk_size = chunk_size + + def decode(self, content: bytes) -> list[bytes]: + if self._chunk_size is None: + return [content] if content else [] + + self._buffer.write(content) + if self._buffer.tell() >= self._chunk_size: + value = self._buffer.getvalue() + chunks = [ + value[i : i + self._chunk_size] + for i in range(0, len(value), self._chunk_size) + ] + if len(chunks[-1]) == self._chunk_size: + self._buffer.seek(0) + self._buffer.truncate() + return chunks + else: + self._buffer.seek(0) + self._buffer.write(chunks[-1]) + self._buffer.truncate() + return chunks[:-1] + else: + return [] + + def flush(self) -> list[bytes]: + value = self._buffer.getvalue() + self._buffer.seek(0) + self._buffer.truncate() + return [value] if value else [] + + +class TextChunker: + """ + Handles returning text content in fixed-size chunks. 
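+
+    An illustrative sketch of the chunking behaviour (hypothetical values):
+
+        chunker = TextChunker(chunk_size=5)
+        chunker.decode("hello world")  # -> ["hello", " worl"]; "d" stays buffered
+        chunker.flush()                # -> ["d"]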
+ """ + + def __init__(self, chunk_size: int | None = None) -> None: + self._buffer = io.StringIO() + self._chunk_size = chunk_size + + def decode(self, content: str) -> list[str]: + if self._chunk_size is None: + return [content] if content else [] + + self._buffer.write(content) + if self._buffer.tell() >= self._chunk_size: + value = self._buffer.getvalue() + chunks = [ + value[i : i + self._chunk_size] + for i in range(0, len(value), self._chunk_size) + ] + if len(chunks[-1]) == self._chunk_size: + self._buffer.seek(0) + self._buffer.truncate() + return chunks + else: + self._buffer.seek(0) + self._buffer.write(chunks[-1]) + self._buffer.truncate() + return chunks[:-1] + else: + return [] + + def flush(self) -> list[str]: + value = self._buffer.getvalue() + self._buffer.seek(0) + self._buffer.truncate() + return [value] if value else [] + + +class TextDecoder: + """ + Handles incrementally decoding bytes into text + """ + + def __init__(self, encoding: str = "utf-8") -> None: + self.decoder = codecs.getincrementaldecoder(encoding)(errors="replace") + + def decode(self, data: bytes) -> str: + return self.decoder.decode(data) + + def flush(self) -> str: + return self.decoder.decode(b"", True) + + +class LineDecoder: + """ + Handles incrementally reading lines from text. + + Has the same behaviour as the stdllib splitlines, + but handling the input iteratively. + """ + + def __init__(self) -> None: + self.buffer: list[str] = [] + self.trailing_cr: bool = False + + def decode(self, text: str) -> list[str]: + # See https://docs.python.org/3/library/stdtypes.html#str.splitlines + NEWLINE_CHARS = "\n\r\x0b\x0c\x1c\x1d\x1e\x85\u2028\u2029" + + # We always push a trailing `\r` into the next decode iteration. + if self.trailing_cr: + text = "\r" + text + self.trailing_cr = False + if text.endswith("\r"): + self.trailing_cr = True + text = text[:-1] + + if not text: + # NOTE: the edge case input of empty text doesn't occur in practice, + # because other httpx internals filter out this value + return [] # pragma: no cover + + trailing_newline = text[-1] in NEWLINE_CHARS + lines = text.splitlines() + + if len(lines) == 1 and not trailing_newline: + # No new lines, buffer the input and continue. + self.buffer.append(lines[0]) + return [] + + if self.buffer: + # Include any existing buffer in the first portion of the + # splitlines result. + lines = ["".join(self.buffer) + lines[0]] + lines[1:] + self.buffer = [] + + if not trailing_newline: + # If the last segment of splitlines is not newline terminated, + # then drop it from our output and start a new buffer. 
+ self.buffer = [lines.pop()] + + return lines + + def flush(self) -> list[str]: + if not self.buffer and not self.trailing_cr: + return [] + + lines = ["".join(self.buffer)] + self.buffer = [] + self.trailing_cr = False + return lines + + +SUPPORTED_DECODERS = { + "identity": IdentityDecoder, + "gzip": GZipDecoder, + "deflate": DeflateDecoder, + "br": BrotliDecoder, + "zstd": ZStandardDecoder, +} + + +if brotli is None: + SUPPORTED_DECODERS.pop("br") # pragma: no cover +if zstandard is None: + SUPPORTED_DECODERS.pop("zstd") # pragma: no cover diff --git a/env/lib/python3.13/site-packages/httpx/_exceptions.py b/env/lib/python3.13/site-packages/httpx/_exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..77f45a6d3986d15626fc8a5fd459d6a3e0fbe466 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/_exceptions.py @@ -0,0 +1,379 @@ +""" +Our exception hierarchy: + +* HTTPError + x RequestError + + TransportError + - TimeoutException + · ConnectTimeout + · ReadTimeout + · WriteTimeout + · PoolTimeout + - NetworkError + · ConnectError + · ReadError + · WriteError + · CloseError + - ProtocolError + · LocalProtocolError + · RemoteProtocolError + - ProxyError + - UnsupportedProtocol + + DecodingError + + TooManyRedirects + x HTTPStatusError +* InvalidURL +* CookieConflict +* StreamError + x StreamConsumed + x StreamClosed + x ResponseNotRead + x RequestNotRead +""" + +from __future__ import annotations + +import contextlib +import typing + +if typing.TYPE_CHECKING: + from ._models import Request, Response # pragma: no cover + +__all__ = [ + "CloseError", + "ConnectError", + "ConnectTimeout", + "CookieConflict", + "DecodingError", + "HTTPError", + "HTTPStatusError", + "InvalidURL", + "LocalProtocolError", + "NetworkError", + "PoolTimeout", + "ProtocolError", + "ProxyError", + "ReadError", + "ReadTimeout", + "RemoteProtocolError", + "RequestError", + "RequestNotRead", + "ResponseNotRead", + "StreamClosed", + "StreamConsumed", + "StreamError", + "TimeoutException", + "TooManyRedirects", + "TransportError", + "UnsupportedProtocol", + "WriteError", + "WriteTimeout", +] + + +class HTTPError(Exception): + """ + Base class for `RequestError` and `HTTPStatusError`. + + Useful for `try...except` blocks when issuing a request, + and then calling `.raise_for_status()`. + + For example: + + ``` + try: + response = httpx.get("https://www.example.com") + response.raise_for_status() + except httpx.HTTPError as exc: + print(f"HTTP Exception for {exc.request.url} - {exc}") + ``` + """ + + def __init__(self, message: str) -> None: + super().__init__(message) + self._request: Request | None = None + + @property + def request(self) -> Request: + if self._request is None: + raise RuntimeError("The .request property has not been set.") + return self._request + + @request.setter + def request(self, request: Request) -> None: + self._request = request + + +class RequestError(HTTPError): + """ + Base class for all exceptions that may occur when issuing a `.request()`. + """ + + def __init__(self, message: str, *, request: Request | None = None) -> None: + super().__init__(message) + # At the point an exception is raised we won't typically have a request + # instance to associate it with. + # + # The 'request_context' context manager is used within the Client and + # Response methods in order to ensure that any raised exceptions + # have a `.request` property set on them. 
+ self._request = request + + +class TransportError(RequestError): + """ + Base class for all exceptions that occur at the level of the Transport API. + """ + + +# Timeout exceptions... + + +class TimeoutException(TransportError): + """ + The base class for timeout errors. + + An operation has timed out. + """ + + +class ConnectTimeout(TimeoutException): + """ + Timed out while connecting to the host. + """ + + +class ReadTimeout(TimeoutException): + """ + Timed out while receiving data from the host. + """ + + +class WriteTimeout(TimeoutException): + """ + Timed out while sending data to the host. + """ + + +class PoolTimeout(TimeoutException): + """ + Timed out waiting to acquire a connection from the pool. + """ + + +# Core networking exceptions... + + +class NetworkError(TransportError): + """ + The base class for network-related errors. + + An error occurred while interacting with the network. + """ + + +class ReadError(NetworkError): + """ + Failed to receive data from the network. + """ + + +class WriteError(NetworkError): + """ + Failed to send data through the network. + """ + + +class ConnectError(NetworkError): + """ + Failed to establish a connection. + """ + + +class CloseError(NetworkError): + """ + Failed to close a connection. + """ + + +# Other transport exceptions... + + +class ProxyError(TransportError): + """ + An error occurred while establishing a proxy connection. + """ + + +class UnsupportedProtocol(TransportError): + """ + Attempted to make a request to an unsupported protocol. + + For example issuing a request to `ftp://www.example.com`. + """ + + +class ProtocolError(TransportError): + """ + The protocol was violated. + """ + + +class LocalProtocolError(ProtocolError): + """ + A protocol was violated by the client. + + For example if the user instantiated a `Request` instance explicitly, + failed to include the mandatory `Host:` header, and then issued it directly + using `client.send()`. + """ + + +class RemoteProtocolError(ProtocolError): + """ + The protocol was violated by the server. + + For example, returning malformed HTTP. + """ + + +# Other request exceptions... + + +class DecodingError(RequestError): + """ + Decoding of the response failed, due to a malformed encoding. + """ + + +class TooManyRedirects(RequestError): + """ + Too many redirects. + """ + + +# Client errors + + +class HTTPStatusError(HTTPError): + """ + The response had an error HTTP status of 4xx or 5xx. + + May be raised when calling `response.raise_for_status()` + """ + + def __init__(self, message: str, *, request: Request, response: Response) -> None: + super().__init__(message) + self.request = request + self.response = response + + +class InvalidURL(Exception): + """ + URL is improperly formed or cannot be parsed. + """ + + def __init__(self, message: str) -> None: + super().__init__(message) + + +class CookieConflict(Exception): + """ + Attempted to lookup a cookie by name, but multiple cookies existed. + + Can occur when calling `response.cookies.get(...)`. + """ + + def __init__(self, message: str) -> None: + super().__init__(message) + + +# Stream exceptions... + +# These may occur as the result of a programming error, by accessing +# the request/response stream in an invalid manner. + + +class StreamError(RuntimeError): + """ + The base class for stream exceptions. + + The developer made an error in accessing the request stream in + an invalid way. 
+ """ + + def __init__(self, message: str) -> None: + super().__init__(message) + + +class StreamConsumed(StreamError): + """ + Attempted to read or stream content, but the content has already + been streamed. + """ + + def __init__(self) -> None: + message = ( + "Attempted to read or stream some content, but the content has " + "already been streamed. For requests, this could be due to passing " + "a generator as request content, and then receiving a redirect " + "response or a secondary request as part of an authentication flow." + "For responses, this could be due to attempting to stream the response " + "content more than once." + ) + super().__init__(message) + + +class StreamClosed(StreamError): + """ + Attempted to read or stream response content, but the request has been + closed. + """ + + def __init__(self) -> None: + message = ( + "Attempted to read or stream content, but the stream has " "been closed." + ) + super().__init__(message) + + +class ResponseNotRead(StreamError): + """ + Attempted to access streaming response content, without having called `read()`. + """ + + def __init__(self) -> None: + message = ( + "Attempted to access streaming response content," + " without having called `read()`." + ) + super().__init__(message) + + +class RequestNotRead(StreamError): + """ + Attempted to access streaming request content, without having called `read()`. + """ + + def __init__(self) -> None: + message = ( + "Attempted to access streaming request content," + " without having called `read()`." + ) + super().__init__(message) + + +@contextlib.contextmanager +def request_context( + request: Request | None = None, +) -> typing.Iterator[None]: + """ + A context manager that can be used to attach the given request context + to any `RequestError` exceptions that are raised within the block. 
+ """ + try: + yield + except RequestError as exc: + if request is not None: + exc.request = request + raise exc diff --git a/env/lib/python3.13/site-packages/httpx/_main.py b/env/lib/python3.13/site-packages/httpx/_main.py new file mode 100644 index 0000000000000000000000000000000000000000..cffa4bb7db0f930f4db56653a061c4d7400ba4e6 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/_main.py @@ -0,0 +1,506 @@ +from __future__ import annotations + +import functools +import json +import sys +import typing + +import click +import pygments.lexers +import pygments.util +import rich.console +import rich.markup +import rich.progress +import rich.syntax +import rich.table + +from ._client import Client +from ._exceptions import RequestError +from ._models import Response +from ._status_codes import codes + +if typing.TYPE_CHECKING: + import httpcore # pragma: no cover + + +def print_help() -> None: + console = rich.console.Console() + + console.print("[bold]HTTPX :butterfly:", justify="center") + console.print() + console.print("A next generation HTTP client.", justify="center") + console.print() + console.print( + "Usage: [bold]httpx[/bold] [cyan] [OPTIONS][/cyan] ", justify="left" + ) + console.print() + + table = rich.table.Table.grid(padding=1, pad_edge=True) + table.add_column("Parameter", no_wrap=True, justify="left", style="bold") + table.add_column("Description") + table.add_row( + "-m, --method [cyan]METHOD", + "Request method, such as GET, POST, PUT, PATCH, DELETE, OPTIONS, HEAD.\n" + "[Default: GET, or POST if a request body is included]", + ) + table.add_row( + "-p, --params [cyan] ...", + "Query parameters to include in the request URL.", + ) + table.add_row( + "-c, --content [cyan]TEXT", "Byte content to include in the request body." + ) + table.add_row( + "-d, --data [cyan] ...", "Form data to include in the request body." + ) + table.add_row( + "-f, --files [cyan] ...", + "Form files to include in the request body.", + ) + table.add_row("-j, --json [cyan]TEXT", "JSON data to include in the request body.") + table.add_row( + "-h, --headers [cyan] ...", + "Include additional HTTP headers in the request.", + ) + table.add_row( + "--cookies [cyan] ...", "Cookies to include in the request." + ) + table.add_row( + "--auth [cyan]", + "Username and password to include in the request. Specify '-' for the password" + " to use a password prompt. Note that using --verbose/-v will expose" + " the Authorization header, including the password encoding" + " in a trivially reversible format.", + ) + + table.add_row( + "--proxy [cyan]URL", + "Send the request via a proxy. Should be the URL giving the proxy address.", + ) + + table.add_row( + "--timeout [cyan]FLOAT", + "Timeout value to use for network operations, such as establishing the" + " connection, reading some data, etc... [Default: 5.0]", + ) + + table.add_row("--follow-redirects", "Automatically follow redirects.") + table.add_row("--no-verify", "Disable SSL verification.") + table.add_row( + "--http2", "Send the request using HTTP/2, if the remote server supports it." + ) + + table.add_row( + "--download [cyan]FILE", + "Save the response content as a file, rather than displaying it.", + ) + + table.add_row("-v, --verbose", "Verbose output. 
Show request as well as response.") + table.add_row("--help", "Show this message and exit.") + console.print(table) + + +def get_lexer_for_response(response: Response) -> str: + content_type = response.headers.get("Content-Type") + if content_type is not None: + mime_type, _, _ = content_type.partition(";") + try: + return typing.cast( + str, pygments.lexers.get_lexer_for_mimetype(mime_type.strip()).name + ) + except pygments.util.ClassNotFound: # pragma: no cover + pass + return "" # pragma: no cover + + +def format_request_headers(request: httpcore.Request, http2: bool = False) -> str: + version = "HTTP/2" if http2 else "HTTP/1.1" + headers = [ + (name.lower() if http2 else name, value) for name, value in request.headers + ] + method = request.method.decode("ascii") + target = request.url.target.decode("ascii") + lines = [f"{method} {target} {version}"] + [ + f"{name.decode('ascii')}: {value.decode('ascii')}" for name, value in headers + ] + return "\n".join(lines) + + +def format_response_headers( + http_version: bytes, + status: int, + reason_phrase: bytes | None, + headers: list[tuple[bytes, bytes]], +) -> str: + version = http_version.decode("ascii") + reason = ( + codes.get_reason_phrase(status) + if reason_phrase is None + else reason_phrase.decode("ascii") + ) + lines = [f"{version} {status} {reason}"] + [ + f"{name.decode('ascii')}: {value.decode('ascii')}" for name, value in headers + ] + return "\n".join(lines) + + +def print_request_headers(request: httpcore.Request, http2: bool = False) -> None: + console = rich.console.Console() + http_text = format_request_headers(request, http2=http2) + syntax = rich.syntax.Syntax(http_text, "http", theme="ansi_dark", word_wrap=True) + console.print(syntax) + syntax = rich.syntax.Syntax("", "http", theme="ansi_dark", word_wrap=True) + console.print(syntax) + + +def print_response_headers( + http_version: bytes, + status: int, + reason_phrase: bytes | None, + headers: list[tuple[bytes, bytes]], +) -> None: + console = rich.console.Console() + http_text = format_response_headers(http_version, status, reason_phrase, headers) + syntax = rich.syntax.Syntax(http_text, "http", theme="ansi_dark", word_wrap=True) + console.print(syntax) + syntax = rich.syntax.Syntax("", "http", theme="ansi_dark", word_wrap=True) + console.print(syntax) + + +def print_response(response: Response) -> None: + console = rich.console.Console() + lexer_name = get_lexer_for_response(response) + if lexer_name: + if lexer_name.lower() == "json": + try: + data = response.json() + text = json.dumps(data, indent=4) + except ValueError: # pragma: no cover + text = response.text + else: + text = response.text + + syntax = rich.syntax.Syntax(text, lexer_name, theme="ansi_dark", word_wrap=True) + console.print(syntax) + else: + console.print(f"<{len(response.content)} bytes of binary data>") + + +_PCTRTT = typing.Tuple[typing.Tuple[str, str], ...] +_PCTRTTT = typing.Tuple[_PCTRTT, ...] 
+_PeerCertRetDictType = typing.Dict[str, typing.Union[str, _PCTRTTT, _PCTRTT]] + + +def format_certificate(cert: _PeerCertRetDictType) -> str: # pragma: no cover + lines = [] + for key, value in cert.items(): + if isinstance(value, (list, tuple)): + lines.append(f"* {key}:") + for item in value: + if key in ("subject", "issuer"): + for sub_item in item: + lines.append(f"* {sub_item[0]}: {sub_item[1]!r}") + elif isinstance(item, tuple) and len(item) == 2: + lines.append(f"* {item[0]}: {item[1]!r}") + else: + lines.append(f"* {item!r}") + else: + lines.append(f"* {key}: {value!r}") + return "\n".join(lines) + + +def trace( + name: str, info: typing.Mapping[str, typing.Any], verbose: bool = False +) -> None: + console = rich.console.Console() + if name == "connection.connect_tcp.started" and verbose: + host = info["host"] + console.print(f"* Connecting to {host!r}") + elif name == "connection.connect_tcp.complete" and verbose: + stream = info["return_value"] + server_addr = stream.get_extra_info("server_addr") + console.print(f"* Connected to {server_addr[0]!r} on port {server_addr[1]}") + elif name == "connection.start_tls.complete" and verbose: # pragma: no cover + stream = info["return_value"] + ssl_object = stream.get_extra_info("ssl_object") + version = ssl_object.version() + cipher = ssl_object.cipher() + server_cert = ssl_object.getpeercert() + alpn = ssl_object.selected_alpn_protocol() + console.print(f"* SSL established using {version!r} / {cipher[0]!r}") + console.print(f"* Selected ALPN protocol: {alpn!r}") + if server_cert: + console.print("* Server certificate:") + console.print(format_certificate(server_cert)) + elif name == "http11.send_request_headers.started" and verbose: + request = info["request"] + print_request_headers(request, http2=False) + elif name == "http2.send_request_headers.started" and verbose: # pragma: no cover + request = info["request"] + print_request_headers(request, http2=True) + elif name == "http11.receive_response_headers.complete": + http_version, status, reason_phrase, headers = info["return_value"] + print_response_headers(http_version, status, reason_phrase, headers) + elif name == "http2.receive_response_headers.complete": # pragma: no cover + status, headers = info["return_value"] + http_version = b"HTTP/2" + reason_phrase = None + print_response_headers(http_version, status, reason_phrase, headers) + + +def download_response(response: Response, download: typing.BinaryIO) -> None: + console = rich.console.Console() + console.print() + content_length = response.headers.get("Content-Length") + with rich.progress.Progress( + "[progress.description]{task.description}", + "[progress.percentage]{task.percentage:>3.0f}%", + rich.progress.BarColumn(bar_width=None), + rich.progress.DownloadColumn(), + rich.progress.TransferSpeedColumn(), + ) as progress: + description = f"Downloading [bold]{rich.markup.escape(download.name)}" + download_task = progress.add_task( + description, + total=int(content_length or 0), + start=content_length is not None, + ) + for chunk in response.iter_bytes(): + download.write(chunk) + progress.update(download_task, completed=response.num_bytes_downloaded) + + +def validate_json( + ctx: click.Context, + param: click.Option | click.Parameter, + value: typing.Any, +) -> typing.Any: + if value is None: + return None + + try: + return json.loads(value) + except json.JSONDecodeError: # pragma: no cover + raise click.BadParameter("Not valid JSON") + + +def validate_auth( + ctx: click.Context, + param: click.Option | click.Parameter, + 
value: typing.Any, +) -> typing.Any: + if value == (None, None): + return None + + username, password = value + if password == "-": # pragma: no cover + password = click.prompt("Password", hide_input=True) + return (username, password) + + +def handle_help( + ctx: click.Context, + param: click.Option | click.Parameter, + value: typing.Any, +) -> None: + if not value or ctx.resilient_parsing: + return + + print_help() + ctx.exit() + + +@click.command(add_help_option=False) +@click.argument("url", type=str) +@click.option( + "--method", + "-m", + "method", + type=str, + help=( + "Request method, such as GET, POST, PUT, PATCH, DELETE, OPTIONS, HEAD. " + "[Default: GET, or POST if a request body is included]" + ), +) +@click.option( + "--params", + "-p", + "params", + type=(str, str), + multiple=True, + help="Query parameters to include in the request URL.", +) +@click.option( + "--content", + "-c", + "content", + type=str, + help="Byte content to include in the request body.", +) +@click.option( + "--data", + "-d", + "data", + type=(str, str), + multiple=True, + help="Form data to include in the request body.", +) +@click.option( + "--files", + "-f", + "files", + type=(str, click.File(mode="rb")), + multiple=True, + help="Form files to include in the request body.", +) +@click.option( + "--json", + "-j", + "json", + type=str, + callback=validate_json, + help="JSON data to include in the request body.", +) +@click.option( + "--headers", + "-h", + "headers", + type=(str, str), + multiple=True, + help="Include additional HTTP headers in the request.", +) +@click.option( + "--cookies", + "cookies", + type=(str, str), + multiple=True, + help="Cookies to include in the request.", +) +@click.option( + "--auth", + "auth", + type=(str, str), + default=(None, None), + callback=validate_auth, + help=( + "Username and password to include in the request. " + "Specify '-' for the password to use a password prompt. " + "Note that using --verbose/-v will expose the Authorization header, " + "including the password encoding in a trivially reversible format." + ), +) +@click.option( + "--proxy", + "proxy", + type=str, + default=None, + help="Send the request via a proxy. Should be the URL giving the proxy address.", +) +@click.option( + "--timeout", + "timeout", + type=float, + default=5.0, + help=( + "Timeout value to use for network operations, such as establishing the " + "connection, reading some data, etc... [Default: 5.0]" + ), +) +@click.option( + "--follow-redirects", + "follow_redirects", + is_flag=True, + default=False, + help="Automatically follow redirects.", +) +@click.option( + "--no-verify", + "verify", + is_flag=True, + default=True, + help="Disable SSL verification.", +) +@click.option( + "--http2", + "http2", + type=bool, + is_flag=True, + default=False, + help="Send the request using HTTP/2, if the remote server supports it.", +) +@click.option( + "--download", + type=click.File("wb"), + help="Save the response content as a file, rather than displaying it.", +) +@click.option( + "--verbose", + "-v", + type=bool, + is_flag=True, + default=False, + help="Verbose. 
Show request as well as response.", +) +@click.option( + "--help", + is_flag=True, + is_eager=True, + expose_value=False, + callback=handle_help, + help="Show this message and exit.", +) +def main( + url: str, + method: str, + params: list[tuple[str, str]], + content: str, + data: list[tuple[str, str]], + files: list[tuple[str, click.File]], + json: str, + headers: list[tuple[str, str]], + cookies: list[tuple[str, str]], + auth: tuple[str, str] | None, + proxy: str, + timeout: float, + follow_redirects: bool, + verify: bool, + http2: bool, + download: typing.BinaryIO | None, + verbose: bool, +) -> None: + """ + An HTTP command line client. + Sends a request and displays the response. + """ + if not method: + method = "POST" if content or data or files or json else "GET" + + try: + with Client(proxy=proxy, timeout=timeout, http2=http2, verify=verify) as client: + with client.stream( + method, + url, + params=list(params), + content=content, + data=dict(data), + files=files, # type: ignore + json=json, + headers=headers, + cookies=dict(cookies), + auth=auth, + follow_redirects=follow_redirects, + extensions={"trace": functools.partial(trace, verbose=verbose)}, + ) as response: + if download is not None: + download_response(response, download) + else: + response.read() + if response.content: + print_response(response) + + except RequestError as exc: + console = rich.console.Console() + console.print(f"[red]{type(exc).__name__}[/red]: {exc}") + sys.exit(1) + + sys.exit(0 if response.is_success else 1) diff --git a/env/lib/python3.13/site-packages/httpx/_models.py b/env/lib/python3.13/site-packages/httpx/_models.py new file mode 100644 index 0000000000000000000000000000000000000000..67d74bf86bfc80e22d9a4a3153572845accd9039 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/_models.py @@ -0,0 +1,1277 @@ +from __future__ import annotations + +import codecs +import datetime +import email.message +import json as jsonlib +import re +import typing +import urllib.request +from collections.abc import Mapping +from http.cookiejar import Cookie, CookieJar + +from ._content import ByteStream, UnattachedStream, encode_request, encode_response +from ._decoders import ( + SUPPORTED_DECODERS, + ByteChunker, + ContentDecoder, + IdentityDecoder, + LineDecoder, + MultiDecoder, + TextChunker, + TextDecoder, +) +from ._exceptions import ( + CookieConflict, + HTTPStatusError, + RequestNotRead, + ResponseNotRead, + StreamClosed, + StreamConsumed, + request_context, +) +from ._multipart import get_multipart_boundary_from_content_type +from ._status_codes import codes +from ._types import ( + AsyncByteStream, + CookieTypes, + HeaderTypes, + QueryParamTypes, + RequestContent, + RequestData, + RequestExtensions, + RequestFiles, + ResponseContent, + ResponseExtensions, + SyncByteStream, +) +from ._urls import URL +from ._utils import to_bytes_or_str, to_str + +__all__ = ["Cookies", "Headers", "Request", "Response"] + +SENSITIVE_HEADERS = {"authorization", "proxy-authorization"} + + +def _is_known_encoding(encoding: str) -> bool: + """ + Return `True` if `encoding` is a known codec. + """ + try: + codecs.lookup(encoding) + except LookupError: + return False + return True + + +def _normalize_header_key(key: str | bytes, encoding: str | None = None) -> bytes: + """ + Coerce str/bytes into a strictly byte-wise HTTP header key. 
+ """ + return key if isinstance(key, bytes) else key.encode(encoding or "ascii") + + +def _normalize_header_value(value: str | bytes, encoding: str | None = None) -> bytes: + """ + Coerce str/bytes into a strictly byte-wise HTTP header value. + """ + if isinstance(value, bytes): + return value + if not isinstance(value, str): + raise TypeError(f"Header value must be str or bytes, not {type(value)}") + return value.encode(encoding or "ascii") + + +def _parse_content_type_charset(content_type: str) -> str | None: + # We used to use `cgi.parse_header()` here, but `cgi` became a dead battery. + # See: https://peps.python.org/pep-0594/#cgi + msg = email.message.Message() + msg["content-type"] = content_type + return msg.get_content_charset(failobj=None) + + +def _parse_header_links(value: str) -> list[dict[str, str]]: + """ + Returns a list of parsed link headers, for more info see: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Link + The generic syntax of those is: + Link: < uri-reference >; param1=value1; param2="value2" + So for instance: + Link; '; type="image/jpeg",;' + would return + [ + {"url": "http:/.../front.jpeg", "type": "image/jpeg"}, + {"url": "http://.../back.jpeg"}, + ] + :param value: HTTP Link entity-header field + :return: list of parsed link headers + """ + links: list[dict[str, str]] = [] + replace_chars = " '\"" + value = value.strip(replace_chars) + if not value: + return links + for val in re.split(", *<", value): + try: + url, params = val.split(";", 1) + except ValueError: + url, params = val, "" + link = {"url": url.strip("<> '\"")} + for param in params.split(";"): + try: + key, value = param.split("=") + except ValueError: + break + link[key.strip(replace_chars)] = value.strip(replace_chars) + links.append(link) + return links + + +def _obfuscate_sensitive_headers( + items: typing.Iterable[tuple[typing.AnyStr, typing.AnyStr]], +) -> typing.Iterator[tuple[typing.AnyStr, typing.AnyStr]]: + for k, v in items: + if to_str(k.lower()) in SENSITIVE_HEADERS: + v = to_bytes_or_str("[secure]", match_type_of=v) + yield k, v + + +class Headers(typing.MutableMapping[str, str]): + """ + HTTP headers, as a case-insensitive multi-dict. + """ + + def __init__( + self, + headers: HeaderTypes | None = None, + encoding: str | None = None, + ) -> None: + self._list = [] # type: typing.List[typing.Tuple[bytes, bytes, bytes]] + + if isinstance(headers, Headers): + self._list = list(headers._list) + elif isinstance(headers, Mapping): + for k, v in headers.items(): + bytes_key = _normalize_header_key(k, encoding) + bytes_value = _normalize_header_value(v, encoding) + self._list.append((bytes_key, bytes_key.lower(), bytes_value)) + elif headers is not None: + for k, v in headers: + bytes_key = _normalize_header_key(k, encoding) + bytes_value = _normalize_header_value(v, encoding) + self._list.append((bytes_key, bytes_key.lower(), bytes_value)) + + self._encoding = encoding + + @property + def encoding(self) -> str: + """ + Header encoding is mandated as ascii, but we allow fallbacks to utf-8 + or iso-8859-1. + """ + if self._encoding is None: + for encoding in ["ascii", "utf-8"]: + for key, value in self.raw: + try: + key.decode(encoding) + value.decode(encoding) + except UnicodeDecodeError: + break + else: + # The else block runs if 'break' did not occur, meaning + # all values fitted the encoding. + self._encoding = encoding + break + else: + # The ISO-8859-1 encoding covers all 256 code points in a byte, + # so will never raise decode errors. 
+ self._encoding = "iso-8859-1" + return self._encoding + + @encoding.setter + def encoding(self, value: str) -> None: + self._encoding = value + + @property + def raw(self) -> list[tuple[bytes, bytes]]: + """ + Returns a list of the raw header items, as byte pairs. + """ + return [(raw_key, value) for raw_key, _, value in self._list] + + def keys(self) -> typing.KeysView[str]: + return {key.decode(self.encoding): None for _, key, value in self._list}.keys() + + def values(self) -> typing.ValuesView[str]: + values_dict: dict[str, str] = {} + for _, key, value in self._list: + str_key = key.decode(self.encoding) + str_value = value.decode(self.encoding) + if str_key in values_dict: + values_dict[str_key] += f", {str_value}" + else: + values_dict[str_key] = str_value + return values_dict.values() + + def items(self) -> typing.ItemsView[str, str]: + """ + Return `(key, value)` items of headers. Concatenate headers + into a single comma separated value when a key occurs multiple times. + """ + values_dict: dict[str, str] = {} + for _, key, value in self._list: + str_key = key.decode(self.encoding) + str_value = value.decode(self.encoding) + if str_key in values_dict: + values_dict[str_key] += f", {str_value}" + else: + values_dict[str_key] = str_value + return values_dict.items() + + def multi_items(self) -> list[tuple[str, str]]: + """ + Return a list of `(key, value)` pairs of headers. Allow multiple + occurrences of the same key without concatenating into a single + comma separated value. + """ + return [ + (key.decode(self.encoding), value.decode(self.encoding)) + for _, key, value in self._list + ] + + def get(self, key: str, default: typing.Any = None) -> typing.Any: + """ + Return a header value. If multiple occurrences of the header occur + then concatenate them together with commas. + """ + try: + return self[key] + except KeyError: + return default + + def get_list(self, key: str, split_commas: bool = False) -> list[str]: + """ + Return a list of all header values for a given key. + If `split_commas=True` is passed, then any comma separated header + values are split into multiple return strings. + """ + get_header_key = key.lower().encode(self.encoding) + + values = [ + item_value.decode(self.encoding) + for _, item_key, item_value in self._list + if item_key.lower() == get_header_key + ] + + if not split_commas: + return values + + split_values = [] + for value in values: + split_values.extend([item.strip() for item in value.split(",")]) + return split_values + + def update(self, headers: HeaderTypes | None = None) -> None: # type: ignore + headers = Headers(headers) + for key in headers.keys(): + if key in self: + self.pop(key) + self._list.extend(headers._list) + + def copy(self) -> Headers: + return Headers(self, encoding=self.encoding) + + def __getitem__(self, key: str) -> str: + """ + Return a single header value. + + If there are multiple headers with the same key, then we concatenate + them with commas. See: https://tools.ietf.org/html/rfc7230#section-3.2.2 + """ + normalized_key = key.lower().encode(self.encoding) + + items = [ + header_value.decode(self.encoding) + for _, header_key, header_value in self._list + if header_key == normalized_key + ] + + if items: + return ", ".join(items) + + raise KeyError(key) + + def __setitem__(self, key: str, value: str) -> None: + """ + Set the header `key` to `value`, removing any duplicate entries. + Retains insertion order. 
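# A short illustration of the multi-value behaviour defined above: `__getitem__`
# joins duplicate keys with commas, while `get_list()`/`multi_items()` keep the
# individual occurrences; lookups are case-insensitive.
import httpx

h = httpx.Headers([("Set-Cookie", "a=1"), ("set-cookie", "b=2")])
h["SET-COOKIE"]           # "a=1, b=2"  (comma-joined, per RFC 7230 section 3.2.2)
h.get_list("set-cookie")  # ["a=1", "b=2"]
h.multi_items()           # [("Set-Cookie", "a=1"), ("set-cookie", "b=2")]
"set-cookie" in h         # True
h["set-cookie"] = "c=3"   # collapses the duplicates into a single entry
h.get_list("set-cookie")  # ["c=3"]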
+ """ + set_key = key.encode(self._encoding or "utf-8") + set_value = value.encode(self._encoding or "utf-8") + lookup_key = set_key.lower() + + found_indexes = [ + idx + for idx, (_, item_key, _) in enumerate(self._list) + if item_key == lookup_key + ] + + for idx in reversed(found_indexes[1:]): + del self._list[idx] + + if found_indexes: + idx = found_indexes[0] + self._list[idx] = (set_key, lookup_key, set_value) + else: + self._list.append((set_key, lookup_key, set_value)) + + def __delitem__(self, key: str) -> None: + """ + Remove the header `key`. + """ + del_key = key.lower().encode(self.encoding) + + pop_indexes = [ + idx + for idx, (_, item_key, _) in enumerate(self._list) + if item_key.lower() == del_key + ] + + if not pop_indexes: + raise KeyError(key) + + for idx in reversed(pop_indexes): + del self._list[idx] + + def __contains__(self, key: typing.Any) -> bool: + header_key = key.lower().encode(self.encoding) + return header_key in [key for _, key, _ in self._list] + + def __iter__(self) -> typing.Iterator[typing.Any]: + return iter(self.keys()) + + def __len__(self) -> int: + return len(self._list) + + def __eq__(self, other: typing.Any) -> bool: + try: + other_headers = Headers(other) + except ValueError: + return False + + self_list = [(key, value) for _, key, value in self._list] + other_list = [(key, value) for _, key, value in other_headers._list] + return sorted(self_list) == sorted(other_list) + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + + encoding_str = "" + if self.encoding != "ascii": + encoding_str = f", encoding={self.encoding!r}" + + as_list = list(_obfuscate_sensitive_headers(self.multi_items())) + as_dict = dict(as_list) + + no_duplicate_keys = len(as_dict) == len(as_list) + if no_duplicate_keys: + return f"{class_name}({as_dict!r}{encoding_str})" + return f"{class_name}({as_list!r}{encoding_str})" + + +class Request: + def __init__( + self, + method: str, + url: URL | str, + *, + params: QueryParamTypes | None = None, + headers: HeaderTypes | None = None, + cookies: CookieTypes | None = None, + content: RequestContent | None = None, + data: RequestData | None = None, + files: RequestFiles | None = None, + json: typing.Any | None = None, + stream: SyncByteStream | AsyncByteStream | None = None, + extensions: RequestExtensions | None = None, + ) -> None: + self.method = method.upper() + self.url = URL(url) if params is None else URL(url, params=params) + self.headers = Headers(headers) + self.extensions = {} if extensions is None else dict(extensions) + + if cookies: + Cookies(cookies).set_cookie_header(self) + + if stream is None: + content_type: str | None = self.headers.get("content-type") + headers, stream = encode_request( + content=content, + data=data, + files=files, + json=json, + boundary=get_multipart_boundary_from_content_type( + content_type=content_type.encode(self.headers.encoding) + if content_type + else None + ), + ) + self._prepare(headers) + self.stream = stream + # Load the request body, except for streaming content. + if isinstance(stream, ByteStream): + self.read() + else: + # There's an important distinction between `Request(content=...)`, + # and `Request(stream=...)`. + # + # Using `content=...` implies automatically populated `Host` and content + # headers, of either `Content-Length: ...` or `Transfer-Encoding: chunked`. + # + # Using `stream=...` will not automatically include *any* + # auto-populated headers. + # + # As an end-user you don't really need `stream=...`. 
It's only + # useful when: + # + # * Preserving the request stream when copying requests, eg for redirects. + # * Creating request instances on the *server-side* of the transport API. + self.stream = stream + + def _prepare(self, default_headers: dict[str, str]) -> None: + for key, value in default_headers.items(): + # Ignore Transfer-Encoding if the Content-Length has been set explicitly. + if key.lower() == "transfer-encoding" and "Content-Length" in self.headers: + continue + self.headers.setdefault(key, value) + + auto_headers: list[tuple[bytes, bytes]] = [] + + has_host = "Host" in self.headers + has_content_length = ( + "Content-Length" in self.headers or "Transfer-Encoding" in self.headers + ) + + if not has_host and self.url.host: + auto_headers.append((b"Host", self.url.netloc)) + if not has_content_length and self.method in ("POST", "PUT", "PATCH"): + auto_headers.append((b"Content-Length", b"0")) + + self.headers = Headers(auto_headers + self.headers.raw) + + @property + def content(self) -> bytes: + if not hasattr(self, "_content"): + raise RequestNotRead() + return self._content + + def read(self) -> bytes: + """ + Read and return the request content. + """ + if not hasattr(self, "_content"): + assert isinstance(self.stream, typing.Iterable) + self._content = b"".join(self.stream) + if not isinstance(self.stream, ByteStream): + # If a streaming request has been read entirely into memory, then + # we can replace the stream with a raw bytes implementation, + # to ensure that any non-replayable streams can still be used. + self.stream = ByteStream(self._content) + return self._content + + async def aread(self) -> bytes: + """ + Read and return the request content. + """ + if not hasattr(self, "_content"): + assert isinstance(self.stream, typing.AsyncIterable) + self._content = b"".join([part async for part in self.stream]) + if not isinstance(self.stream, ByteStream): + # If a streaming request has been read entirely into memory, then + # we can replace the stream with a raw bytes implementation, + # to ensure that any non-replayable streams can still be used. + self.stream = ByteStream(self._content) + return self._content + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + url = str(self.url) + return f"<{class_name}({self.method!r}, {url!r})>" + + def __getstate__(self) -> dict[str, typing.Any]: + return { + name: value + for name, value in self.__dict__.items() + if name not in ["extensions", "stream"] + } + + def __setstate__(self, state: dict[str, typing.Any]) -> None: + for name, value in state.items(): + setattr(self, name, value) + self.extensions = {} + self.stream = UnattachedStream() + + +class Response: + def __init__( + self, + status_code: int, + *, + headers: HeaderTypes | None = None, + content: ResponseContent | None = None, + text: str | None = None, + html: str | None = None, + json: typing.Any = None, + stream: SyncByteStream | AsyncByteStream | None = None, + request: Request | None = None, + extensions: ResponseExtensions | None = None, + history: list[Response] | None = None, + default_encoding: str | typing.Callable[[bytes], str] = "utf-8", + ) -> None: + self.status_code = status_code + self.headers = Headers(headers) + + self._request: Request | None = request + + # When follow_redirects=False and a redirect is received, + # the client will set `response.next_request`. 
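# A sketch of the header auto-population described above. An in-memory body gets
# `Host` and `Content-Length` filled in and is read eagerly; an iterator body is
# sent chunked and is only read on demand. (example.org is a placeholder URL.)
import httpx

request = httpx.Request("POST", "https://example.org/upload", content=b"hello")
request.headers["Host"]            # "example.org"  (auto-populated)
request.headers["Content-Length"]  # "5"
request.content                    # b"hello" -- already read, as this is a ByteStream


def body():
    yield b"hello"


streamed = httpx.Request("POST", "https://example.org/upload", content=body())
streamed.headers["Transfer-Encoding"]  # "chunked" -- length unknown up front
streamed.read()                        # b"hello"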
+ self.next_request: Request | None = None + + self.extensions = {} if extensions is None else dict(extensions) + self.history = [] if history is None else list(history) + + self.is_closed = False + self.is_stream_consumed = False + + self.default_encoding = default_encoding + + if stream is None: + headers, stream = encode_response(content, text, html, json) + self._prepare(headers) + self.stream = stream + if isinstance(stream, ByteStream): + # Load the response body, except for streaming content. + self.read() + else: + # There's an important distinction between `Response(content=...)`, + # and `Response(stream=...)`. + # + # Using `content=...` implies automatically populated content headers, + # of either `Content-Length: ...` or `Transfer-Encoding: chunked`. + # + # Using `stream=...` will not automatically include any content headers. + # + # As an end-user you don't really need `stream=...`. It's only + # useful when creating response instances having received a stream + # from the transport API. + self.stream = stream + + self._num_bytes_downloaded = 0 + + def _prepare(self, default_headers: dict[str, str]) -> None: + for key, value in default_headers.items(): + # Ignore Transfer-Encoding if the Content-Length has been set explicitly. + if key.lower() == "transfer-encoding" and "content-length" in self.headers: + continue + self.headers.setdefault(key, value) + + @property + def elapsed(self) -> datetime.timedelta: + """ + Returns the time taken for the complete request/response + cycle to complete. + """ + if not hasattr(self, "_elapsed"): + raise RuntimeError( + "'.elapsed' may only be accessed after the response " + "has been read or closed." + ) + return self._elapsed + + @elapsed.setter + def elapsed(self, elapsed: datetime.timedelta) -> None: + self._elapsed = elapsed + + @property + def request(self) -> Request: + """ + Returns the request instance associated to the current response. + """ + if self._request is None: + raise RuntimeError( + "The request instance has not been set on this response." + ) + return self._request + + @request.setter + def request(self, value: Request) -> None: + self._request = value + + @property + def http_version(self) -> str: + try: + http_version: bytes = self.extensions["http_version"] + except KeyError: + return "HTTP/1.1" + else: + return http_version.decode("ascii", errors="ignore") + + @property + def reason_phrase(self) -> str: + try: + reason_phrase: bytes = self.extensions["reason_phrase"] + except KeyError: + return codes.get_reason_phrase(self.status_code) + else: + return reason_phrase.decode("ascii", errors="ignore") + + @property + def url(self) -> URL: + """ + Returns the URL for which the request was made. + """ + return self.request.url + + @property + def content(self) -> bytes: + if not hasattr(self, "_content"): + raise ResponseNotRead() + return self._content + + @property + def text(self) -> str: + if not hasattr(self, "_text"): + content = self.content + if not content: + self._text = "" + else: + decoder = TextDecoder(encoding=self.encoding or "utf-8") + self._text = "".join([decoder.decode(self.content), decoder.flush()]) + return self._text + + @property + def encoding(self) -> str | None: + """ + Return an encoding to use for decoding the byte content into text. + The priority for determining this is given by... + + * `.encoding = <>` has been set explicitly. + * The encoding as specified by the charset parameter in the Content-Type header. 
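# An example of the encoding resolution described above: a charset parameter on
# the Content-Type header wins over `default_encoding` when it names a known codec.
import httpx

response = httpx.Response(
    200,
    headers={"Content-Type": "text/plain; charset=iso-8859-1"},
    content="caf\u00e9".encode("iso-8859-1"),
)
response.charset_encoding  # "iso-8859-1" -- parsed from the Content-Type header
response.encoding          # "iso-8859-1"
response.text              # "café"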
+ * The encoding as determined by `default_encoding`, which may either be + a string like "utf-8" indicating the encoding to use, or may be a callable + which enables charset autodetection. + """ + if not hasattr(self, "_encoding"): + encoding = self.charset_encoding + if encoding is None or not _is_known_encoding(encoding): + if isinstance(self.default_encoding, str): + encoding = self.default_encoding + elif hasattr(self, "_content"): + encoding = self.default_encoding(self._content) + self._encoding = encoding or "utf-8" + return self._encoding + + @encoding.setter + def encoding(self, value: str) -> None: + """ + Set the encoding to use for decoding the byte content into text. + + If the `text` attribute has been accessed, attempting to set the + encoding will throw a ValueError. + """ + if hasattr(self, "_text"): + raise ValueError( + "Setting encoding after `text` has been accessed is not allowed." + ) + self._encoding = value + + @property + def charset_encoding(self) -> str | None: + """ + Return the encoding, as specified by the Content-Type header. + """ + content_type = self.headers.get("Content-Type") + if content_type is None: + return None + + return _parse_content_type_charset(content_type) + + def _get_content_decoder(self) -> ContentDecoder: + """ + Returns a decoder instance which can be used to decode the raw byte + content, depending on the Content-Encoding used in the response. + """ + if not hasattr(self, "_decoder"): + decoders: list[ContentDecoder] = [] + values = self.headers.get_list("content-encoding", split_commas=True) + for value in values: + value = value.strip().lower() + try: + decoder_cls = SUPPORTED_DECODERS[value] + decoders.append(decoder_cls()) + except KeyError: + continue + + if len(decoders) == 1: + self._decoder = decoders[0] + elif len(decoders) > 1: + self._decoder = MultiDecoder(children=decoders) + else: + self._decoder = IdentityDecoder() + + return self._decoder + + @property + def is_informational(self) -> bool: + """ + A property which is `True` for 1xx status codes, `False` otherwise. + """ + return codes.is_informational(self.status_code) + + @property + def is_success(self) -> bool: + """ + A property which is `True` for 2xx status codes, `False` otherwise. + """ + return codes.is_success(self.status_code) + + @property + def is_redirect(self) -> bool: + """ + A property which is `True` for 3xx status codes, `False` otherwise. + + Note that not all responses with a 3xx status code indicate a URL redirect. + + Use `response.has_redirect_location` to determine responses with a properly + formed URL redirection. + """ + return codes.is_redirect(self.status_code) + + @property + def is_client_error(self) -> bool: + """ + A property which is `True` for 4xx status codes, `False` otherwise. + """ + return codes.is_client_error(self.status_code) + + @property + def is_server_error(self) -> bool: + """ + A property which is `True` for 5xx status codes, `False` otherwise. + """ + return codes.is_server_error(self.status_code) + + @property + def is_error(self) -> bool: + """ + A property which is `True` for 4xx and 5xx status codes, `False` otherwise. + """ + return codes.is_error(self.status_code) + + @property + def has_redirect_location(self) -> bool: + """ + Returns True for 3xx responses with a properly formed URL redirection, + `False` otherwise. + """ + return ( + self.status_code + in ( + # 301 (Cacheable redirect. Method may change to GET.) + codes.MOVED_PERMANENTLY, + # 302 (Uncacheable redirect. Method may change to GET.) 
+ codes.FOUND, + # 303 (Client should make a GET or HEAD request.) + codes.SEE_OTHER, + # 307 (Equiv. 302, but retain method) + codes.TEMPORARY_REDIRECT, + # 308 (Equiv. 301, but retain method) + codes.PERMANENT_REDIRECT, + ) + and "Location" in self.headers + ) + + def raise_for_status(self) -> Response: + """ + Raise the `HTTPStatusError` if one occurred. + """ + request = self._request + if request is None: + raise RuntimeError( + "Cannot call `raise_for_status` as the request " + "instance has not been set on this response." + ) + + if self.is_success: + return self + + if self.has_redirect_location: + message = ( + "{error_type} '{0.status_code} {0.reason_phrase}' for url '{0.url}'\n" + "Redirect location: '{0.headers[location]}'\n" + "For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/{0.status_code}" + ) + else: + message = ( + "{error_type} '{0.status_code} {0.reason_phrase}' for url '{0.url}'\n" + "For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/{0.status_code}" + ) + + status_class = self.status_code // 100 + error_types = { + 1: "Informational response", + 3: "Redirect response", + 4: "Client error", + 5: "Server error", + } + error_type = error_types.get(status_class, "Invalid status code") + message = message.format(self, error_type=error_type) + raise HTTPStatusError(message, request=request, response=self) + + def json(self, **kwargs: typing.Any) -> typing.Any: + return jsonlib.loads(self.content, **kwargs) + + @property + def cookies(self) -> Cookies: + if not hasattr(self, "_cookies"): + self._cookies = Cookies() + self._cookies.extract_cookies(self) + return self._cookies + + @property + def links(self) -> dict[str | None, dict[str, str]]: + """ + Returns the parsed header links of the response, if any + """ + header = self.headers.get("link") + if header is None: + return {} + + return { + (link.get("rel") or link.get("url")): link + for link in _parse_header_links(header) + } + + @property + def num_bytes_downloaded(self) -> int: + return self._num_bytes_downloaded + + def __repr__(self) -> str: + return f"" + + def __getstate__(self) -> dict[str, typing.Any]: + return { + name: value + for name, value in self.__dict__.items() + if name not in ["extensions", "stream", "is_closed", "_decoder"] + } + + def __setstate__(self, state: dict[str, typing.Any]) -> None: + for name, value in state.items(): + setattr(self, name, value) + self.is_closed = True + self.extensions = {} + self.stream = UnattachedStream() + + def read(self) -> bytes: + """ + Read and return the response content. + """ + if not hasattr(self, "_content"): + self._content = b"".join(self.iter_bytes()) + return self._content + + def iter_bytes(self, chunk_size: int | None = None) -> typing.Iterator[bytes]: + """ + A byte-iterator over the decoded response content. + This allows us to handle gzip, deflate, brotli, and zstd encoded responses. 
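# Usage sketch for the status helpers and `raise_for_status()` defined above.
# (The URL is a placeholder.)
import httpx

request = httpx.Request("GET", "https://example.org/missing")
response = httpx.Response(404, request=request)

response.is_client_error  # True
try:
    response.raise_for_status()
except httpx.HTTPStatusError as exc:
    # "Client error '404 Not Found' for url 'https://example.org/missing' ..."
    print(exc)

httpx.Response(204, request=request).raise_for_status()  # success: returns the response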
+ """ + if hasattr(self, "_content"): + chunk_size = len(self._content) if chunk_size is None else chunk_size + for i in range(0, len(self._content), max(chunk_size, 1)): + yield self._content[i : i + chunk_size] + else: + decoder = self._get_content_decoder() + chunker = ByteChunker(chunk_size=chunk_size) + with request_context(request=self._request): + for raw_bytes in self.iter_raw(): + decoded = decoder.decode(raw_bytes) + for chunk in chunker.decode(decoded): + yield chunk + decoded = decoder.flush() + for chunk in chunker.decode(decoded): + yield chunk # pragma: no cover + for chunk in chunker.flush(): + yield chunk + + def iter_text(self, chunk_size: int | None = None) -> typing.Iterator[str]: + """ + A str-iterator over the decoded response content + that handles both gzip, deflate, etc but also detects the content's + string encoding. + """ + decoder = TextDecoder(encoding=self.encoding or "utf-8") + chunker = TextChunker(chunk_size=chunk_size) + with request_context(request=self._request): + for byte_content in self.iter_bytes(): + text_content = decoder.decode(byte_content) + for chunk in chunker.decode(text_content): + yield chunk + text_content = decoder.flush() + for chunk in chunker.decode(text_content): + yield chunk # pragma: no cover + for chunk in chunker.flush(): + yield chunk + + def iter_lines(self) -> typing.Iterator[str]: + decoder = LineDecoder() + with request_context(request=self._request): + for text in self.iter_text(): + for line in decoder.decode(text): + yield line + for line in decoder.flush(): + yield line + + def iter_raw(self, chunk_size: int | None = None) -> typing.Iterator[bytes]: + """ + A byte-iterator over the raw response content. + """ + if self.is_stream_consumed: + raise StreamConsumed() + if self.is_closed: + raise StreamClosed() + if not isinstance(self.stream, SyncByteStream): + raise RuntimeError("Attempted to call a sync iterator on an async stream.") + + self.is_stream_consumed = True + self._num_bytes_downloaded = 0 + chunker = ByteChunker(chunk_size=chunk_size) + + with request_context(request=self._request): + for raw_stream_bytes in self.stream: + self._num_bytes_downloaded += len(raw_stream_bytes) + for chunk in chunker.decode(raw_stream_bytes): + yield chunk + + for chunk in chunker.flush(): + yield chunk + + self.close() + + def close(self) -> None: + """ + Close the response and release the connection. + Automatically called if the response body is read to completion. + """ + if not isinstance(self.stream, SyncByteStream): + raise RuntimeError("Attempted to call an sync close on an async stream.") + + if not self.is_closed: + self.is_closed = True + with request_context(request=self._request): + self.stream.close() + + async def aread(self) -> bytes: + """ + Read and return the response content. + """ + if not hasattr(self, "_content"): + self._content = b"".join([part async for part in self.aiter_bytes()]) + return self._content + + async def aiter_bytes( + self, chunk_size: int | None = None + ) -> typing.AsyncIterator[bytes]: + """ + A byte-iterator over the decoded response content. + This allows us to handle gzip, deflate, brotli, and zstd encoded responses. 
+ """ + if hasattr(self, "_content"): + chunk_size = len(self._content) if chunk_size is None else chunk_size + for i in range(0, len(self._content), max(chunk_size, 1)): + yield self._content[i : i + chunk_size] + else: + decoder = self._get_content_decoder() + chunker = ByteChunker(chunk_size=chunk_size) + with request_context(request=self._request): + async for raw_bytes in self.aiter_raw(): + decoded = decoder.decode(raw_bytes) + for chunk in chunker.decode(decoded): + yield chunk + decoded = decoder.flush() + for chunk in chunker.decode(decoded): + yield chunk # pragma: no cover + for chunk in chunker.flush(): + yield chunk + + async def aiter_text( + self, chunk_size: int | None = None + ) -> typing.AsyncIterator[str]: + """ + A str-iterator over the decoded response content + that handles both gzip, deflate, etc but also detects the content's + string encoding. + """ + decoder = TextDecoder(encoding=self.encoding or "utf-8") + chunker = TextChunker(chunk_size=chunk_size) + with request_context(request=self._request): + async for byte_content in self.aiter_bytes(): + text_content = decoder.decode(byte_content) + for chunk in chunker.decode(text_content): + yield chunk + text_content = decoder.flush() + for chunk in chunker.decode(text_content): + yield chunk # pragma: no cover + for chunk in chunker.flush(): + yield chunk + + async def aiter_lines(self) -> typing.AsyncIterator[str]: + decoder = LineDecoder() + with request_context(request=self._request): + async for text in self.aiter_text(): + for line in decoder.decode(text): + yield line + for line in decoder.flush(): + yield line + + async def aiter_raw( + self, chunk_size: int | None = None + ) -> typing.AsyncIterator[bytes]: + """ + A byte-iterator over the raw response content. + """ + if self.is_stream_consumed: + raise StreamConsumed() + if self.is_closed: + raise StreamClosed() + if not isinstance(self.stream, AsyncByteStream): + raise RuntimeError("Attempted to call an async iterator on an sync stream.") + + self.is_stream_consumed = True + self._num_bytes_downloaded = 0 + chunker = ByteChunker(chunk_size=chunk_size) + + with request_context(request=self._request): + async for raw_stream_bytes in self.stream: + self._num_bytes_downloaded += len(raw_stream_bytes) + for chunk in chunker.decode(raw_stream_bytes): + yield chunk + + for chunk in chunker.flush(): + yield chunk + + await self.aclose() + + async def aclose(self) -> None: + """ + Close the response and release the connection. + Automatically called if the response body is read to completion. + """ + if not isinstance(self.stream, AsyncByteStream): + raise RuntimeError("Attempted to call an async close on an sync stream.") + + if not self.is_closed: + self.is_closed = True + with request_context(request=self._request): + await self.stream.aclose() + + +class Cookies(typing.MutableMapping[str, str]): + """ + HTTP Cookies, as a mutable mapping. + """ + + def __init__(self, cookies: CookieTypes | None = None) -> None: + if cookies is None or isinstance(cookies, dict): + self.jar = CookieJar() + if isinstance(cookies, dict): + for key, value in cookies.items(): + self.set(key, value) + elif isinstance(cookies, list): + self.jar = CookieJar() + for key, value in cookies: + self.set(key, value) + elif isinstance(cookies, Cookies): + self.jar = CookieJar() + for cookie in cookies.jar: + self.jar.set_cookie(cookie) + else: + self.jar = cookies + + def extract_cookies(self, response: Response) -> None: + """ + Loads any cookies based on the response `Set-Cookie` headers. 
+ """ + urllib_response = self._CookieCompatResponse(response) + urllib_request = self._CookieCompatRequest(response.request) + + self.jar.extract_cookies(urllib_response, urllib_request) # type: ignore + + def set_cookie_header(self, request: Request) -> None: + """ + Sets an appropriate 'Cookie:' HTTP header on the `Request`. + """ + urllib_request = self._CookieCompatRequest(request) + self.jar.add_cookie_header(urllib_request) + + def set(self, name: str, value: str, domain: str = "", path: str = "/") -> None: + """ + Set a cookie value by name. May optionally include domain and path. + """ + kwargs = { + "version": 0, + "name": name, + "value": value, + "port": None, + "port_specified": False, + "domain": domain, + "domain_specified": bool(domain), + "domain_initial_dot": domain.startswith("."), + "path": path, + "path_specified": bool(path), + "secure": False, + "expires": None, + "discard": True, + "comment": None, + "comment_url": None, + "rest": {"HttpOnly": None}, + "rfc2109": False, + } + cookie = Cookie(**kwargs) # type: ignore + self.jar.set_cookie(cookie) + + def get( # type: ignore + self, + name: str, + default: str | None = None, + domain: str | None = None, + path: str | None = None, + ) -> str | None: + """ + Get a cookie by name. May optionally include domain and path + in order to specify exactly which cookie to retrieve. + """ + value = None + for cookie in self.jar: + if cookie.name == name: + if domain is None or cookie.domain == domain: + if path is None or cookie.path == path: + if value is not None: + message = f"Multiple cookies exist with name={name}" + raise CookieConflict(message) + value = cookie.value + + if value is None: + return default + return value + + def delete( + self, + name: str, + domain: str | None = None, + path: str | None = None, + ) -> None: + """ + Delete a cookie by name. May optionally include domain and path + in order to specify exactly which cookie to delete. + """ + if domain is not None and path is not None: + return self.jar.clear(domain, path, name) + + remove = [ + cookie + for cookie in self.jar + if cookie.name == name + and (domain is None or cookie.domain == domain) + and (path is None or cookie.path == path) + ] + + for cookie in remove: + self.jar.clear(cookie.domain, cookie.path, cookie.name) + + def clear(self, domain: str | None = None, path: str | None = None) -> None: + """ + Delete all cookies. Optionally include a domain and path in + order to only delete a subset of all the cookies. 
+ """ + args = [] + if domain is not None: + args.append(domain) + if path is not None: + assert domain is not None + args.append(path) + self.jar.clear(*args) + + def update(self, cookies: CookieTypes | None = None) -> None: # type: ignore + cookies = Cookies(cookies) + for cookie in cookies.jar: + self.jar.set_cookie(cookie) + + def __setitem__(self, name: str, value: str) -> None: + return self.set(name, value) + + def __getitem__(self, name: str) -> str: + value = self.get(name) + if value is None: + raise KeyError(name) + return value + + def __delitem__(self, name: str) -> None: + return self.delete(name) + + def __len__(self) -> int: + return len(self.jar) + + def __iter__(self) -> typing.Iterator[str]: + return (cookie.name for cookie in self.jar) + + def __bool__(self) -> bool: + for _ in self.jar: + return True + return False + + def __repr__(self) -> str: + cookies_repr = ", ".join( + [ + f"" + for cookie in self.jar + ] + ) + + return f"" + + class _CookieCompatRequest(urllib.request.Request): + """ + Wraps a `Request` instance up in a compatibility interface suitable + for use with `CookieJar` operations. + """ + + def __init__(self, request: Request) -> None: + super().__init__( + url=str(request.url), + headers=dict(request.headers), + method=request.method, + ) + self.request = request + + def add_unredirected_header(self, key: str, value: str) -> None: + super().add_unredirected_header(key, value) + self.request.headers[key] = value + + class _CookieCompatResponse: + """ + Wraps a `Request` instance up in a compatibility interface suitable + for use with `CookieJar` operations. + """ + + def __init__(self, response: Response) -> None: + self.response = response + + def info(self) -> email.message.Message: + info = email.message.Message() + for key, value in self.response.headers.multi_items(): + # Note that setting `info[key]` here is an "append" operation, + # not a "replace" operation. + # https://docs.python.org/3/library/email.compat32-message.html#email.message.Message.__setitem__ + info[key] = value + return info diff --git a/env/lib/python3.13/site-packages/httpx/_multipart.py b/env/lib/python3.13/site-packages/httpx/_multipart.py new file mode 100644 index 0000000000000000000000000000000000000000..b4761af9b2cf384de5189269927d781a700dbe46 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/_multipart.py @@ -0,0 +1,300 @@ +from __future__ import annotations + +import io +import mimetypes +import os +import re +import typing +from pathlib import Path + +from ._types import ( + AsyncByteStream, + FileContent, + FileTypes, + RequestData, + RequestFiles, + SyncByteStream, +) +from ._utils import ( + peek_filelike_length, + primitive_value_to_str, + to_bytes, +) + +_HTML5_FORM_ENCODING_REPLACEMENTS = {'"': "%22", "\\": "\\\\"} +_HTML5_FORM_ENCODING_REPLACEMENTS.update( + {chr(c): "%{:02X}".format(c) for c in range(0x1F + 1) if c != 0x1B} +) +_HTML5_FORM_ENCODING_RE = re.compile( + r"|".join([re.escape(c) for c in _HTML5_FORM_ENCODING_REPLACEMENTS.keys()]) +) + + +def _format_form_param(name: str, value: str) -> bytes: + """ + Encode a name/value pair within a multipart form. + """ + + def replacer(match: typing.Match[str]) -> str: + return _HTML5_FORM_ENCODING_REPLACEMENTS[match.group(0)] + + value = _HTML5_FORM_ENCODING_RE.sub(replacer, value) + return f'{name}="{value}"'.encode() + + +def _guess_content_type(filename: str | None) -> str | None: + """ + Guesses the mimetype based on a filename. Defaults to `application/octet-stream`. 
+ + Returns `None` if `filename` is `None` or empty. + """ + if filename: + return mimetypes.guess_type(filename)[0] or "application/octet-stream" + return None + + +def get_multipart_boundary_from_content_type( + content_type: bytes | None, +) -> bytes | None: + if not content_type or not content_type.startswith(b"multipart/form-data"): + return None + # parse boundary according to + # https://www.rfc-editor.org/rfc/rfc2046#section-5.1.1 + if b";" in content_type: + for section in content_type.split(b";"): + if section.strip().lower().startswith(b"boundary="): + return section.strip()[len(b"boundary=") :].strip(b'"') + return None + + +class DataField: + """ + A single form field item, within a multipart form field. + """ + + def __init__(self, name: str, value: str | bytes | int | float | None) -> None: + if not isinstance(name, str): + raise TypeError( + f"Invalid type for name. Expected str, got {type(name)}: {name!r}" + ) + if value is not None and not isinstance(value, (str, bytes, int, float)): + raise TypeError( + "Invalid type for value. Expected primitive type," + f" got {type(value)}: {value!r}" + ) + self.name = name + self.value: str | bytes = ( + value if isinstance(value, bytes) else primitive_value_to_str(value) + ) + + def render_headers(self) -> bytes: + if not hasattr(self, "_headers"): + name = _format_form_param("name", self.name) + self._headers = b"".join( + [b"Content-Disposition: form-data; ", name, b"\r\n\r\n"] + ) + + return self._headers + + def render_data(self) -> bytes: + if not hasattr(self, "_data"): + self._data = to_bytes(self.value) + + return self._data + + def get_length(self) -> int: + headers = self.render_headers() + data = self.render_data() + return len(headers) + len(data) + + def render(self) -> typing.Iterator[bytes]: + yield self.render_headers() + yield self.render_data() + + +class FileField: + """ + A single file field item, within a multipart form field. + """ + + CHUNK_SIZE = 64 * 1024 + + def __init__(self, name: str, value: FileTypes) -> None: + self.name = name + + fileobj: FileContent + + headers: dict[str, str] = {} + content_type: str | None = None + + # This large tuple based API largely mirror's requests' API + # It would be good to think of better APIs for this that we could + # include in httpx 2.0 since variable length tuples(especially of 4 elements) + # are quite unwieldly + if isinstance(value, tuple): + if len(value) == 2: + # neither the 3rd parameter (content_type) nor the 4th (headers) + # was included + filename, fileobj = value + elif len(value) == 3: + filename, fileobj, content_type = value + else: + # all 4 parameters included + filename, fileobj, content_type, headers = value # type: ignore + else: + filename = Path(str(getattr(value, "name", "upload"))).name + fileobj = value + + if content_type is None: + content_type = _guess_content_type(filename) + + has_content_type_header = any("content-type" in key.lower() for key in headers) + if content_type is not None and not has_content_type_header: + # note that unlike requests, we ignore the content_type provided in the 3rd + # tuple element if it is also included in the headers requests does + # the opposite (it overwrites the headerwith the 3rd tuple element) + headers["Content-Type"] = content_type + + if isinstance(fileobj, io.StringIO): + raise TypeError( + "Multipart file uploads require 'io.BytesIO', not 'io.StringIO'." + ) + if isinstance(fileobj, io.TextIOBase): + raise TypeError( + "Multipart file uploads must be opened in binary mode, not text mode." 
+ ) + + self.filename = filename + self.file = fileobj + self.headers = headers + + def get_length(self) -> int | None: + headers = self.render_headers() + + if isinstance(self.file, (str, bytes)): + return len(headers) + len(to_bytes(self.file)) + + file_length = peek_filelike_length(self.file) + + # If we can't determine the filesize without reading it into memory, + # then return `None` here, to indicate an unknown file length. + if file_length is None: + return None + + return len(headers) + file_length + + def render_headers(self) -> bytes: + if not hasattr(self, "_headers"): + parts = [ + b"Content-Disposition: form-data; ", + _format_form_param("name", self.name), + ] + if self.filename: + filename = _format_form_param("filename", self.filename) + parts.extend([b"; ", filename]) + for header_name, header_value in self.headers.items(): + key, val = f"\r\n{header_name}: ".encode(), header_value.encode() + parts.extend([key, val]) + parts.append(b"\r\n\r\n") + self._headers = b"".join(parts) + + return self._headers + + def render_data(self) -> typing.Iterator[bytes]: + if isinstance(self.file, (str, bytes)): + yield to_bytes(self.file) + return + + if hasattr(self.file, "seek"): + try: + self.file.seek(0) + except io.UnsupportedOperation: + pass + + chunk = self.file.read(self.CHUNK_SIZE) + while chunk: + yield to_bytes(chunk) + chunk = self.file.read(self.CHUNK_SIZE) + + def render(self) -> typing.Iterator[bytes]: + yield self.render_headers() + yield from self.render_data() + + +class MultipartStream(SyncByteStream, AsyncByteStream): + """ + Request content as streaming multipart encoded form data. + """ + + def __init__( + self, + data: RequestData, + files: RequestFiles, + boundary: bytes | None = None, + ) -> None: + if boundary is None: + boundary = os.urandom(16).hex().encode("ascii") + + self.boundary = boundary + self.content_type = "multipart/form-data; boundary=%s" % boundary.decode( + "ascii" + ) + self.fields = list(self._iter_fields(data, files)) + + def _iter_fields( + self, data: RequestData, files: RequestFiles + ) -> typing.Iterator[FileField | DataField]: + for name, value in data.items(): + if isinstance(value, (tuple, list)): + for item in value: + yield DataField(name=name, value=item) + else: + yield DataField(name=name, value=value) + + file_items = files.items() if isinstance(files, typing.Mapping) else files + for name, value in file_items: + yield FileField(name=name, value=value) + + def iter_chunks(self) -> typing.Iterator[bytes]: + for field in self.fields: + yield b"--%s\r\n" % self.boundary + yield from field.render() + yield b"\r\n" + yield b"--%s--\r\n" % self.boundary + + def get_content_length(self) -> int | None: + """ + Return the length of the multipart encoded content, or `None` if + any of the files have a length that cannot be determined upfront. + """ + boundary_length = len(self.boundary) + length = 0 + + for field in self.fields: + field_length = field.get_length() + if field_length is None: + return None + + length += 2 + boundary_length + 2 # b"--{boundary}\r\n" + length += field_length + length += 2 # b"\r\n" + + length += 2 + boundary_length + 4 # b"--{boundary}--\r\n" + return length + + # Content stream interface. 
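# The stream class above can be exercised directly to see how the body is framed.
# (MultipartStream lives in this private module; it is not public API.)
from httpx._multipart import MultipartStream

stream = MultipartStream(
    data={"note": "hi"},
    files={"file": ("a.txt", b"abc", "text/plain")},
    boundary=b"xyz",
)
stream.get_content_length()  # an int, since every field length is known up front
b"".join(stream.iter_chunks())
# Roughly:
# b'--xyz\r\nContent-Disposition: form-data; name="note"\r\n\r\nhi\r\n'
# b'--xyz\r\nContent-Disposition: form-data; name="file"; filename="a.txt"\r\n'
# b'Content-Type: text/plain\r\n\r\nabc\r\n--xyz--\r\n'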
+ + def get_headers(self) -> dict[str, str]: + content_length = self.get_content_length() + content_type = self.content_type + if content_length is None: + return {"Transfer-Encoding": "chunked", "Content-Type": content_type} + return {"Content-Length": str(content_length), "Content-Type": content_type} + + def __iter__(self) -> typing.Iterator[bytes]: + for chunk in self.iter_chunks(): + yield chunk + + async def __aiter__(self) -> typing.AsyncIterator[bytes]: + for chunk in self.iter_chunks(): + yield chunk diff --git a/env/lib/python3.13/site-packages/httpx/_status_codes.py b/env/lib/python3.13/site-packages/httpx/_status_codes.py new file mode 100644 index 0000000000000000000000000000000000000000..133a6231a5b53fd2f073799ca1bd07c50abe40ae --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/_status_codes.py @@ -0,0 +1,162 @@ +from __future__ import annotations + +from enum import IntEnum + +__all__ = ["codes"] + + +class codes(IntEnum): + """HTTP status codes and reason phrases + + Status codes from the following RFCs are all observed: + + * RFC 7231: Hypertext Transfer Protocol (HTTP/1.1), obsoletes 2616 + * RFC 6585: Additional HTTP Status Codes + * RFC 3229: Delta encoding in HTTP + * RFC 4918: HTTP Extensions for WebDAV, obsoletes 2518 + * RFC 5842: Binding Extensions to WebDAV + * RFC 7238: Permanent Redirect + * RFC 2295: Transparent Content Negotiation in HTTP + * RFC 2774: An HTTP Extension Framework + * RFC 7540: Hypertext Transfer Protocol Version 2 (HTTP/2) + * RFC 2324: Hyper Text Coffee Pot Control Protocol (HTCPCP/1.0) + * RFC 7725: An HTTP Status Code to Report Legal Obstacles + * RFC 8297: An HTTP Status Code for Indicating Hints + * RFC 8470: Using Early Data in HTTP + """ + + def __new__(cls, value: int, phrase: str = "") -> codes: + obj = int.__new__(cls, value) + obj._value_ = value + + obj.phrase = phrase # type: ignore[attr-defined] + return obj + + def __str__(self) -> str: + return str(self.value) + + @classmethod + def get_reason_phrase(cls, value: int) -> str: + try: + return codes(value).phrase # type: ignore + except ValueError: + return "" + + @classmethod + def is_informational(cls, value: int) -> bool: + """ + Returns `True` for 1xx status codes, `False` otherwise. + """ + return 100 <= value <= 199 + + @classmethod + def is_success(cls, value: int) -> bool: + """ + Returns `True` for 2xx status codes, `False` otherwise. + """ + return 200 <= value <= 299 + + @classmethod + def is_redirect(cls, value: int) -> bool: + """ + Returns `True` for 3xx status codes, `False` otherwise. + """ + return 300 <= value <= 399 + + @classmethod + def is_client_error(cls, value: int) -> bool: + """ + Returns `True` for 4xx status codes, `False` otherwise. + """ + return 400 <= value <= 499 + + @classmethod + def is_server_error(cls, value: int) -> bool: + """ + Returns `True` for 5xx status codes, `False` otherwise. + """ + return 500 <= value <= 599 + + @classmethod + def is_error(cls, value: int) -> bool: + """ + Returns `True` for 4xx or 5xx status codes, `False` otherwise. 
+ """ + return 400 <= value <= 599 + + # informational + CONTINUE = 100, "Continue" + SWITCHING_PROTOCOLS = 101, "Switching Protocols" + PROCESSING = 102, "Processing" + EARLY_HINTS = 103, "Early Hints" + + # success + OK = 200, "OK" + CREATED = 201, "Created" + ACCEPTED = 202, "Accepted" + NON_AUTHORITATIVE_INFORMATION = 203, "Non-Authoritative Information" + NO_CONTENT = 204, "No Content" + RESET_CONTENT = 205, "Reset Content" + PARTIAL_CONTENT = 206, "Partial Content" + MULTI_STATUS = 207, "Multi-Status" + ALREADY_REPORTED = 208, "Already Reported" + IM_USED = 226, "IM Used" + + # redirection + MULTIPLE_CHOICES = 300, "Multiple Choices" + MOVED_PERMANENTLY = 301, "Moved Permanently" + FOUND = 302, "Found" + SEE_OTHER = 303, "See Other" + NOT_MODIFIED = 304, "Not Modified" + USE_PROXY = 305, "Use Proxy" + TEMPORARY_REDIRECT = 307, "Temporary Redirect" + PERMANENT_REDIRECT = 308, "Permanent Redirect" + + # client error + BAD_REQUEST = 400, "Bad Request" + UNAUTHORIZED = 401, "Unauthorized" + PAYMENT_REQUIRED = 402, "Payment Required" + FORBIDDEN = 403, "Forbidden" + NOT_FOUND = 404, "Not Found" + METHOD_NOT_ALLOWED = 405, "Method Not Allowed" + NOT_ACCEPTABLE = 406, "Not Acceptable" + PROXY_AUTHENTICATION_REQUIRED = 407, "Proxy Authentication Required" + REQUEST_TIMEOUT = 408, "Request Timeout" + CONFLICT = 409, "Conflict" + GONE = 410, "Gone" + LENGTH_REQUIRED = 411, "Length Required" + PRECONDITION_FAILED = 412, "Precondition Failed" + REQUEST_ENTITY_TOO_LARGE = 413, "Request Entity Too Large" + REQUEST_URI_TOO_LONG = 414, "Request-URI Too Long" + UNSUPPORTED_MEDIA_TYPE = 415, "Unsupported Media Type" + REQUESTED_RANGE_NOT_SATISFIABLE = 416, "Requested Range Not Satisfiable" + EXPECTATION_FAILED = 417, "Expectation Failed" + IM_A_TEAPOT = 418, "I'm a teapot" + MISDIRECTED_REQUEST = 421, "Misdirected Request" + UNPROCESSABLE_ENTITY = 422, "Unprocessable Entity" + LOCKED = 423, "Locked" + FAILED_DEPENDENCY = 424, "Failed Dependency" + TOO_EARLY = 425, "Too Early" + UPGRADE_REQUIRED = 426, "Upgrade Required" + PRECONDITION_REQUIRED = 428, "Precondition Required" + TOO_MANY_REQUESTS = 429, "Too Many Requests" + REQUEST_HEADER_FIELDS_TOO_LARGE = 431, "Request Header Fields Too Large" + UNAVAILABLE_FOR_LEGAL_REASONS = 451, "Unavailable For Legal Reasons" + + # server errors + INTERNAL_SERVER_ERROR = 500, "Internal Server Error" + NOT_IMPLEMENTED = 501, "Not Implemented" + BAD_GATEWAY = 502, "Bad Gateway" + SERVICE_UNAVAILABLE = 503, "Service Unavailable" + GATEWAY_TIMEOUT = 504, "Gateway Timeout" + HTTP_VERSION_NOT_SUPPORTED = 505, "HTTP Version Not Supported" + VARIANT_ALSO_NEGOTIATES = 506, "Variant Also Negotiates" + INSUFFICIENT_STORAGE = 507, "Insufficient Storage" + LOOP_DETECTED = 508, "Loop Detected" + NOT_EXTENDED = 510, "Not Extended" + NETWORK_AUTHENTICATION_REQUIRED = 511, "Network Authentication Required" + + +# Include lower-case styles for `requests` compatibility. +for code in codes: + setattr(codes, code._name_.lower(), int(code)) diff --git a/env/lib/python3.13/site-packages/httpx/_types.py b/env/lib/python3.13/site-packages/httpx/_types.py new file mode 100644 index 0000000000000000000000000000000000000000..704dfdffc8ba61eb913fa918072381e410b23c00 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/_types.py @@ -0,0 +1,114 @@ +""" +Type definitions for type checking purposes. 
+""" + +from http.cookiejar import CookieJar +from typing import ( + IO, + TYPE_CHECKING, + Any, + AsyncIterable, + AsyncIterator, + Callable, + Dict, + Iterable, + Iterator, + List, + Mapping, + Optional, + Sequence, + Tuple, + Union, +) + +if TYPE_CHECKING: # pragma: no cover + from ._auth import Auth # noqa: F401 + from ._config import Proxy, Timeout # noqa: F401 + from ._models import Cookies, Headers, Request # noqa: F401 + from ._urls import URL, QueryParams # noqa: F401 + + +PrimitiveData = Optional[Union[str, int, float, bool]] + +URLTypes = Union["URL", str] + +QueryParamTypes = Union[ + "QueryParams", + Mapping[str, Union[PrimitiveData, Sequence[PrimitiveData]]], + List[Tuple[str, PrimitiveData]], + Tuple[Tuple[str, PrimitiveData], ...], + str, + bytes, +] + +HeaderTypes = Union[ + "Headers", + Mapping[str, str], + Mapping[bytes, bytes], + Sequence[Tuple[str, str]], + Sequence[Tuple[bytes, bytes]], +] + +CookieTypes = Union["Cookies", CookieJar, Dict[str, str], List[Tuple[str, str]]] + +TimeoutTypes = Union[ + Optional[float], + Tuple[Optional[float], Optional[float], Optional[float], Optional[float]], + "Timeout", +] +ProxyTypes = Union["URL", str, "Proxy"] +CertTypes = Union[str, Tuple[str, str], Tuple[str, str, str]] + +AuthTypes = Union[ + Tuple[Union[str, bytes], Union[str, bytes]], + Callable[["Request"], "Request"], + "Auth", +] + +RequestContent = Union[str, bytes, Iterable[bytes], AsyncIterable[bytes]] +ResponseContent = Union[str, bytes, Iterable[bytes], AsyncIterable[bytes]] +ResponseExtensions = Mapping[str, Any] + +RequestData = Mapping[str, Any] + +FileContent = Union[IO[bytes], bytes, str] +FileTypes = Union[ + # file (or bytes) + FileContent, + # (filename, file (or bytes)) + Tuple[Optional[str], FileContent], + # (filename, file (or bytes), content_type) + Tuple[Optional[str], FileContent, Optional[str]], + # (filename, file (or bytes), content_type, headers) + Tuple[Optional[str], FileContent, Optional[str], Mapping[str, str]], +] +RequestFiles = Union[Mapping[str, FileTypes], Sequence[Tuple[str, FileTypes]]] + +RequestExtensions = Mapping[str, Any] + +__all__ = ["AsyncByteStream", "SyncByteStream"] + + +class SyncByteStream: + def __iter__(self) -> Iterator[bytes]: + raise NotImplementedError( + "The '__iter__' method must be implemented." + ) # pragma: no cover + yield b"" # pragma: no cover + + def close(self) -> None: + """ + Subclasses can override this method to release any network resources + after a request/response cycle is complete. + """ + + +class AsyncByteStream: + async def __aiter__(self) -> AsyncIterator[bytes]: + raise NotImplementedError( + "The '__aiter__' method must be implemented." + ) # pragma: no cover + yield b"" # pragma: no cover + + async def aclose(self) -> None: + pass diff --git a/env/lib/python3.13/site-packages/httpx/_urlparse.py b/env/lib/python3.13/site-packages/httpx/_urlparse.py new file mode 100644 index 0000000000000000000000000000000000000000..bf190fd560ee4fc8a11af371a15fc5f1dc284d34 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/_urlparse.py @@ -0,0 +1,527 @@ +""" +An implementation of `urlparse` that provides URL validation and normalization +as described by RFC3986. + +We rely on this implementation rather than the one in Python's stdlib, because: + +* It provides more complete URL validation. +* It properly differentiates between an empty querystring and an absent querystring, + to distinguish URLs with a trailing '?'. +* It handles scheme, hostname, port, and path normalization. 
+* It supports IDNA hostnames, normalizing them to their encoded form. +* The API supports passing individual components, as well as the complete URL string. + +Previously we relied on the excellent `rfc3986` package to handle URL parsing and +validation, but this module provides a simpler alternative, with less indirection +required. +""" + +from __future__ import annotations + +import ipaddress +import re +import typing + +import idna + +from ._exceptions import InvalidURL + +MAX_URL_LENGTH = 65536 + +# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3 +UNRESERVED_CHARACTERS = ( + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~" +) +SUB_DELIMS = "!$&'()*+,;=" + +PERCENT_ENCODED_REGEX = re.compile("%[A-Fa-f0-9]{2}") + +# https://url.spec.whatwg.org/#percent-encoded-bytes + +# The fragment percent-encode set is the C0 control percent-encode set +# and U+0020 SPACE, U+0022 ("), U+003C (<), U+003E (>), and U+0060 (`). +FRAG_SAFE = "".join( + [chr(i) for i in range(0x20, 0x7F) if i not in (0x20, 0x22, 0x3C, 0x3E, 0x60)] +) + +# The query percent-encode set is the C0 control percent-encode set +# and U+0020 SPACE, U+0022 ("), U+0023 (#), U+003C (<), and U+003E (>). +QUERY_SAFE = "".join( + [chr(i) for i in range(0x20, 0x7F) if i not in (0x20, 0x22, 0x23, 0x3C, 0x3E)] +) + +# The path percent-encode set is the query percent-encode set +# and U+003F (?), U+0060 (`), U+007B ({), and U+007D (}). +PATH_SAFE = "".join( + [ + chr(i) + for i in range(0x20, 0x7F) + if i not in (0x20, 0x22, 0x23, 0x3C, 0x3E) + (0x3F, 0x60, 0x7B, 0x7D) + ] +) + +# The userinfo percent-encode set is the path percent-encode set +# and U+002F (/), U+003A (:), U+003B (;), U+003D (=), U+0040 (@), +# U+005B ([) to U+005E (^), inclusive, and U+007C (|). +USERNAME_SAFE = "".join( + [ + chr(i) + for i in range(0x20, 0x7F) + if i + not in (0x20, 0x22, 0x23, 0x3C, 0x3E) + + (0x3F, 0x60, 0x7B, 0x7D) + + (0x2F, 0x3A, 0x3B, 0x3D, 0x40, 0x5B, 0x5C, 0x5D, 0x5E, 0x7C) + ] +) +PASSWORD_SAFE = "".join( + [ + chr(i) + for i in range(0x20, 0x7F) + if i + not in (0x20, 0x22, 0x23, 0x3C, 0x3E) + + (0x3F, 0x60, 0x7B, 0x7D) + + (0x2F, 0x3A, 0x3B, 0x3D, 0x40, 0x5B, 0x5C, 0x5D, 0x5E, 0x7C) + ] +) +# Note... The terminology 'userinfo' percent-encode set in the WHATWG document +# is used for the username and password quoting. For the joint userinfo component +# we remove U+003A (:) from the safe set. +USERINFO_SAFE = "".join( + [ + chr(i) + for i in range(0x20, 0x7F) + if i + not in (0x20, 0x22, 0x23, 0x3C, 0x3E) + + (0x3F, 0x60, 0x7B, 0x7D) + + (0x2F, 0x3B, 0x3D, 0x40, 0x5B, 0x5C, 0x5D, 0x5E, 0x7C) + ] +) + + +# {scheme}: (optional) +# //{authority} (optional) +# {path} +# ?{query} (optional) +# #{fragment} (optional) +URL_REGEX = re.compile( + ( + r"(?:(?P{scheme}):)?" + r"(?://(?P{authority}))?" + r"(?P{path})" + r"(?:\?(?P{query}))?" + r"(?:#(?P{fragment}))?" + ).format( + scheme="([a-zA-Z][a-zA-Z0-9+.-]*)?", + authority="[^/?#]*", + path="[^?#]*", + query="[^#]*", + fragment=".*", + ) +) + +# {userinfo}@ (optional) +# {host} +# :{port} (optional) +AUTHORITY_REGEX = re.compile( + ( + r"(?:(?P{userinfo})@)?" r"(?P{host})" r":?(?P{port})?" + ).format( + userinfo=".*", # Any character sequence. + host="(\\[.*\\]|[^:@]*)", # Either any character sequence excluding ':' or '@', + # or an IPv6 address enclosed within square brackets. + port=".*", # Any character sequence. + ) +) + + +# If we call urlparse with an individual component, then we need to regex +# validate that component individually. 
+# Note that we're duplicating the same strings as above. Shock! Horror!! +COMPONENT_REGEX = { + "scheme": re.compile("([a-zA-Z][a-zA-Z0-9+.-]*)?"), + "authority": re.compile("[^/?#]*"), + "path": re.compile("[^?#]*"), + "query": re.compile("[^#]*"), + "fragment": re.compile(".*"), + "userinfo": re.compile("[^@]*"), + "host": re.compile("(\\[.*\\]|[^:]*)"), + "port": re.compile(".*"), +} + + +# We use these simple regexs as a first pass before handing off to +# the stdlib 'ipaddress' module for IP address validation. +IPv4_STYLE_HOSTNAME = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$") +IPv6_STYLE_HOSTNAME = re.compile(r"^\[.*\]$") + + +class ParseResult(typing.NamedTuple): + scheme: str + userinfo: str + host: str + port: int | None + path: str + query: str | None + fragment: str | None + + @property + def authority(self) -> str: + return "".join( + [ + f"{self.userinfo}@" if self.userinfo else "", + f"[{self.host}]" if ":" in self.host else self.host, + f":{self.port}" if self.port is not None else "", + ] + ) + + @property + def netloc(self) -> str: + return "".join( + [ + f"[{self.host}]" if ":" in self.host else self.host, + f":{self.port}" if self.port is not None else "", + ] + ) + + def copy_with(self, **kwargs: str | None) -> ParseResult: + if not kwargs: + return self + + defaults = { + "scheme": self.scheme, + "authority": self.authority, + "path": self.path, + "query": self.query, + "fragment": self.fragment, + } + defaults.update(kwargs) + return urlparse("", **defaults) + + def __str__(self) -> str: + authority = self.authority + return "".join( + [ + f"{self.scheme}:" if self.scheme else "", + f"//{authority}" if authority else "", + self.path, + f"?{self.query}" if self.query is not None else "", + f"#{self.fragment}" if self.fragment is not None else "", + ] + ) + + +def urlparse(url: str = "", **kwargs: str | None) -> ParseResult: + # Initial basic checks on allowable URLs. + # --------------------------------------- + + # Hard limit the maximum allowable URL length. + if len(url) > MAX_URL_LENGTH: + raise InvalidURL("URL too long") + + # If a URL includes any ASCII control characters including \t, \r, \n, + # then treat it as invalid. + if any(char.isascii() and not char.isprintable() for char in url): + char = next(char for char in url if char.isascii() and not char.isprintable()) + idx = url.find(char) + error = ( + f"Invalid non-printable ASCII character in URL, {char!r} at position {idx}." + ) + raise InvalidURL(error) + + # Some keyword arguments require special handling. + # ------------------------------------------------ + + # Coerce "port" to a string, if it is provided as an integer. + if "port" in kwargs: + port = kwargs["port"] + kwargs["port"] = str(port) if isinstance(port, int) else port + + # Replace "netloc" with "host and "port". + if "netloc" in kwargs: + netloc = kwargs.pop("netloc") or "" + kwargs["host"], _, kwargs["port"] = netloc.partition(":") + + # Replace "username" and/or "password" with "userinfo". + if "username" in kwargs or "password" in kwargs: + username = quote(kwargs.pop("username", "") or "", safe=USERNAME_SAFE) + password = quote(kwargs.pop("password", "") or "", safe=PASSWORD_SAFE) + kwargs["userinfo"] = f"{username}:{password}" if password else username + + # Replace "raw_path" with "path" and "query". 
+ if "raw_path" in kwargs: + raw_path = kwargs.pop("raw_path") or "" + kwargs["path"], seperator, kwargs["query"] = raw_path.partition("?") + if not seperator: + kwargs["query"] = None + + # Ensure that IPv6 "host" addresses are always escaped with "[...]". + if "host" in kwargs: + host = kwargs.get("host") or "" + if ":" in host and not (host.startswith("[") and host.endswith("]")): + kwargs["host"] = f"[{host}]" + + # If any keyword arguments are provided, ensure they are valid. + # ------------------------------------------------------------- + + for key, value in kwargs.items(): + if value is not None: + if len(value) > MAX_URL_LENGTH: + raise InvalidURL(f"URL component '{key}' too long") + + # If a component includes any ASCII control characters including \t, \r, \n, + # then treat it as invalid. + if any(char.isascii() and not char.isprintable() for char in value): + char = next( + char for char in value if char.isascii() and not char.isprintable() + ) + idx = value.find(char) + error = ( + f"Invalid non-printable ASCII character in URL {key} component, " + f"{char!r} at position {idx}." + ) + raise InvalidURL(error) + + # Ensure that keyword arguments match as a valid regex. + if not COMPONENT_REGEX[key].fullmatch(value): + raise InvalidURL(f"Invalid URL component '{key}'") + + # The URL_REGEX will always match, but may have empty components. + url_match = URL_REGEX.match(url) + assert url_match is not None + url_dict = url_match.groupdict() + + # * 'scheme', 'authority', and 'path' may be empty strings. + # * 'query' may be 'None', indicating no trailing "?" portion. + # Any string including the empty string, indicates a trailing "?". + # * 'fragment' may be 'None', indicating no trailing "#" portion. + # Any string including the empty string, indicates a trailing "#". + scheme = kwargs.get("scheme", url_dict["scheme"]) or "" + authority = kwargs.get("authority", url_dict["authority"]) or "" + path = kwargs.get("path", url_dict["path"]) or "" + query = kwargs.get("query", url_dict["query"]) + frag = kwargs.get("fragment", url_dict["fragment"]) + + # The AUTHORITY_REGEX will always match, but may have empty components. + authority_match = AUTHORITY_REGEX.match(authority) + assert authority_match is not None + authority_dict = authority_match.groupdict() + + # * 'userinfo' and 'host' may be empty strings. + # * 'port' may be 'None'. + userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or "" + host = kwargs.get("host", authority_dict["host"]) or "" + port = kwargs.get("port", authority_dict["port"]) + + # Normalize and validate each component. + # We end up with a parsed representation of the URL, + # with components that are plain ASCII bytestrings. + parsed_scheme: str = scheme.lower() + parsed_userinfo: str = quote(userinfo, safe=USERINFO_SAFE) + parsed_host: str = encode_host(host) + parsed_port: int | None = normalize_port(port, scheme) + + has_scheme = parsed_scheme != "" + has_authority = ( + parsed_userinfo != "" or parsed_host != "" or parsed_port is not None + ) + validate_path(path, has_scheme=has_scheme, has_authority=has_authority) + if has_scheme or has_authority: + path = normalize_path(path) + + parsed_path: str = quote(path, safe=PATH_SAFE) + parsed_query: str | None = None if query is None else quote(query, safe=QUERY_SAFE) + parsed_frag: str | None = None if frag is None else quote(frag, safe=FRAG_SAFE) + + # The parsed ASCII bytestrings are our canonical form. + # All properties of the URL are derived from these. 
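+    # As an illustrative sketch (hand-derived example, not taken from the test suite):
+    #     urlparse("https://User@Example.COM:443/a%20b?q=1")
+    # returns ParseResult("https", "User", "example.com", None, "/a%20b", "q=1", None).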
+ return ParseResult( + parsed_scheme, + parsed_userinfo, + parsed_host, + parsed_port, + parsed_path, + parsed_query, + parsed_frag, + ) + + +def encode_host(host: str) -> str: + if not host: + return "" + + elif IPv4_STYLE_HOSTNAME.match(host): + # Validate IPv4 hostnames like #.#.#.# + # + # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2 + # + # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet + try: + ipaddress.IPv4Address(host) + except ipaddress.AddressValueError: + raise InvalidURL(f"Invalid IPv4 address: {host!r}") + return host + + elif IPv6_STYLE_HOSTNAME.match(host): + # Validate IPv6 hostnames like [...] + # + # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2 + # + # "A host identified by an Internet Protocol literal address, version 6 + # [RFC3513] or later, is distinguished by enclosing the IP literal + # within square brackets ("[" and "]"). This is the only place where + # square bracket characters are allowed in the URI syntax." + try: + ipaddress.IPv6Address(host[1:-1]) + except ipaddress.AddressValueError: + raise InvalidURL(f"Invalid IPv6 address: {host!r}") + return host[1:-1] + + elif host.isascii(): + # Regular ASCII hostnames + # + # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2 + # + # reg-name = *( unreserved / pct-encoded / sub-delims ) + WHATWG_SAFE = '"`{}%|\\' + return quote(host.lower(), safe=SUB_DELIMS + WHATWG_SAFE) + + # IDNA hostnames + try: + return idna.encode(host.lower()).decode("ascii") + except idna.IDNAError: + raise InvalidURL(f"Invalid IDNA hostname: {host!r}") + + +def normalize_port(port: str | int | None, scheme: str) -> int | None: + # From https://tools.ietf.org/html/rfc3986#section-3.2.3 + # + # "A scheme may define a default port. For example, the "http" scheme + # defines a default port of "80", corresponding to its reserved TCP + # port number. The type of port designated by the port number (e.g., + # TCP, UDP, SCTP) is defined by the URI scheme. URI producers and + # normalizers should omit the port component and its ":" delimiter if + # port is empty or if its value would be the same as that of the + # scheme's default." + if port is None or port == "": + return None + + try: + port_as_int = int(port) + except ValueError: + raise InvalidURL(f"Invalid port: {port!r}") + + # See https://url.spec.whatwg.org/#url-miscellaneous + default_port = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}.get( + scheme + ) + if port_as_int == default_port: + return None + return port_as_int + + +def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None: + """ + Path validation rules that depend on if the URL contains + a scheme or authority component. + + See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3 + """ + if has_authority: + # If a URI contains an authority component, then the path component + # must either be empty or begin with a slash ("/") character." + if path and not path.startswith("/"): + raise InvalidURL("For absolute URLs, path must be empty or begin with '/'") + + if not has_scheme and not has_authority: + # If a URI does not contain an authority component, then the path cannot begin + # with two slash characters ("//"). + if path.startswith("//"): + raise InvalidURL("Relative URLs cannot have a path starting with '//'") + + # In addition, a URI reference (Section 4.1) may be a relative-path reference, + # in which case the first path segment cannot contain a colon (":") character. 
+ if path.startswith(":"): + raise InvalidURL("Relative URLs cannot have a path starting with ':'") + + +def normalize_path(path: str) -> str: + """ + Drop "." and ".." segments from a URL path. + + For example: + + normalize_path("/path/./to/somewhere/..") == "/path/to" + """ + # Fast return when no '.' characters in the path. + if "." not in path: + return path + + components = path.split("/") + + # Fast return when no '.' or '..' components in the path. + if "." not in components and ".." not in components: + return path + + # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4 + output: list[str] = [] + for component in components: + if component == ".": + pass + elif component == "..": + if output and output != [""]: + output.pop() + else: + output.append(component) + return "/".join(output) + + +def PERCENT(string: str) -> str: + return "".join([f"%{byte:02X}" for byte in string.encode("utf-8")]) + + +def percent_encoded(string: str, safe: str) -> str: + """ + Use percent-encoding to quote a string. + """ + NON_ESCAPED_CHARS = UNRESERVED_CHARACTERS + safe + + # Fast path for strings that don't need escaping. + if not string.rstrip(NON_ESCAPED_CHARS): + return string + + return "".join( + [char if char in NON_ESCAPED_CHARS else PERCENT(char) for char in string] + ) + + +def quote(string: str, safe: str) -> str: + """ + Use percent-encoding to quote a string, omitting existing '%xx' escape sequences. + + See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1 + + * `string`: The string to be percent-escaped. + * `safe`: A string containing characters that may be treated as safe, and do not + need to be escaped. Unreserved characters are always treated as safe. + See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3 + """ + parts = [] + current_position = 0 + for match in re.finditer(PERCENT_ENCODED_REGEX, string): + start_position, end_position = match.start(), match.end() + matched_text = match.group(0) + # Add any text up to the '%xx' escape sequence. + if start_position != current_position: + leading_text = string[current_position:start_position] + parts.append(percent_encoded(leading_text, safe=safe)) + + # Add the '%xx' escape sequence. + parts.append(matched_text) + current_position = end_position + + # Add any text after the final '%xx' escape sequence. 
+ if current_position != len(string): + trailing_text = string[current_position:] + parts.append(percent_encoded(trailing_text, safe=safe)) + + return "".join(parts) diff --git a/env/lib/python3.13/site-packages/httpx/_urls.py b/env/lib/python3.13/site-packages/httpx/_urls.py new file mode 100644 index 0000000000000000000000000000000000000000..147a8fa333acaf31618d37ba2896e3a5bf5e4d02 --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/_urls.py @@ -0,0 +1,641 @@ +from __future__ import annotations + +import typing +from urllib.parse import parse_qs, unquote, urlencode + +import idna + +from ._types import QueryParamTypes +from ._urlparse import urlparse +from ._utils import primitive_value_to_str + +__all__ = ["URL", "QueryParams"] + + +class URL: + """ + url = httpx.URL("HTTPS://jo%40email.com:a%20secret@müller.de:1234/pa%20th?search=ab#anchorlink") + + assert url.scheme == "https" + assert url.username == "jo@email.com" + assert url.password == "a secret" + assert url.userinfo == b"jo%40email.com:a%20secret" + assert url.host == "müller.de" + assert url.raw_host == b"xn--mller-kva.de" + assert url.port == 1234 + assert url.netloc == b"xn--mller-kva.de:1234" + assert url.path == "/pa th" + assert url.query == b"?search=ab" + assert url.raw_path == b"/pa%20th?search=ab" + assert url.fragment == "anchorlink" + + The components of a URL are broken down like this: + + https://jo%40email.com:a%20secret@müller.de:1234/pa%20th?search=ab#anchorlink + [scheme] [ username ] [password] [ host ][port][ path ] [ query ] [fragment] + [ userinfo ] [ netloc ][ raw_path ] + + Note that: + + * `url.scheme` is normalized to always be lowercased. + + * `url.host` is normalized to always be lowercased. Internationalized domain + names are represented in unicode, without IDNA encoding applied. For instance: + + url = httpx.URL("http://中国.icom.museum") + assert url.host == "中国.icom.museum" + url = httpx.URL("http://xn--fiqs8s.icom.museum") + assert url.host == "中国.icom.museum" + + * `url.raw_host` is normalized to always be lowercased, and is IDNA encoded. + + url = httpx.URL("http://中国.icom.museum") + assert url.raw_host == b"xn--fiqs8s.icom.museum" + url = httpx.URL("http://xn--fiqs8s.icom.museum") + assert url.raw_host == b"xn--fiqs8s.icom.museum" + + * `url.port` is either None or an integer. URLs that include the default port for + "http", "https", "ws", "wss", and "ftp" schemes have their port + normalized to `None`. + + assert httpx.URL("http://example.com") == httpx.URL("http://example.com:80") + assert httpx.URL("http://example.com").port is None + assert httpx.URL("http://example.com:80").port is None + + * `url.userinfo` is raw bytes, without URL escaping. Usually you'll want to work + with `url.username` and `url.password` instead, which handle the URL escaping. + + * `url.raw_path` is raw bytes of both the path and query, without URL escaping. + This portion is used as the target when constructing HTTP requests. Usually you'll + want to work with `url.path` instead. + + * `url.query` is raw bytes, without URL escaping. A URL query string portion can + only be properly URL escaped when decoding the parameter names and values + themselves. 
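+
+    * `URL` instances are treated as immutable: components are changed by building a
+      new URL via `copy_with()`. A small illustrative sketch (example URLs only):
+
+        url = httpx.URL("https://example.com/path")
+        assert url.copy_with(scheme="http") == "http://example.com/path"
+        assert url.copy_with(params={"page": "2"}) == "https://example.com/path?page=2"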
+ """ + + def __init__(self, url: URL | str = "", **kwargs: typing.Any) -> None: + if kwargs: + allowed = { + "scheme": str, + "username": str, + "password": str, + "userinfo": bytes, + "host": str, + "port": int, + "netloc": bytes, + "path": str, + "query": bytes, + "raw_path": bytes, + "fragment": str, + "params": object, + } + + # Perform type checking for all supported keyword arguments. + for key, value in kwargs.items(): + if key not in allowed: + message = f"{key!r} is an invalid keyword argument for URL()" + raise TypeError(message) + if value is not None and not isinstance(value, allowed[key]): + expected = allowed[key].__name__ + seen = type(value).__name__ + message = f"Argument {key!r} must be {expected} but got {seen}" + raise TypeError(message) + if isinstance(value, bytes): + kwargs[key] = value.decode("ascii") + + if "params" in kwargs: + # Replace any "params" keyword with the raw "query" instead. + # + # Ensure that empty params use `kwargs["query"] = None` rather + # than `kwargs["query"] = ""`, so that generated URLs do not + # include an empty trailing "?". + params = kwargs.pop("params") + kwargs["query"] = None if not params else str(QueryParams(params)) + + if isinstance(url, str): + self._uri_reference = urlparse(url, **kwargs) + elif isinstance(url, URL): + self._uri_reference = url._uri_reference.copy_with(**kwargs) + else: + raise TypeError( + "Invalid type for url. Expected str or httpx.URL," + f" got {type(url)}: {url!r}" + ) + + @property + def scheme(self) -> str: + """ + The URL scheme, such as "http", "https". + Always normalised to lowercase. + """ + return self._uri_reference.scheme + + @property + def raw_scheme(self) -> bytes: + """ + The raw bytes representation of the URL scheme, such as b"http", b"https". + Always normalised to lowercase. + """ + return self._uri_reference.scheme.encode("ascii") + + @property + def userinfo(self) -> bytes: + """ + The URL userinfo as a raw bytestring. + For example: b"jo%40email.com:a%20secret". + """ + return self._uri_reference.userinfo.encode("ascii") + + @property + def username(self) -> str: + """ + The URL username as a string, with URL decoding applied. + For example: "jo@email.com" + """ + userinfo = self._uri_reference.userinfo + return unquote(userinfo.partition(":")[0]) + + @property + def password(self) -> str: + """ + The URL password as a string, with URL decoding applied. + For example: "a secret" + """ + userinfo = self._uri_reference.userinfo + return unquote(userinfo.partition(":")[2]) + + @property + def host(self) -> str: + """ + The URL host as a string. + Always normalized to lowercase, with IDNA hosts decoded into unicode. + + Examples: + + url = httpx.URL("http://www.EXAMPLE.org") + assert url.host == "www.example.org" + + url = httpx.URL("http://中国.icom.museum") + assert url.host == "中国.icom.museum" + + url = httpx.URL("http://xn--fiqs8s.icom.museum") + assert url.host == "中国.icom.museum" + + url = httpx.URL("https://[::ffff:192.168.0.1]") + assert url.host == "::ffff:192.168.0.1" + """ + host: str = self._uri_reference.host + + if host.startswith("xn--"): + host = idna.decode(host) + + return host + + @property + def raw_host(self) -> bytes: + """ + The raw bytes representation of the URL host. + Always normalized to lowercase, and IDNA encoded. 
+ + Examples: + + url = httpx.URL("http://www.EXAMPLE.org") + assert url.raw_host == b"www.example.org" + + url = httpx.URL("http://中国.icom.museum") + assert url.raw_host == b"xn--fiqs8s.icom.museum" + + url = httpx.URL("http://xn--fiqs8s.icom.museum") + assert url.raw_host == b"xn--fiqs8s.icom.museum" + + url = httpx.URL("https://[::ffff:192.168.0.1]") + assert url.raw_host == b"::ffff:192.168.0.1" + """ + return self._uri_reference.host.encode("ascii") + + @property + def port(self) -> int | None: + """ + The URL port as an integer. + + Note that the URL class performs port normalization as per the WHATWG spec. + Default ports for "http", "https", "ws", "wss", and "ftp" schemes are always + treated as `None`. + + For example: + + assert httpx.URL("http://www.example.com") == httpx.URL("http://www.example.com:80") + assert httpx.URL("http://www.example.com:80").port is None + """ + return self._uri_reference.port + + @property + def netloc(self) -> bytes: + """ + Either `` or `:` as bytes. + Always normalized to lowercase, and IDNA encoded. + + This property may be used for generating the value of a request + "Host" header. + """ + return self._uri_reference.netloc.encode("ascii") + + @property + def path(self) -> str: + """ + The URL path as a string. Excluding the query string, and URL decoded. + + For example: + + url = httpx.URL("https://example.com/pa%20th") + assert url.path == "/pa th" + """ + path = self._uri_reference.path or "/" + return unquote(path) + + @property + def query(self) -> bytes: + """ + The URL query string, as raw bytes, excluding the leading b"?". + + This is necessarily a bytewise interface, because we cannot + perform URL decoding of this representation until we've parsed + the keys and values into a QueryParams instance. + + For example: + + url = httpx.URL("https://example.com/?filter=some%20search%20terms") + assert url.query == b"filter=some%20search%20terms" + """ + query = self._uri_reference.query or "" + return query.encode("ascii") + + @property + def params(self) -> QueryParams: + """ + The URL query parameters, neatly parsed and packaged into an immutable + multidict representation. + """ + return QueryParams(self._uri_reference.query) + + @property + def raw_path(self) -> bytes: + """ + The complete URL path and query string as raw bytes. + Used as the target when constructing HTTP requests. + + For example: + + GET /users?search=some%20text HTTP/1.1 + Host: www.example.org + Connection: close + """ + path = self._uri_reference.path or "/" + if self._uri_reference.query is not None: + path += "?" + self._uri_reference.query + return path.encode("ascii") + + @property + def fragment(self) -> str: + """ + The URL fragments, as used in HTML anchors. + As a string, without the leading '#'. + """ + return unquote(self._uri_reference.fragment or "") + + @property + def is_absolute_url(self) -> bool: + """ + Return `True` for absolute URLs such as 'http://example.com/path', + and `False` for relative URLs such as '/path'. + """ + # We don't use `.is_absolute` from `rfc3986` because it treats + # URLs with a fragment portion as not absolute. + # What we actually care about is if the URL provides + # a scheme and hostname to which connections should be made. + return bool(self._uri_reference.scheme and self._uri_reference.host) + + @property + def is_relative_url(self) -> bool: + """ + Return `False` for absolute URLs such as 'http://example.com/path', + and `True` for relative URLs such as '/path'. 
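+
+        A small sketch of the distinction (illustrative URLs):
+
+            assert httpx.URL("https://example.com/path").is_relative_url is False
+            assert httpx.URL("/path").is_relative_url is True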
+ """ + return not self.is_absolute_url + + def copy_with(self, **kwargs: typing.Any) -> URL: + """ + Copy this URL, returning a new URL with some components altered. + Accepts the same set of parameters as the components that are made + available via properties on the `URL` class. + + For example: + + url = httpx.URL("https://www.example.com").copy_with( + username="jo@gmail.com", password="a secret" + ) + assert url == "https://jo%40email.com:a%20secret@www.example.com" + """ + return URL(self, **kwargs) + + def copy_set_param(self, key: str, value: typing.Any = None) -> URL: + return self.copy_with(params=self.params.set(key, value)) + + def copy_add_param(self, key: str, value: typing.Any = None) -> URL: + return self.copy_with(params=self.params.add(key, value)) + + def copy_remove_param(self, key: str) -> URL: + return self.copy_with(params=self.params.remove(key)) + + def copy_merge_params(self, params: QueryParamTypes) -> URL: + return self.copy_with(params=self.params.merge(params)) + + def join(self, url: URL | str) -> URL: + """ + Return an absolute URL, using this URL as the base. + + Eg. + + url = httpx.URL("https://www.example.com/test") + url = url.join("/new/path") + assert url == "https://www.example.com/new/path" + """ + from urllib.parse import urljoin + + return URL(urljoin(str(self), str(URL(url)))) + + def __hash__(self) -> int: + return hash(str(self)) + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, (URL, str)) and str(self) == str(URL(other)) + + def __str__(self) -> str: + return str(self._uri_reference) + + def __repr__(self) -> str: + scheme, userinfo, host, port, path, query, fragment = self._uri_reference + + if ":" in userinfo: + # Mask any password component. + userinfo = f'{userinfo.split(":")[0]}:[secure]' + + authority = "".join( + [ + f"{userinfo}@" if userinfo else "", + f"[{host}]" if ":" in host else host, + f":{port}" if port is not None else "", + ] + ) + url = "".join( + [ + f"{self.scheme}:" if scheme else "", + f"//{authority}" if authority else "", + path, + f"?{query}" if query is not None else "", + f"#{fragment}" if fragment is not None else "", + ] + ) + + return f"{self.__class__.__name__}({url!r})" + + @property + def raw(self) -> tuple[bytes, bytes, int, bytes]: # pragma: nocover + import collections + import warnings + + warnings.warn("URL.raw is deprecated.") + RawURL = collections.namedtuple( + "RawURL", ["raw_scheme", "raw_host", "port", "raw_path"] + ) + return RawURL( + raw_scheme=self.raw_scheme, + raw_host=self.raw_host, + port=self.port, + raw_path=self.raw_path, + ) + + +class QueryParams(typing.Mapping[str, str]): + """ + URL query parameters, as a multi-dict. + """ + + def __init__(self, *args: QueryParamTypes | None, **kwargs: typing.Any) -> None: + assert len(args) < 2, "Too many arguments." + assert not (args and kwargs), "Cannot mix named and unnamed arguments." 
+ + value = args[0] if args else kwargs + + if value is None or isinstance(value, (str, bytes)): + value = value.decode("ascii") if isinstance(value, bytes) else value + self._dict = parse_qs(value, keep_blank_values=True) + elif isinstance(value, QueryParams): + self._dict = {k: list(v) for k, v in value._dict.items()} + else: + dict_value: dict[typing.Any, list[typing.Any]] = {} + if isinstance(value, (list, tuple)): + # Convert list inputs like: + # [("a", "123"), ("a", "456"), ("b", "789")] + # To a dict representation, like: + # {"a": ["123", "456"], "b": ["789"]} + for item in value: + dict_value.setdefault(item[0], []).append(item[1]) + else: + # Convert dict inputs like: + # {"a": "123", "b": ["456", "789"]} + # To dict inputs where values are always lists, like: + # {"a": ["123"], "b": ["456", "789"]} + dict_value = { + k: list(v) if isinstance(v, (list, tuple)) else [v] + for k, v in value.items() + } + + # Ensure that keys and values are neatly coerced to strings. + # We coerce values `True` and `False` to JSON-like "true" and "false" + # representations, and coerce `None` values to the empty string. + self._dict = { + str(k): [primitive_value_to_str(item) for item in v] + for k, v in dict_value.items() + } + + def keys(self) -> typing.KeysView[str]: + """ + Return all the keys in the query params. + + Usage: + + q = httpx.QueryParams("a=123&a=456&b=789") + assert list(q.keys()) == ["a", "b"] + """ + return self._dict.keys() + + def values(self) -> typing.ValuesView[str]: + """ + Return all the values in the query params. If a key occurs more than once + only the first item for that key is returned. + + Usage: + + q = httpx.QueryParams("a=123&a=456&b=789") + assert list(q.values()) == ["123", "789"] + """ + return {k: v[0] for k, v in self._dict.items()}.values() + + def items(self) -> typing.ItemsView[str, str]: + """ + Return all items in the query params. If a key occurs more than once + only the first item for that key is returned. + + Usage: + + q = httpx.QueryParams("a=123&a=456&b=789") + assert list(q.items()) == [("a", "123"), ("b", "789")] + """ + return {k: v[0] for k, v in self._dict.items()}.items() + + def multi_items(self) -> list[tuple[str, str]]: + """ + Return all items in the query params. Allow duplicate keys to occur. + + Usage: + + q = httpx.QueryParams("a=123&a=456&b=789") + assert list(q.multi_items()) == [("a", "123"), ("a", "456"), ("b", "789")] + """ + multi_items: list[tuple[str, str]] = [] + for k, v in self._dict.items(): + multi_items.extend([(k, i) for i in v]) + return multi_items + + def get(self, key: typing.Any, default: typing.Any = None) -> typing.Any: + """ + Get a value from the query param for a given key. If the key occurs + more than once, then only the first value is returned. + + Usage: + + q = httpx.QueryParams("a=123&a=456&b=789") + assert q.get("a") == "123" + """ + if key in self._dict: + return self._dict[str(key)][0] + return default + + def get_list(self, key: str) -> list[str]: + """ + Get all values from the query param for a given key. + + Usage: + + q = httpx.QueryParams("a=123&a=456&b=789") + assert q.get_list("a") == ["123", "456"] + """ + return list(self._dict.get(str(key), [])) + + def set(self, key: str, value: typing.Any = None) -> QueryParams: + """ + Return a new QueryParams instance, setting the value of a key. 
+ + Usage: + + q = httpx.QueryParams("a=123") + q = q.set("a", "456") + assert q == httpx.QueryParams("a=456") + """ + q = QueryParams() + q._dict = dict(self._dict) + q._dict[str(key)] = [primitive_value_to_str(value)] + return q + + def add(self, key: str, value: typing.Any = None) -> QueryParams: + """ + Return a new QueryParams instance, setting or appending the value of a key. + + Usage: + + q = httpx.QueryParams("a=123") + q = q.add("a", "456") + assert q == httpx.QueryParams("a=123&a=456") + """ + q = QueryParams() + q._dict = dict(self._dict) + q._dict[str(key)] = q.get_list(key) + [primitive_value_to_str(value)] + return q + + def remove(self, key: str) -> QueryParams: + """ + Return a new QueryParams instance, removing the value of a key. + + Usage: + + q = httpx.QueryParams("a=123") + q = q.remove("a") + assert q == httpx.QueryParams("") + """ + q = QueryParams() + q._dict = dict(self._dict) + q._dict.pop(str(key), None) + return q + + def merge(self, params: QueryParamTypes | None = None) -> QueryParams: + """ + Return a new QueryParams instance, updated with. + + Usage: + + q = httpx.QueryParams("a=123") + q = q.merge({"b": "456"}) + assert q == httpx.QueryParams("a=123&b=456") + + q = httpx.QueryParams("a=123") + q = q.merge({"a": "456", "b": "789"}) + assert q == httpx.QueryParams("a=456&b=789") + """ + q = QueryParams(params) + q._dict = {**self._dict, **q._dict} + return q + + def __getitem__(self, key: typing.Any) -> str: + return self._dict[key][0] + + def __contains__(self, key: typing.Any) -> bool: + return key in self._dict + + def __iter__(self) -> typing.Iterator[typing.Any]: + return iter(self.keys()) + + def __len__(self) -> int: + return len(self._dict) + + def __bool__(self) -> bool: + return bool(self._dict) + + def __hash__(self) -> int: + return hash(str(self)) + + def __eq__(self, other: typing.Any) -> bool: + if not isinstance(other, self.__class__): + return False + return sorted(self.multi_items()) == sorted(other.multi_items()) + + def __str__(self) -> str: + return urlencode(self.multi_items()) + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + query_string = str(self) + return f"{class_name}({query_string!r})" + + def update(self, params: QueryParamTypes | None = None) -> None: + raise RuntimeError( + "QueryParams are immutable since 0.18.0. " + "Use `q = q.merge(...)` to create an updated copy." + ) + + def __setitem__(self, key: str, value: str) -> None: + raise RuntimeError( + "QueryParams are immutable since 0.18.0. " + "Use `q = q.set(key, value)` to create an updated copy." + ) diff --git a/env/lib/python3.13/site-packages/httpx/_utils.py b/env/lib/python3.13/site-packages/httpx/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7fe827da4d071b32ea6da44328629699d6fc88ce --- /dev/null +++ b/env/lib/python3.13/site-packages/httpx/_utils.py @@ -0,0 +1,242 @@ +from __future__ import annotations + +import ipaddress +import os +import re +import typing +from urllib.request import getproxies + +from ._types import PrimitiveData + +if typing.TYPE_CHECKING: # pragma: no cover + from ._urls import URL + + +def primitive_value_to_str(value: PrimitiveData) -> str: + """ + Coerce a primitive data type into a string value. + + Note that we prefer JSON-style 'true'/'false' for boolean values here. 
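+
+    A rough sketch of the coercion rules (illustrative values):
+
+        assert primitive_value_to_str(True) == "true"
+        assert primitive_value_to_str(None) == ""
+        assert primitive_value_to_str(10) == "10"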
+ """ + if value is True: + return "true" + elif value is False: + return "false" + elif value is None: + return "" + return str(value) + + +def get_environment_proxies() -> dict[str, str | None]: + """Gets proxy information from the environment""" + + # urllib.request.getproxies() falls back on System + # Registry and Config for proxies on Windows and macOS. + # We don't want to propagate non-HTTP proxies into + # our configuration such as 'TRAVIS_APT_PROXY'. + proxy_info = getproxies() + mounts: dict[str, str | None] = {} + + for scheme in ("http", "https", "all"): + if proxy_info.get(scheme): + hostname = proxy_info[scheme] + mounts[f"{scheme}://"] = ( + hostname if "://" in hostname else f"http://{hostname}" + ) + + no_proxy_hosts = [host.strip() for host in proxy_info.get("no", "").split(",")] + for hostname in no_proxy_hosts: + # See https://curl.haxx.se/libcurl/c/CURLOPT_NOPROXY.html for details + # on how names in `NO_PROXY` are handled. + if hostname == "*": + # If NO_PROXY=* is used or if "*" occurs as any one of the comma + # separated hostnames, then we should just bypass any information + # from HTTP_PROXY, HTTPS_PROXY, ALL_PROXY, and always ignore + # proxies. + return {} + elif hostname: + # NO_PROXY=.google.com is marked as "all://*.google.com, + # which disables "www.google.com" but not "google.com" + # NO_PROXY=google.com is marked as "all://*google.com, + # which disables "www.google.com" and "google.com". + # (But not "wwwgoogle.com") + # NO_PROXY can include domains, IPv6, IPv4 addresses and "localhost" + # NO_PROXY=example.com,::1,localhost,192.168.0.0/16 + if "://" in hostname: + mounts[hostname] = None + elif is_ipv4_hostname(hostname): + mounts[f"all://{hostname}"] = None + elif is_ipv6_hostname(hostname): + mounts[f"all://[{hostname}]"] = None + elif hostname.lower() == "localhost": + mounts[f"all://{hostname}"] = None + else: + mounts[f"all://*{hostname}"] = None + + return mounts + + +def to_bytes(value: str | bytes, encoding: str = "utf-8") -> bytes: + return value.encode(encoding) if isinstance(value, str) else value + + +def to_str(value: str | bytes, encoding: str = "utf-8") -> str: + return value if isinstance(value, str) else value.decode(encoding) + + +def to_bytes_or_str(value: str, match_type_of: typing.AnyStr) -> typing.AnyStr: + return value if isinstance(match_type_of, str) else value.encode() + + +def unquote(value: str) -> str: + return value[1:-1] if value[0] == value[-1] == '"' else value + + +def peek_filelike_length(stream: typing.Any) -> int | None: + """ + Given a file-like stream object, return its length in number of bytes + without reading it into memory. + """ + try: + # Is it an actual file? + fd = stream.fileno() + # Yup, seems to be an actual file. + length = os.fstat(fd).st_size + except (AttributeError, OSError): + # No... Maybe it's something that supports random access, like `io.BytesIO`? + try: + # Assuming so, go to end of stream to figure out its length, + # then put it back in place. + offset = stream.tell() + length = stream.seek(0, os.SEEK_END) + stream.seek(offset) + except (AttributeError, OSError): + # Not even that? Sorry, we're doomed... + return None + + return length + + +class URLPattern: + """ + A utility class currently used for making lookups against proxy keys... + + # Wildcard matching... + >>> pattern = URLPattern("all://") + >>> pattern.matches(httpx.URL("http://example.com")) + True + + # Witch scheme matching... 
+ >>> pattern = URLPattern("https://") + >>> pattern.matches(httpx.URL("https://example.com")) + True + >>> pattern.matches(httpx.URL("http://example.com")) + False + + # With domain matching... + >>> pattern = URLPattern("https://example.com") + >>> pattern.matches(httpx.URL("https://example.com")) + True + >>> pattern.matches(httpx.URL("http://example.com")) + False + >>> pattern.matches(httpx.URL("https://other.com")) + False + + # Wildcard scheme, with domain matching... + >>> pattern = URLPattern("all://example.com") + >>> pattern.matches(httpx.URL("https://example.com")) + True + >>> pattern.matches(httpx.URL("http://example.com")) + True + >>> pattern.matches(httpx.URL("https://other.com")) + False + + # With port matching... + >>> pattern = URLPattern("https://example.com:1234") + >>> pattern.matches(httpx.URL("https://example.com:1234")) + True + >>> pattern.matches(httpx.URL("https://example.com")) + False + """ + + def __init__(self, pattern: str) -> None: + from ._urls import URL + + if pattern and ":" not in pattern: + raise ValueError( + f"Proxy keys should use proper URL forms rather " + f"than plain scheme strings. " + f'Instead of "{pattern}", use "{pattern}://"' + ) + + url = URL(pattern) + self.pattern = pattern + self.scheme = "" if url.scheme == "all" else url.scheme + self.host = "" if url.host == "*" else url.host + self.port = url.port + if not url.host or url.host == "*": + self.host_regex: typing.Pattern[str] | None = None + elif url.host.startswith("*."): + # *.example.com should match "www.example.com", but not "example.com" + domain = re.escape(url.host[2:]) + self.host_regex = re.compile(f"^.+\\.{domain}$") + elif url.host.startswith("*"): + # *example.com should match "www.example.com" and "example.com" + domain = re.escape(url.host[1:]) + self.host_regex = re.compile(f"^(.+\\.)?{domain}$") + else: + # example.com should match "example.com" but not "www.example.com" + domain = re.escape(url.host) + self.host_regex = re.compile(f"^{domain}$") + + def matches(self, other: URL) -> bool: + if self.scheme and self.scheme != other.scheme: + return False + if ( + self.host + and self.host_regex is not None + and not self.host_regex.match(other.host) + ): + return False + if self.port is not None and self.port != other.port: + return False + return True + + @property + def priority(self) -> tuple[int, int, int]: + """ + The priority allows URLPattern instances to be sortable, so that + we can match from most specific to least specific. + """ + # URLs with a port should take priority over URLs without a port. + port_priority = 0 if self.port is not None else 1 + # Longer hostnames should match first. + host_priority = -len(self.host) + # Longer schemes should match first. 
+ scheme_priority = -len(self.scheme) + return (port_priority, host_priority, scheme_priority) + + def __hash__(self) -> int: + return hash(self.pattern) + + def __lt__(self, other: URLPattern) -> bool: + return self.priority < other.priority + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, URLPattern) and self.pattern == other.pattern + + +def is_ipv4_hostname(hostname: str) -> bool: + try: + ipaddress.IPv4Address(hostname.split("/")[0]) + except Exception: + return False + return True + + +def is_ipv6_hostname(hostname: str) -> bool: + try: + ipaddress.IPv6Address(hostname.split("/")[0]) + except Exception: + return False + return True diff --git a/env/lib/python3.13/site-packages/httpx/py.typed b/env/lib/python3.13/site-packages/httpx/py.typed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/env/lib/python3.13/site-packages/huggingface_hub/_login.py b/env/lib/python3.13/site-packages/huggingface_hub/_login.py new file mode 100644 index 0000000000000000000000000000000000000000..991432a5a9e78f3cda05cdc96b2f458425954b7f --- /dev/null +++ b/env/lib/python3.13/site-packages/huggingface_hub/_login.py @@ -0,0 +1,488 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains methods to log in to the Hub.""" + +import os +import subprocess +from getpass import getpass +from pathlib import Path +from typing import Optional + +import typer + +from . import constants +from .utils import ( + ANSI, + capture_output, + get_token, + is_google_colab, + is_notebook, + list_credential_helpers, + logging, + run_subprocess, + set_git_credential, + unset_git_credential, +) +from .utils._auth import ( + _get_token_by_name, + _get_token_from_environment, + _get_token_from_file, + _get_token_from_google_colab, + _save_stored_tokens, + _save_token, + get_stored_tokens, +) + + +logger = logging.get_logger(__name__) + +_HF_LOGO_ASCII = """ + _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_| + _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| + _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_| + _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| + _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_| +""" + + +def login( + token: Optional[str] = None, + *, + add_to_git_credential: bool = False, + skip_if_logged_in: bool = False, +) -> None: + """Login the machine to access the Hub. + + The `token` is persisted in cache and set as a git credential. Once done, the machine + is logged in and the access token will be available across all `huggingface_hub` + components. If `token` is not provided, it will be prompted to the user either with + a widget (in a notebook) or via the terminal. + + To log in from outside of a script, one can also use `hf auth login` which is + a cli command that wraps [`login`]. 
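+
+    A minimal usage sketch (the token value below is just a placeholder):
+
+    ```python
+    from huggingface_hub import login
+
+    login(token="hf_xxx", add_to_git_credential=False)
+    ```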
+ + > [!TIP] + > [`login`] is a drop-in replacement method for [`notebook_login`] as it wraps and + > extends its capabilities. + + > [!TIP] + > When the token is not passed, [`login`] will automatically detect if the script runs + > in a notebook or not. However, this detection might not be accurate due to the + > variety of notebooks that exists nowadays. If that is the case, you can always force + > the UI by using [`notebook_login`] or [`interpreter_login`]. + + Args: + token (`str`, *optional*): + User access token to generate from https://huggingface.co/settings/token. + add_to_git_credential (`bool`, defaults to `False`): + If `True`, token will be set as git credential. If no git credential helper + is configured, a warning will be displayed to the user. If `token` is `None`, + the value of `add_to_git_credential` is ignored and will be prompted again + to the end user. + skip_if_logged_in (`bool`, defaults to `False`): + If `True`, do not prompt for token if user is already logged in. + Raises: + [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) + If an organization token is passed. Only personal account tokens are valid + to log in. + [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) + If token is invalid. + [`ImportError`](https://docs.python.org/3/library/exceptions.html#ImportError) + If running in a notebook but `ipywidgets` is not installed. + """ + if token is not None: + if not add_to_git_credential: + logger.info( + "The token has not been saved to the git credentials helper. Pass " + "`add_to_git_credential=True` in this function directly or " + "`--add-to-git-credential` if using via `hf`CLI if " + "you want to set the git credential as well." + ) + _login(token, add_to_git_credential=add_to_git_credential) + elif is_notebook(): + notebook_login(skip_if_logged_in=skip_if_logged_in) + else: + interpreter_login(skip_if_logged_in=skip_if_logged_in) + + +def logout(token_name: Optional[str] = None) -> None: + """Logout the machine from the Hub. + + Token is deleted from the machine and removed from git credential. + + Args: + token_name (`str`, *optional*): + Name of the access token to logout from. If `None`, will log out from all saved access tokens. + Raises: + [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError): + If the access token name is not found. + """ + if get_token() is None and not get_stored_tokens(): # No active token and no saved access tokens + logger.warning("Not logged in!") + return + if not token_name: + # Delete all saved access tokens and token + for file_path in (constants.HF_TOKEN_PATH, constants.HF_STORED_TOKENS_PATH): + try: + Path(file_path).unlink() + except FileNotFoundError: + pass + logger.info("Successfully logged out from all access tokens.") + else: + _logout_from_token(token_name) + logger.info(f"Successfully logged out from access token: {token_name}.") + + unset_git_credential() + + # Check if still logged in + if _get_token_from_google_colab() is not None: + raise EnvironmentError( + "You are automatically logged in using a Google Colab secret.\n" + "To log out, you must unset the `HF_TOKEN` secret in your Colab settings." + ) + if _get_token_from_environment() is not None: + raise EnvironmentError( + "Token has been deleted from your machine but you are still logged in.\n" + "To log out, you must clear out both `HF_TOKEN` and `HUGGING_FACE_HUB_TOKEN` environment variables." 
+ ) + + +def auth_switch(token_name: str, add_to_git_credential: bool = False) -> None: + """Switch to a different access token. + + Args: + token_name (`str`): + Name of the access token to switch to. + add_to_git_credential (`bool`, defaults to `False`): + If `True`, token will be set as git credential. If no git credential helper + is configured, a warning will be displayed to the user. If `token` is `None`, + the value of `add_to_git_credential` is ignored and will be prompted again + to the end user. + + Raises: + [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError): + If the access token name is not found. + """ + token = _get_token_by_name(token_name) + if not token: + raise ValueError(f"Access token {token_name} not found in {constants.HF_STORED_TOKENS_PATH}") + # Write token to HF_TOKEN_PATH + _set_active_token(token_name, add_to_git_credential) + logger.info(f"The current active token is: {token_name}") + token_from_environment = _get_token_from_environment() + if token_from_environment is not None and token_from_environment != token: + logger.warning( + "The environment variable `HF_TOKEN` is set and will override the access token you've just switched to." + ) + + +def auth_list() -> None: + """List all stored access tokens.""" + tokens = get_stored_tokens() + + if not tokens: + logger.info("No access tokens found.") + return + # Find current token + current_token = get_token() + current_token_name = None + for token_name in tokens: + if tokens.get(token_name) == current_token: + current_token_name = token_name + # Print header + max_offset = max(len("token"), max(len(token) for token in tokens)) + 2 + print(f" {{:<{max_offset}}}| {{:<15}}".format("name", "token")) + print("-" * (max_offset + 2) + "|" + "-" * 15) + + # Print saved access tokens + for token_name in tokens: + token = tokens.get(token_name, "") + masked_token = f"{token[:3]}****{token[-4:]}" if token != "" else token + is_current = "*" if token == current_token else " " + + print(f"{is_current} {{:<{max_offset}}}| {{:<15}}".format(token_name, masked_token)) + + if _get_token_from_environment(): + logger.warning( + "\nNote: Environment variable `HF_TOKEN` is set and is the current active token independently from the stored tokens listed above." + ) + elif current_token_name is None: + logger.warning( + "\nNote: No active token is set and no environment variable `HF_TOKEN` is found. Use `hf auth login` to log in." + ) + + +### +# Interpreter-based login (text) +### + + +def interpreter_login(*, skip_if_logged_in: bool = False) -> None: + """ + Displays a prompt to log in to the HF website and store the token. + + This is equivalent to [`login`] without passing a token when not run in a notebook. + [`interpreter_login`] is useful if you want to force the use of the terminal prompt + instead of a notebook widget. + + For more details, see [`login`]. + + Args: + skip_if_logged_in (`bool`, defaults to `False`): + If `True`, do not prompt for token if user is already logged in. + """ + if not skip_if_logged_in and get_token() is not None: + logger.info("User is already logged in.") + return + + print(_HF_LOGO_ASCII) + if get_token() is not None: + logger.info( + " A token is already saved on your machine. Run `hf auth whoami`" + " to get more information or `hf auth logout` if you want" + " to log out." + ) + logger.info(" Setting a new token will erase the existing one.") + + logger.info( + " To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens ." 
+ ) + if os.name == "nt": + logger.info("Token can be pasted using 'Right-Click'.") + token = getpass("Enter your token (input will not be visible): ") + add_to_git_credential = typer.confirm("Add token as git credential?") + + _login(token=token, add_to_git_credential=add_to_git_credential) + + +### +# Notebook-based login (widget) +### + +NOTEBOOK_LOGIN_PASSWORD_HTML = """

+<center> <img
+src=https://huggingface.co/front/assets/huggingface_logo-noborder.svg
+alt='Hugging Face'> <br> Immediately click login after typing your password or
+it might be stored in plain text in this notebook file. </center>"""
+
+
+NOTEBOOK_LOGIN_TOKEN_HTML_START = """<center> <img
+src=https://huggingface.co/front/assets/huggingface_logo-noborder.svg
+alt='Hugging Face'> <br> Copy a token from <a
+href="https://huggingface.co/settings/tokens" target="_blank">your Hugging Face
+tokens page</a> and paste it below. <br> Immediately click login after copying
+your token or it might be stored in plain text in this notebook file. </center>
""" + + +NOTEBOOK_LOGIN_TOKEN_HTML_END = """ +Pro Tip: If you don't already have one, you can create a dedicated +'notebooks' token with 'write' access, that you can then easily reuse for all +notebooks. """ + + +def notebook_login(*, skip_if_logged_in: bool = False) -> None: + """ + Displays a widget to log in to the HF website and store the token. + + This is equivalent to [`login`] without passing a token when run in a notebook. + [`notebook_login`] is useful if you want to force the use of the notebook widget + instead of a prompt in the terminal. + + For more details, see [`login`]. + + Args: + skip_if_logged_in (`bool`, defaults to `False`): + If `True`, do not prompt for token if user is already logged in. + """ + try: + import ipywidgets.widgets as widgets # type: ignore + from IPython.display import display # type: ignore + except ImportError: + raise ImportError( + "The `notebook_login` function can only be used in a notebook (Jupyter or" + " Colab) and you need the `ipywidgets` module: `pip install ipywidgets`." + ) + if not skip_if_logged_in and get_token() is not None: + logger.info("User is already logged in.") + return + + box_layout = widgets.Layout(display="flex", flex_flow="column", align_items="center", width="50%") + + token_widget = widgets.Password(description="Token:") + git_checkbox_widget = widgets.Checkbox(value=True, description="Add token as git credential?") + token_finish_button = widgets.Button(description="Login") + + login_token_widget = widgets.VBox( + [ + widgets.HTML(NOTEBOOK_LOGIN_TOKEN_HTML_START), + token_widget, + git_checkbox_widget, + token_finish_button, + widgets.HTML(NOTEBOOK_LOGIN_TOKEN_HTML_END), + ], + layout=box_layout, + ) + display(login_token_widget) + + # On click events + def login_token_event(t): + """Event handler for the login button.""" + token = token_widget.value + add_to_git_credential = git_checkbox_widget.value + # Erase token and clear value to make sure it's not saved in the notebook. + token_widget.value = "" + # Hide inputs + login_token_widget.children = [widgets.Label("Connecting...")] + try: + with capture_output() as captured: + _login(token, add_to_git_credential=add_to_git_credential) + message = captured.getvalue() + except Exception as error: + message = str(error) + # Print result (success message or error) + login_token_widget.children = [widgets.Label(line) for line in message.split("\n") if line.strip()] + + token_finish_button.on_click(login_token_event) + + +### +# Login private helpers +### + + +def _login( + token: str, + add_to_git_credential: bool, +) -> None: + from .hf_api import whoami # avoid circular import + + if token.startswith("api_org"): + raise ValueError("You must use your personal account token, not an organization token.") + + token_info = whoami(token) + permission = token_info["auth"]["accessToken"]["role"] + logger.info(f"Token is valid (permission: {permission}).") + + token_name = token_info["auth"]["accessToken"]["displayName"] + # Store token locally + _save_token(token=token, token_name=token_name) + # Set active token + _set_active_token(token_name=token_name, add_to_git_credential=add_to_git_credential) + logger.info("Login successful.") + if _get_token_from_environment(): + logger.warning( + "Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured." 
+ ) + else: + logger.info(f"The current active token is: `{token_name}`") + + +def _logout_from_token(token_name: str) -> None: + """Logout from a specific access token. + + Args: + token_name (`str`): + The name of the access token to logout from. + Raises: + [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError): + If the access token name is not found. + """ + stored_tokens = get_stored_tokens() + # If there is no access tokens saved or the access token name is not found, do nothing + if not stored_tokens or token_name not in stored_tokens: + return + + token = stored_tokens.pop(token_name) + _save_stored_tokens(stored_tokens) + + if token == _get_token_from_file(): + logger.warning(f"Active token '{token_name}' has been deleted.") + Path(constants.HF_TOKEN_PATH).unlink(missing_ok=True) + + +def _set_active_token( + token_name: str, + add_to_git_credential: bool, +) -> None: + """Set the active access token. + + Args: + token_name (`str`): + The name of the token to set as active. + """ + token = _get_token_by_name(token_name) + if not token: + raise ValueError(f"Token {token_name} not found in {constants.HF_STORED_TOKENS_PATH}") + if add_to_git_credential: + if _is_git_credential_helper_configured(): + set_git_credential(token) + logger.info( + "Your token has been saved in your configured git credential helpers" + + f" ({','.join(list_credential_helpers())})." + ) + else: + logger.warning("Token has not been saved to git credential helper.") + # Write token to HF_TOKEN_PATH + path = Path(constants.HF_TOKEN_PATH) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(token) + logger.info(f"Your token has been saved to {constants.HF_TOKEN_PATH}") + + +def _is_git_credential_helper_configured() -> bool: + """Check if a git credential helper is configured. + + Warns user if not the case (except for Google Colab where "store" is set by default + by `huggingface_hub`). + """ + helpers = list_credential_helpers() + if len(helpers) > 0: + return True # Do not warn: at least 1 helper is set + + # Only in Google Colab to avoid the warning message + # See https://github.com/huggingface/huggingface_hub/issues/1043#issuecomment-1247010710 + if is_google_colab(): + _set_store_as_git_credential_helper_globally() + return True # Do not warn: "store" is used by default in Google Colab + + # Otherwise, warn user + print( + ANSI.red( + "Cannot authenticate through git-credential as no helper is defined on your" + " machine.\nYou might have to re-authenticate when pushing to the Hugging" + " Face Hub.\nRun the following command in your terminal in case you want to" + " set the 'store' credential helper as default.\n\ngit config --global" + " credential.helper store\n\nRead" + " https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more" + " details." + ) + ) + return False + + +def _set_store_as_git_credential_helper_globally() -> None: + """Set globally the credential.helper to `store`. + + To be used only in Google Colab as we assume the user doesn't care about the git + credential config. It is the only particular case where we don't want to display the + warning message in [`notebook_login()`]. 
+ + Related: + - https://github.com/huggingface/huggingface_hub/issues/1043 + - https://github.com/huggingface/huggingface_hub/issues/1051 + - https://git-scm.com/docs/git-credential-store + """ + try: + run_subprocess("git config --global credential.helper store") + except subprocess.CalledProcessError as exc: + raise EnvironmentError(exc.stderr) diff --git a/env/lib/python3.13/site-packages/huggingface_hub/_upload_large_folder.py b/env/lib/python3.13/site-packages/huggingface_hub/_upload_large_folder.py new file mode 100644 index 0000000000000000000000000000000000000000..e9ae4940016f50814ab41d06d3757bd62f013ba2 --- /dev/null +++ b/env/lib/python3.13/site-packages/huggingface_hub/_upload_large_folder.py @@ -0,0 +1,741 @@ +# coding=utf-8 +# Copyright 2024-present, the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import enum +import logging +import os +import queue +import shutil +import sys +import threading +import time +import traceback +from datetime import datetime +from pathlib import Path +from threading import Lock +from typing import TYPE_CHECKING, Any, Optional, Union +from urllib.parse import quote + +from ._commit_api import CommitOperationAdd, UploadInfo, _fetch_upload_modes +from ._local_folder import LocalUploadFileMetadata, LocalUploadFilePaths, get_local_upload_paths, read_upload_metadata +from .constants import DEFAULT_REVISION, REPO_TYPES +from .utils import DEFAULT_IGNORE_PATTERNS, _format_size, filter_repo_objects, tqdm +from .utils._runtime import is_xet_available +from .utils.sha import sha_fileobj + + +if TYPE_CHECKING: + from .hf_api import HfApi + +logger = logging.getLogger(__name__) + +WAITING_TIME_IF_NO_TASKS = 10 # seconds +MAX_NB_FILES_FETCH_UPLOAD_MODE = 100 +COMMIT_SIZE_SCALE: list[int] = [20, 50, 75, 100, 125, 200, 250, 400, 600, 1000] + +UPLOAD_BATCH_SIZE_XET = 256 # Max 256 files per upload batch for XET-enabled repos +UPLOAD_BATCH_SIZE_LFS = 1 # Otherwise, batches of 1 for regular LFS upload + +# Repository limits (from https://huggingface.co/docs/hub/repositories-recommendations) +MAX_FILES_PER_REPO = 100_000 # Recommended maximum number of files per repository +MAX_FILES_PER_FOLDER = 10_000 # Recommended maximum number of files per folder +MAX_FILE_SIZE_GB = 50 # Hard limit for individual file size +RECOMMENDED_FILE_SIZE_GB = 20 # Recommended maximum for individual file size + + +def _validate_upload_limits(paths_list: list[LocalUploadFilePaths]) -> None: + """ + Validate upload against repository limits and warn about potential issues. + + Args: + paths_list: List of file paths to be uploaded + + Warns about: + - Too many files in the repository (>100k) + - Too many entries (files or subdirectories) in a single folder (>10k) + - Files exceeding size limits (>20GB recommended, >50GB hard limit) + """ + logger.info("Running validation checks on files to upload...") + + # Check 1: Total file count + if len(paths_list) > MAX_FILES_PER_REPO: + logger.warning( + f"You are about to upload {len(paths_list):,} files. 
" + f"This exceeds the recommended limit of {MAX_FILES_PER_REPO:,} files per repository.\n" + f"Consider:\n" + f" - Splitting your data into multiple repositories\n" + f" - Using fewer, larger files (e.g., parquet files)\n" + f" - See: https://huggingface.co/docs/hub/repositories-recommendations" + ) + + # Check 2: Files and subdirectories per folder + # Track immediate children (files and subdirs) for each folder + from collections import defaultdict + + entries_per_folder: dict[str, Any] = defaultdict(lambda: {"files": 0, "subdirs": set()}) + + for paths in paths_list: + path = Path(paths.path_in_repo) + parts = path.parts + + # Count this file in its immediate parent directory + parent = str(path.parent) if str(path.parent) != "." else "." + entries_per_folder[parent]["files"] += 1 + + # Track immediate subdirectories for each parent folder + # Walk through the path components to track parent-child relationships + for i, child in enumerate(parts[:-1]): + parent = "." if i == 0 else "/".join(parts[:i]) + entries_per_folder[parent]["subdirs"].add(child) + + # Check limits for each folder + for folder, data in entries_per_folder.items(): + file_count = data["files"] + subdir_count = len(data["subdirs"]) + total_entries = file_count + subdir_count + + if total_entries > MAX_FILES_PER_FOLDER: + folder_display = "root" if folder == "." else folder + logger.warning( + f"Folder '{folder_display}' contains {total_entries:,} entries " + f"({file_count:,} files and {subdir_count:,} subdirectories). " + f"This exceeds the recommended {MAX_FILES_PER_FOLDER:,} entries per folder.\n" + "Consider reorganising into sub-folders." + ) + + # Check 3: File sizes + large_files = [] + very_large_files = [] + + for paths in paths_list: + size = paths.file_path.stat().st_size + size_gb = size / 1_000_000_000 # Use decimal GB as per Hub limits + + if size_gb > MAX_FILE_SIZE_GB: + very_large_files.append((paths.path_in_repo, size_gb)) + elif size_gb > RECOMMENDED_FILE_SIZE_GB: + large_files.append((paths.path_in_repo, size_gb)) + + # Warn about very large files (>50GB) + if very_large_files: + files_str = "\n - ".join(f"{path}: {size:.1f}GB" for path, size in very_large_files[:5]) + more_str = f"\n ... and {len(very_large_files) - 5} more files" if len(very_large_files) > 5 else "" + logger.warning( + f"Found {len(very_large_files)} files exceeding the {MAX_FILE_SIZE_GB}GB hard limit:\n" + f" - {files_str}{more_str}\n" + f"These files may fail to upload. Consider splitting them into smaller chunks." + ) + + # Warn about large files (>20GB) + if large_files: + files_str = "\n - ".join(f"{path}: {size:.1f}GB" for path, size in large_files[:5]) + more_str = f"\n ... and {len(large_files) - 5} more files" if len(large_files) > 5 else "" + logger.warning( + f"Found {len(large_files)} files larger than {RECOMMENDED_FILE_SIZE_GB}GB (recommended limit):\n" + f" - {files_str}{more_str}\n" + f"Large files may slow down loading and processing." + ) + + logger.info("Validation checks complete.") + + +def upload_large_folder_internal( + api: "HfApi", + repo_id: str, + folder_path: Union[str, Path], + *, + repo_type: str, # Repo type is required! + revision: Optional[str] = None, + private: Optional[bool] = None, + allow_patterns: Optional[Union[list[str], str]] = None, + ignore_patterns: Optional[Union[list[str], str]] = None, + num_workers: Optional[int] = None, + print_report: bool = True, + print_report_every: int = 60, +): + """Upload a large folder to the Hub in the most resilient way possible. 
+ + See [`HfApi.upload_large_folder`] for the full documentation. + """ + # 1. Check args and setup + if repo_type is None: + raise ValueError( + "For large uploads, `repo_type` is explicitly required. Please set it to `model`, `dataset` or `space`." + " If you are using the CLI, pass it as `--repo-type=model`." + ) + if repo_type not in REPO_TYPES: + raise ValueError(f"Invalid repo type, must be one of {REPO_TYPES}") + if revision is None: + revision = DEFAULT_REVISION + + folder_path = Path(folder_path).expanduser().resolve() + if not folder_path.is_dir(): + raise ValueError(f"Provided path: '{folder_path}' is not a directory") + + if ignore_patterns is None: + ignore_patterns = [] + elif isinstance(ignore_patterns, str): + ignore_patterns = [ignore_patterns] + ignore_patterns += DEFAULT_IGNORE_PATTERNS + + if num_workers is None: + nb_cores = os.cpu_count() or 1 + num_workers = max(nb_cores // 2, 1) # Use at most half of cpu cores + + # 2. Create repo if missing + repo_url = api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private, exist_ok=True) + logger.info(f"Repo created: {repo_url}") + repo_id = repo_url.repo_id + # 2.1 Check if xet is enabled to set batch file upload size + upload_batch_size = UPLOAD_BATCH_SIZE_XET if is_xet_available() else UPLOAD_BATCH_SIZE_LFS + + # 3. List files to upload + filtered_paths_list = filter_repo_objects( + (path.relative_to(folder_path).as_posix() for path in folder_path.glob("**/*") if path.is_file()), + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + ) + paths_list = [get_local_upload_paths(folder_path, relpath) for relpath in filtered_paths_list] + logger.info(f"Found {len(paths_list)} candidate files to upload") + + # Validate upload against repository limits + _validate_upload_limits(paths_list) + + logger.info("Starting upload...") + + # Read metadata for each file + items = [ + (paths, read_upload_metadata(folder_path, paths.path_in_repo)) + for paths in tqdm(paths_list, desc="Recovering from metadata files") + ] + + # 4. Start workers + status = LargeUploadStatus(items, upload_batch_size) + threads = [ + threading.Thread( + target=_worker_job, + kwargs={ + "status": status, + "api": api, + "repo_id": repo_id, + "repo_type": repo_type, + "revision": revision, + }, + ) + for _ in range(num_workers) + ] + + for thread in threads: + thread.start() + + # 5. 
Print regular reports + if print_report: + print("\n\n" + status.current_report()) + last_report_ts = time.time() + while True: + time.sleep(1) + if time.time() - last_report_ts >= print_report_every: + if print_report: + _print_overwrite(status.current_report()) + last_report_ts = time.time() + if status.is_done(): + logging.info("Is done: exiting main loop") + break + + for thread in threads: + thread.join() + + logger.info(status.current_report()) + logging.info("Upload is complete!") + + +#################### +# Logic to manage workers and synchronize tasks +#################### + + +class WorkerJob(enum.Enum): + SHA256 = enum.auto() + GET_UPLOAD_MODE = enum.auto() + PREUPLOAD_LFS = enum.auto() + COMMIT = enum.auto() + WAIT = enum.auto() # if no tasks are available but we don't want to exit + + +JOB_ITEM_T = tuple[LocalUploadFilePaths, LocalUploadFileMetadata] + + +class LargeUploadStatus: + """Contains information, queues and tasks for a large upload process.""" + + def __init__(self, items: list[JOB_ITEM_T], upload_batch_size: int = 1): + self.items = items + self.queue_sha256: "queue.Queue[JOB_ITEM_T]" = queue.Queue() + self.queue_get_upload_mode: "queue.Queue[JOB_ITEM_T]" = queue.Queue() + self.queue_preupload_lfs: "queue.Queue[JOB_ITEM_T]" = queue.Queue() + self.queue_commit: "queue.Queue[JOB_ITEM_T]" = queue.Queue() + self.lock = Lock() + + self.nb_workers_sha256: int = 0 + self.nb_workers_get_upload_mode: int = 0 + self.nb_workers_preupload_lfs: int = 0 + self.upload_batch_size: int = upload_batch_size + self.nb_workers_commit: int = 0 + self.nb_workers_waiting: int = 0 + self.last_commit_attempt: Optional[float] = None + + self._started_at = datetime.now() + self._chunk_idx: int = 1 + self._chunk_lock: Lock = Lock() + + # Setup queues + for item in self.items: + paths, metadata = item + if metadata.sha256 is None: + self.queue_sha256.put(item) + elif metadata.upload_mode is None: + self.queue_get_upload_mode.put(item) + elif metadata.upload_mode == "lfs" and not metadata.is_uploaded: + self.queue_preupload_lfs.put(item) + elif not metadata.is_committed: + self.queue_commit.put(item) + else: + logger.debug(f"Skipping file {paths.path_in_repo} (already uploaded and committed)") + + def target_chunk(self) -> int: + with self._chunk_lock: + return COMMIT_SIZE_SCALE[self._chunk_idx] + + def update_chunk(self, success: bool, nb_items: int, duration: float) -> None: + with self._chunk_lock: + if not success: + logger.warning(f"Failed to commit {nb_items} files at once. Will retry with less files in next batch.") + self._chunk_idx -= 1 + elif nb_items >= COMMIT_SIZE_SCALE[self._chunk_idx] and duration < 40: + logger.info(f"Successfully committed {nb_items} at once. 
Increasing the limit for next batch.") + self._chunk_idx += 1 + + self._chunk_idx = max(0, min(self._chunk_idx, len(COMMIT_SIZE_SCALE) - 1)) + + def current_report(self) -> str: + """Generate a report of the current status of the large upload.""" + nb_hashed = 0 + size_hashed = 0 + nb_preuploaded = 0 + nb_lfs = 0 + nb_lfs_unsure = 0 + size_preuploaded = 0 + nb_committed = 0 + size_committed = 0 + total_size = 0 + ignored_files = 0 + total_files = 0 + + with self.lock: + for _, metadata in self.items: + if metadata.should_ignore: + ignored_files += 1 + continue + total_size += metadata.size + total_files += 1 + if metadata.sha256 is not None: + nb_hashed += 1 + size_hashed += metadata.size + if metadata.upload_mode == "lfs": + nb_lfs += 1 + if metadata.upload_mode is None: + nb_lfs_unsure += 1 + if metadata.is_uploaded: + nb_preuploaded += 1 + size_preuploaded += metadata.size + if metadata.is_committed: + nb_committed += 1 + size_committed += metadata.size + total_size_str = _format_size(total_size) + + now = datetime.now() + now_str = now.strftime("%Y-%m-%d %H:%M:%S") + elapsed = now - self._started_at + elapsed_str = str(elapsed).split(".")[0] # remove milliseconds + + message = "\n" + "-" * 10 + message += f" {now_str} ({elapsed_str}) " + message += "-" * 10 + "\n" + + message += "Files: " + message += f"hashed {nb_hashed}/{total_files} ({_format_size(size_hashed)}/{total_size_str}) | " + message += f"pre-uploaded: {nb_preuploaded}/{nb_lfs} ({_format_size(size_preuploaded)}/{total_size_str})" + if nb_lfs_unsure > 0: + message += f" (+{nb_lfs_unsure} unsure)" + message += f" | committed: {nb_committed}/{total_files} ({_format_size(size_committed)}/{total_size_str})" + message += f" | ignored: {ignored_files}\n" + + message += "Workers: " + message += f"hashing: {self.nb_workers_sha256} | " + message += f"get upload mode: {self.nb_workers_get_upload_mode} | " + message += f"pre-uploading: {self.nb_workers_preupload_lfs} | " + message += f"committing: {self.nb_workers_commit} | " + message += f"waiting: {self.nb_workers_waiting}\n" + message += "-" * 51 + + return message + + def is_done(self) -> bool: + with self.lock: + return all(metadata.is_committed or metadata.should_ignore for _, metadata in self.items) + + +def _worker_job( + status: LargeUploadStatus, + api: "HfApi", + repo_id: str, + repo_type: str, + revision: str, +): + """ + Main process for a worker. The worker will perform tasks based on the priority list until all files are uploaded + and committed. If no tasks are available, the worker will wait for 10 seconds before checking again. + + If a task fails for any reason, the item(s) are put back in the queue for another worker to pick up. + + Read `upload_large_folder` docstring for more information on how tasks are prioritized. 
+ """ + while True: + next_job: Optional[tuple[WorkerJob, list[JOB_ITEM_T]]] = None + + # Determine next task + next_job = _determine_next_job(status) + if next_job is None: + return + job, items = next_job + + # Perform task + if job == WorkerJob.SHA256: + item = items[0] # single item + try: + _compute_sha256(item) + status.queue_get_upload_mode.put(item) + except KeyboardInterrupt: + raise + except Exception as e: + logger.error(f"Failed to compute sha256: {e}") + traceback.format_exc() + status.queue_sha256.put(item) + + with status.lock: + status.nb_workers_sha256 -= 1 + + elif job == WorkerJob.GET_UPLOAD_MODE: + try: + _get_upload_mode(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision) + except KeyboardInterrupt: + raise + except Exception as e: + logger.error(f"Failed to get upload mode: {e}") + traceback.format_exc() + + # Items are either: + # - dropped (if should_ignore) + # - put in LFS queue (if LFS) + # - put in commit queue (if regular) + # - or put back (if error occurred). + for item in items: + _, metadata = item + if metadata.should_ignore: + continue + if metadata.upload_mode == "lfs": + status.queue_preupload_lfs.put(item) + elif metadata.upload_mode == "regular": + status.queue_commit.put(item) + else: + status.queue_get_upload_mode.put(item) + + with status.lock: + status.nb_workers_get_upload_mode -= 1 + + elif job == WorkerJob.PREUPLOAD_LFS: + try: + _preupload_lfs(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision) + for item in items: + status.queue_commit.put(item) + except KeyboardInterrupt: + raise + except Exception as e: + logger.error(f"Failed to preupload LFS: {e}") + traceback.format_exc() + for item in items: + status.queue_preupload_lfs.put(item) + + with status.lock: + status.nb_workers_preupload_lfs -= 1 + + elif job == WorkerJob.COMMIT: + start_ts = time.time() + success = True + try: + _commit(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision) + except KeyboardInterrupt: + raise + except Exception as e: + logger.error(f"Failed to commit: {e}") + traceback.format_exc() + for item in items: + status.queue_commit.put(item) + success = False + duration = time.time() - start_ts + status.update_chunk(success, len(items), duration) + with status.lock: + status.last_commit_attempt = time.time() + status.nb_workers_commit -= 1 + + elif job == WorkerJob.WAIT: + time.sleep(WAITING_TIME_IF_NO_TASKS) + with status.lock: + status.nb_workers_waiting -= 1 + + +def _determine_next_job(status: LargeUploadStatus) -> Optional[tuple[WorkerJob, list[JOB_ITEM_T]]]: + with status.lock: + # 1. Commit if more than 5 minutes since last commit attempt (and at least 1 file) + if ( + status.nb_workers_commit == 0 + and status.queue_commit.qsize() > 0 + and status.last_commit_attempt is not None + and time.time() - status.last_commit_attempt > 5 * 60 + ): + status.nb_workers_commit += 1 + logger.debug("Job: commit (more than 5 minutes since last commit attempt)") + return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk())) + + # 2. Commit if at least 100 files are ready to commit + elif status.nb_workers_commit == 0 and status.queue_commit.qsize() >= 150: + status.nb_workers_commit += 1 + logger.debug("Job: commit (>100 files ready)") + return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk())) + + # 3. 
Get upload mode if at least 100 files + elif status.queue_get_upload_mode.qsize() >= MAX_NB_FILES_FETCH_UPLOAD_MODE: + status.nb_workers_get_upload_mode += 1 + logger.debug(f"Job: get upload mode (>{MAX_NB_FILES_FETCH_UPLOAD_MODE} files ready)") + return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE)) + + # 4. Preupload LFS file if at least `status.upload_batch_size` files and no worker is preuploading LFS + elif status.queue_preupload_lfs.qsize() >= status.upload_batch_size and status.nb_workers_preupload_lfs == 0: + status.nb_workers_preupload_lfs += 1 + logger.debug("Job: preupload LFS (no other worker preuploading LFS)") + return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, status.upload_batch_size)) + + # 5. Compute sha256 if at least 1 file and no worker is computing sha256 + elif status.queue_sha256.qsize() > 0 and status.nb_workers_sha256 == 0: + status.nb_workers_sha256 += 1 + logger.debug("Job: sha256 (no other worker computing sha256)") + return (WorkerJob.SHA256, _get_one(status.queue_sha256)) + + # 6. Get upload mode if at least 1 file and no worker is getting upload mode + elif status.queue_get_upload_mode.qsize() > 0 and status.nb_workers_get_upload_mode == 0: + status.nb_workers_get_upload_mode += 1 + logger.debug("Job: get upload mode (no other worker getting upload mode)") + return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE)) + + # 7. Preupload LFS file if at least `status.upload_batch_size` files + elif status.queue_preupload_lfs.qsize() >= status.upload_batch_size: + status.nb_workers_preupload_lfs += 1 + logger.debug("Job: preupload LFS") + return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, status.upload_batch_size)) + + # 8. Compute sha256 if at least 1 file + elif status.queue_sha256.qsize() > 0: + status.nb_workers_sha256 += 1 + logger.debug("Job: sha256") + return (WorkerJob.SHA256, _get_one(status.queue_sha256)) + + # 9. Get upload mode if at least 1 file + elif status.queue_get_upload_mode.qsize() > 0: + status.nb_workers_get_upload_mode += 1 + logger.debug("Job: get upload mode") + return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE)) + + # 10. Preupload LFS file if at least 1 file + elif status.queue_preupload_lfs.qsize() > 0: + status.nb_workers_preupload_lfs += 1 + logger.debug("Job: preupload LFS") + return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, status.upload_batch_size)) + + # 11. Commit if at least 1 file and 1 min since last commit attempt + elif ( + status.nb_workers_commit == 0 + and status.queue_commit.qsize() > 0 + and status.last_commit_attempt is not None + and time.time() - status.last_commit_attempt > 1 * 60 + ): + status.nb_workers_commit += 1 + logger.debug("Job: commit (1 min since last commit attempt)") + return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk())) + + # 12. Commit if at least 1 file all other queues are empty and all workers are waiting + # e.g. 
when it's the last commit + elif ( + status.nb_workers_commit == 0 + and status.queue_commit.qsize() > 0 + and status.queue_sha256.qsize() == 0 + and status.queue_get_upload_mode.qsize() == 0 + and status.queue_preupload_lfs.qsize() == 0 + and status.nb_workers_sha256 == 0 + and status.nb_workers_get_upload_mode == 0 + and status.nb_workers_preupload_lfs == 0 + ): + status.nb_workers_commit += 1 + logger.debug("Job: commit") + return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk())) + + # 13. If all queues are empty, exit + elif all(metadata.is_committed or metadata.should_ignore for _, metadata in status.items): + logger.info("All files have been processed! Exiting worker.") + return None + + # 14. If no task is available, wait + else: + status.nb_workers_waiting += 1 + logger.debug(f"No task available, waiting... ({WAITING_TIME_IF_NO_TASKS}s)") + return (WorkerJob.WAIT, []) + + +#################### +# Atomic jobs (sha256, get_upload_mode, preupload_lfs, commit) +#################### + + +def _compute_sha256(item: JOB_ITEM_T) -> None: + """Compute sha256 of a file and save it in metadata.""" + paths, metadata = item + if metadata.sha256 is None: + with paths.file_path.open("rb") as f: + metadata.sha256 = sha_fileobj(f).hex() + metadata.save(paths) + + +def _get_upload_mode(items: list[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None: + """Get upload mode for each file and update metadata. + + Also receive info if the file should be ignored. + """ + additions = [_build_hacky_operation(item) for item in items] + _fetch_upload_modes( + additions=additions, + repo_type=repo_type, + repo_id=repo_id, + headers=api._build_hf_headers(), + revision=quote(revision, safe=""), + endpoint=api.endpoint, + ) + for item, addition in zip(items, additions): + paths, metadata = item + metadata.upload_mode = addition._upload_mode + metadata.should_ignore = addition._should_ignore + metadata.remote_oid = addition._remote_oid + metadata.save(paths) + + +def _preupload_lfs(items: list[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None: + """Preupload LFS files and update metadata.""" + additions = [_build_hacky_operation(item) for item in items] + api.preupload_lfs_files( + repo_id=repo_id, + repo_type=repo_type, + revision=revision, + additions=additions, + ) + + for paths, metadata in items: + metadata.is_uploaded = True + metadata.save(paths) + + +def _commit(items: list[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None: + """Commit files to the repo.""" + additions = [_build_hacky_operation(item) for item in items] + api.create_commit( + repo_id=repo_id, + repo_type=repo_type, + revision=revision, + operations=additions, + commit_message="Add files using upload-large-folder tool", + ) + for paths, metadata in items: + metadata.is_committed = True + metadata.save(paths) + + +#################### +# Hacks with CommitOperationAdd to bypass checks/sha256 calculation +#################### + + +class HackyCommitOperationAdd(CommitOperationAdd): + def __post_init__(self) -> None: + if isinstance(self.path_or_fileobj, Path): + self.path_or_fileobj = str(self.path_or_fileobj) + + +def _build_hacky_operation(item: JOB_ITEM_T) -> HackyCommitOperationAdd: + paths, metadata = item + operation = HackyCommitOperationAdd(path_in_repo=paths.path_in_repo, path_or_fileobj=paths.file_path) + with paths.file_path.open("rb") as file: + sample = file.peek(512)[:512] + if metadata.sha256 is None: + raise 
ValueError("sha256 must have been computed by now!") + operation.upload_info = UploadInfo(sha256=bytes.fromhex(metadata.sha256), size=metadata.size, sample=sample) + operation._upload_mode = metadata.upload_mode # type: ignore[assignment] + operation._should_ignore = metadata.should_ignore + operation._remote_oid = metadata.remote_oid + return operation + + +#################### +# Misc helpers +#################### + + +def _get_one(queue: "queue.Queue[JOB_ITEM_T]") -> list[JOB_ITEM_T]: + return [queue.get()] + + +def _get_n(queue: "queue.Queue[JOB_ITEM_T]", n: int) -> list[JOB_ITEM_T]: + return [queue.get() for _ in range(min(queue.qsize(), n))] + + +def _print_overwrite(report: str) -> None: + """Print a report, overwriting the previous lines. + + Since tqdm in using `sys.stderr` to (re-)write progress bars, we need to use `sys.stdout` + to print the report. + + Note: works well only if no other process is writing to `sys.stdout`! + """ + report += "\n" + # Get terminal width + terminal_width = shutil.get_terminal_size().columns + + # Count number of lines that should be cleared + nb_lines = sum(len(line) // terminal_width + 1 for line in report.splitlines()) + + # Clear previous lines based on the number of lines in the report + for _ in range(nb_lines): + sys.stdout.write("\r\033[K") # Clear line + sys.stdout.write("\033[F") # Move cursor up one line + + # Print the new report, filling remaining space with whitespace + sys.stdout.write(report) + sys.stdout.write(" " * (terminal_width - len(report.splitlines()[-1]))) + sys.stdout.flush() diff --git a/env/lib/python3.13/site-packages/huggingface_hub/_webhooks_server.py b/env/lib/python3.13/site-packages/huggingface_hub/_webhooks_server.py new file mode 100644 index 0000000000000000000000000000000000000000..601a55c3d2801964d4fb2172e18e7149d33cc0b2 --- /dev/null +++ b/env/lib/python3.13/site-packages/huggingface_hub/_webhooks_server.py @@ -0,0 +1,376 @@ +# coding=utf-8 +# Copyright 2023-present, the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains `WebhooksServer` and `webhook_endpoint` to create a webhook server easily.""" + +import atexit +import inspect +import os +from functools import wraps +from typing import TYPE_CHECKING, Any, Callable, Optional + +from .utils import experimental, is_fastapi_available, is_gradio_available + + +if TYPE_CHECKING: + import gradio as gr + from fastapi import Request + +if is_fastapi_available(): + from fastapi import FastAPI, Request + from fastapi.responses import JSONResponse +else: + # Will fail at runtime if FastAPI is not available + FastAPI = Request = JSONResponse = None # type: ignore + + +_global_app: Optional["WebhooksServer"] = None +_is_local = os.environ.get("SPACE_ID") is None + + +@experimental +class WebhooksServer: + """ + The [`WebhooksServer`] class lets you create an instance of a Gradio app that can receive Huggingface webhooks. + These webhooks can be registered using the [`~WebhooksServer.add_webhook`] decorator. 
Webhook endpoints are added to + the app as a POST endpoint to the FastAPI router. Once all the webhooks are registered, the `launch` method has to be + called to start the app. + + It is recommended to accept [`WebhookPayload`] as the first argument of the webhook function. It is a Pydantic + model that contains all the information about the webhook event. The data will be parsed automatically for you. + + Check out the [webhooks guide](../guides/webhooks_server) for a step-by-step tutorial on how to set up your + WebhooksServer and deploy it on a Space. + + > [!WARNING] + > `WebhooksServer` is experimental. Its API is subject to change in the future. + + > [!WARNING] + > You must have `gradio` installed to use `WebhooksServer` (`pip install --upgrade gradio`). + + Args: + ui (`gradio.Blocks`, optional): + A Gradio UI instance to be used as the Space landing page. If `None`, a UI displaying instructions + about the configured webhooks is created. + webhook_secret (`str`, optional): + A secret key to verify incoming webhook requests. You can set this value to any secret you want as long as + you also configure it in your [webhooks settings panel](https://huggingface.co/settings/webhooks). You + can also set this value as the `WEBHOOK_SECRET` environment variable. If no secret is provided, the + webhook endpoints are opened without any security. + + Example: + + ```python + import gradio as gr + from huggingface_hub import WebhooksServer, WebhookPayload + + with gr.Blocks() as ui: + ... + + app = WebhooksServer(ui=ui, webhook_secret="my_secret_key") + + @app.add_webhook("/say_hello") + async def hello(payload: WebhookPayload): + return {"message": "hello"} + + app.launch() + ``` + """ + + def __new__(cls, *args, **kwargs) -> "WebhooksServer": + if not is_gradio_available(): + raise ImportError( + "You must have `gradio` installed to use `WebhooksServer`. Please run `pip install --upgrade gradio`" + " first." + ) + if not is_fastapi_available(): + raise ImportError( + "You must have `fastapi` installed to use `WebhooksServer`. Please run `pip install --upgrade fastapi`" + " first." + ) + return super().__new__(cls) + + def __init__( + self, + ui: Optional["gr.Blocks"] = None, + webhook_secret: Optional[str] = None, + ) -> None: + self._ui = ui + + self.webhook_secret = webhook_secret or os.getenv("WEBHOOK_SECRET") + self.registered_webhooks: dict[str, Callable] = {} + _warn_on_empty_secret(self.webhook_secret) + + def add_webhook(self, path: Optional[str] = None) -> Callable: + """ + Decorator to add a webhook to the [`WebhooksServer`] server. + + Args: + path (`str`, optional): + The URL path to register the webhook function. If not provided, the function name will be used as the + path. In any case, all webhooks are registered under `/webhooks`. + + Raises: + ValueError: If the provided path is already registered as a webhook. + + Example: + ```python + from huggingface_hub import WebhooksServer, WebhookPayload + + app = WebhooksServer() + + @app.add_webhook + async def trigger_training(payload: WebhookPayload): + if payload.repo.type == "dataset" and payload.event.action == "update": + # Trigger a training job if a dataset is updated + ... + + app.launch() + ``` + """ + # Usage: directly as decorator. Example: `@app.add_webhook` + if callable(path): + # If path is a function, it means it was used as a decorator without arguments + return self.add_webhook()(path) + + # Usage: provide a path. 
Example: `@app.add_webhook(...)` + @wraps(FastAPI.post) + def _inner_post(*args, **kwargs): + func = args[0] + abs_path = f"/webhooks/{(path or func.__name__).strip('/')}" + if abs_path in self.registered_webhooks: + raise ValueError(f"Webhook {abs_path} already exists.") + self.registered_webhooks[abs_path] = func + + return _inner_post + + def launch(self, prevent_thread_lock: bool = False, **launch_kwargs: Any) -> None: + """Launch the Gradio app and register webhooks to the underlying FastAPI server. + + Input parameters are forwarded to Gradio when launching the app. + """ + ui = self._ui or self._get_default_ui() + + # Start Gradio App + # - as non-blocking so that webhooks can be added afterwards + # - as shared if launch locally (to debug webhooks) + launch_kwargs.setdefault("share", _is_local) + self.fastapi_app, _, _ = ui.launch(prevent_thread_lock=True, **launch_kwargs) + + # Register webhooks to FastAPI app + for path, func in self.registered_webhooks.items(): + # Add secret check if required + if self.webhook_secret is not None: + func = _wrap_webhook_to_check_secret(func, webhook_secret=self.webhook_secret) + + # Add route to FastAPI app + self.fastapi_app.post(path)(func) + + # Print instructions and block main thread + space_host = os.environ.get("SPACE_HOST") + url = "https://" + space_host if space_host is not None else (ui.share_url or ui.local_url) + if url is None: + raise ValueError("Cannot find the URL of the app. Please provide a valid `ui` or update `gradio` version.") + url = url.strip("/") + message = "\nWebhooks are correctly setup and ready to use:" + message += "\n" + "\n".join(f" - POST {url}{webhook}" for webhook in self.registered_webhooks) + message += "\nGo to https://huggingface.co/settings/webhooks to setup your webhooks." + print(message) + + if not prevent_thread_lock: + ui.block_thread() + + def _get_default_ui(self) -> "gr.Blocks": + """Default UI if not provided (lists webhooks and provides basic instructions).""" + import gradio as gr + + with gr.Blocks() as ui: + gr.Markdown("# This is an app to process 🤗 Webhooks") + gr.Markdown( + "Webhooks are a foundation for MLOps-related features. They allow you to listen for new changes on" + " specific repos or to all repos belonging to particular set of users/organizations (not just your" + " repos, but any repo). Check out this [guide](https://huggingface.co/docs/hub/webhooks) to get to" + " know more about webhooks on the Huggingface Hub." + ) + gr.Markdown( + f"{len(self.registered_webhooks)} webhook(s) are registered:" + + "\n\n" + + "\n ".join( + f"- [{webhook_path}]({_get_webhook_doc_url(webhook.__name__, webhook_path)})" + for webhook_path, webhook in self.registered_webhooks.items() + ) + ) + gr.Markdown( + "Go to https://huggingface.co/settings/webhooks to setup your webhooks." + + "\nYou app is running locally. Please look at the logs to check the full URL you need to set." + if _is_local + else ( + "\nThis app is running on a Space. You can find the corresponding URL in the options menu" + " (top-right) > 'Embed the Space'. The URL looks like 'https://{username}-{repo_name}.hf.space'." + ) + ) + return ui + + +@experimental +def webhook_endpoint(path: Optional[str] = None) -> Callable: + """Decorator to start a [`WebhooksServer`] and register the decorated function as a webhook endpoint. + + This is a helper to get started quickly. If you need more flexibility (custom landing page or webhook secret), + you can use [`WebhooksServer`] directly. 
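Both `add_webhook` above and `webhook_endpoint` below accept being applied with or without parentheses; the trick is the `if callable(path)` check at the top, which detects that the "path" argument is actually the decorated function. A generic sketch of that pattern (toy decorator, not the library's):

```python
from typing import Callable, Optional, Union

def my_decorator(path: Optional[Union[str, Callable]] = None) -> Callable:
    """Usable both as `@my_decorator` and `@my_decorator("/route")`."""
    if callable(path):
        # Used without parentheses: `path` is actually the decorated function.
        return my_decorator()(path)

    def _inner(func: Callable) -> Callable:
        func.registered_path = "/" + (path or func.__name__).strip("/")
        return func

    return _inner

@my_decorator
def ping(): ...

@my_decorator("/health")
def check(): ...

print(ping.registered_path, check.registered_path)  # /ping /health
```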
You can register multiple webhook endpoints (to the same server) by using + this decorator multiple times. + + Check out the [webhooks guide](../guides/webhooks_server) for a step-by-step tutorial on how to set up your + server and deploy it on a Space. + + > [!WARNING] + > `webhook_endpoint` is experimental. Its API is subject to change in the future. + + > [!WARNING] + > You must have `gradio` installed to use `webhook_endpoint` (`pip install --upgrade gradio`). + + Args: + path (`str`, optional): + The URL path to register the webhook function. If not provided, the function name will be used as the path. + In any case, all webhooks are registered under `/webhooks`. + + Examples: + The default usage is to register a function as a webhook endpoint. The function name will be used as the path. + The server will be started automatically at exit (i.e. at the end of the script). + + ```python + from huggingface_hub import webhook_endpoint, WebhookPayload + + @webhook_endpoint + async def trigger_training(payload: WebhookPayload): + if payload.repo.type == "dataset" and payload.event.action == "update": + # Trigger a training job if a dataset is updated + ... + + # Server is automatically started at the end of the script. + ``` + + Advanced usage: register a function as a webhook endpoint and start the server manually. This is useful if you + are running it in a notebook. + + ```python + from huggingface_hub import webhook_endpoint, WebhookPayload + + @webhook_endpoint + async def trigger_training(payload: WebhookPayload): + if payload.repo.type == "dataset" and payload.event.action == "update": + # Trigger a training job if a dataset is updated + ... + + # Start the server manually + trigger_training.launch() + ``` + """ + if callable(path): + # If path is a function, it means it was used as a decorator without arguments + return webhook_endpoint()(path) + + @wraps(WebhooksServer.add_webhook) + def _inner(func: Callable) -> Callable: + app = _get_global_app() + app.add_webhook(path)(func) + if len(app.registered_webhooks) == 1: + # Register `app.launch` to run at exit (only once) + atexit.register(app.launch) + + @wraps(app.launch) + def _launch_now(): + # Run the app directly (without waiting atexit) + atexit.unregister(app.launch) + app.launch() + + func.launch = _launch_now # type: ignore + return func + + return _inner + + +def _get_global_app() -> WebhooksServer: + global _global_app + if _global_app is None: + _global_app = WebhooksServer() + return _global_app + + +def _warn_on_empty_secret(webhook_secret: Optional[str]) -> None: + if webhook_secret is None: + print("Webhook secret is not defined. This means your webhook endpoints will be open to everyone.") + print( + "To add a secret, set `WEBHOOK_SECRET` as environment variable or pass it at initialization: " + "\n\t`app = WebhooksServer(webhook_secret='my_secret', ...)`" + ) + print( + "For more details about webhook secrets, please refer to" + " https://huggingface.co/docs/hub/webhooks#webhook-secret." + ) + else: + print("Webhook secret is correctly defined.") + + +def _get_webhook_doc_url(webhook_name: str, webhook_path: str) -> str: + """Returns the anchor to a given webhook in the docs (experimental)""" + return "/docs#/default/" + webhook_name + webhook_path.replace("/", "_") + "_post" + + +def _wrap_webhook_to_check_secret(func: Callable, webhook_secret: str) -> Callable: + """Wraps a webhook function to check the webhook secret before calling the function. 
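The auto-start behaviour described in the examples relies on `atexit`: registering the first webhook schedules the server launch to run when the script ends, and the `.launch` attribute attached to the decorated function cancels that hook before starting the server immediately. A reduced sketch of the pattern (toy function names):

```python
import atexit

def _start_server() -> None:
    print("server starting...")

# Schedule the server to start automatically when the script exits.
atexit.register(_start_server)

def launch_now() -> None:
    # Start immediately instead, and cancel the scheduled at-exit start.
    atexit.unregister(_start_server)
    _start_server()
```

Calling `launch_now()` here plays the same role as `trigger_training.launch()` in the notebook example above.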
+ + This is a hacky way to add the `request` parameter to the function signature. Since FastAPI based itself on route + parameters to inject the values to the function, we need to hack the function signature to retrieve the `Request` + object (and hence the headers). A far cleaner solution would be to use a middleware. However, since + `fastapi==0.90.1`, a middleware cannot be added once the app has started. And since the FastAPI app is started by + Gradio internals (and not by us), we cannot add a middleware. + + This method is called only when a secret has been defined by the user. If a request is sent without the + "x-webhook-secret", the function will return a 401 error (unauthorized). If the header is sent but is incorrect, + the function will return a 403 error (forbidden). + + Inspired by https://stackoverflow.com/a/33112180. + """ + initial_sig = inspect.signature(func) + + @wraps(func) + async def _protected_func(request: Request, **kwargs): + request_secret = request.headers.get("x-webhook-secret") + if request_secret is None: + return JSONResponse({"error": "x-webhook-secret header not set."}, status_code=401) + if request_secret != webhook_secret: + return JSONResponse({"error": "Invalid webhook secret."}, status_code=403) + + # Inject `request` in kwargs if required + if "request" in initial_sig.parameters: + kwargs["request"] = request + + # Handle both sync and async routes + if inspect.iscoroutinefunction(func): + return await func(**kwargs) + else: + return func(**kwargs) + + # Update signature to include request + if "request" not in initial_sig.parameters: + _protected_func.__signature__ = initial_sig.replace( # type: ignore + parameters=( + inspect.Parameter(name="request", kind=inspect.Parameter.POSITIONAL_OR_KEYWORD, annotation=Request), + ) + + tuple(initial_sig.parameters.values()) + ) + + # Return protected route + return _protected_func diff --git a/env/lib/python3.13/site-packages/huggingface_hub/dataclasses.py b/env/lib/python3.13/site-packages/huggingface_hub/dataclasses.py new file mode 100644 index 0000000000000000000000000000000000000000..f801993f0eb4b37449e76718564529ce47ee8201 --- /dev/null +++ b/env/lib/python3.13/site-packages/huggingface_hub/dataclasses.py @@ -0,0 +1,609 @@ +import inspect +from dataclasses import _MISSING_TYPE, MISSING, Field, field, fields, make_dataclass +from functools import lru_cache, wraps +from typing import ( + Annotated, + Any, + Callable, + ForwardRef, + Literal, + Optional, + Type, + TypeVar, + Union, + get_args, + get_origin, + overload, +) + + +try: + # Python 3.11+ + from typing import NotRequired, Required # type: ignore +except ImportError: + try: + # In case typing_extensions is installed + from typing_extensions import NotRequired, Required # type: ignore + except ImportError: + # Fallback: create dummy types that will never match + Required = type("Required", (), {}) # type: ignore + NotRequired = type("NotRequired", (), {}) # type: ignore + +from .errors import ( + StrictDataclassClassValidationError, + StrictDataclassDefinitionError, + StrictDataclassFieldValidationError, +) + + +Validator_T = Callable[[Any], None] +T = TypeVar("T") +TypedDictType = TypeVar("TypedDictType", bound=dict[str, Any]) + +_TYPED_DICT_DEFAULT_VALUE = object() # used as default value in TypedDict fields (to distinguish from None) + + +# The overload decorator helps type checkers understand the different return types +@overload +def strict(cls: Type[T]) -> Type[T]: ... 
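The signature hack described above can be reproduced in a few lines: wrap the route handler, then advertise an extra leading `request` parameter via `inspect.Signature.replace` so the framework injects it. A framework-agnostic sketch under toy names (not the module's own code):

```python
import inspect
from functools import wraps

def expose_request(func):
    """Wrap `func` and advertise an extra leading `request` parameter."""
    sig = inspect.signature(func)

    @wraps(func)
    def wrapper(request, **kwargs):
        if "request" in sig.parameters:  # only forward it if the wrapped function wants it
            kwargs["request"] = request
        return func(**kwargs)

    if "request" not in sig.parameters:
        wrapper.__signature__ = sig.replace(
            parameters=(inspect.Parameter("request", inspect.Parameter.POSITIONAL_OR_KEYWORD),)
            + tuple(sig.parameters.values())
        )
    return wrapper

def handler(payload: dict):
    return payload

print(inspect.signature(expose_request(handler)))  # (request, payload: dict)
```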
+ + +@overload +def strict(*, accept_kwargs: bool = False) -> Callable[[Type[T]], Type[T]]: ... + + +def strict( + cls: Optional[Type[T]] = None, *, accept_kwargs: bool = False +) -> Union[Type[T], Callable[[Type[T]], Type[T]]]: + """ + Decorator to add strict validation to a dataclass. + + This decorator must be used on top of `@dataclass` to ensure IDEs and static typing tools + recognize the class as a dataclass. + + Can be used with or without arguments: + - `@strict` + - `@strict(accept_kwargs=True)` + + Args: + cls: + The class to convert to a strict dataclass. + accept_kwargs (`bool`, *optional*): + If True, allows arbitrary keyword arguments in `__init__`. Defaults to False. + + Returns: + The enhanced dataclass with strict validation on field assignment. + + Example: + ```py + >>> from dataclasses import dataclass + >>> from huggingface_hub.dataclasses import as_validated_field, strict, validated_field + + >>> @as_validated_field + >>> def positive_int(value: int): + ... if not value >= 0: + ... raise ValueError(f"Value must be positive, got {value}") + + >>> @strict(accept_kwargs=True) + ... @dataclass + ... class User: + ... name: str + ... age: int = positive_int(default=10) + + # Initialize + >>> User(name="John") + User(name='John', age=10) + + # Extra kwargs are accepted + >>> User(name="John", age=30, lastname="Doe") + User(name='John', age=30, *lastname='Doe') + + # Invalid type => raises + >>> User(name="John", age="30") + huggingface_hub.errors.StrictDataclassFieldValidationError: Validation error for field 'age': + TypeError: Field 'age' expected int, got str (value: '30') + + # Invalid value => raises + >>> User(name="John", age=-1) + huggingface_hub.errors.StrictDataclassFieldValidationError: Validation error for field 'age': + ValueError: Value must be positive, got -1 + ``` + """ + + def wrap(cls: Type[T]) -> Type[T]: + if not hasattr(cls, "__dataclass_fields__"): + raise StrictDataclassDefinitionError( + f"Class '{cls.__name__}' must be a dataclass before applying @strict." + ) + + # List and store validators + field_validators: dict[str, list[Validator_T]] = {} + for f in fields(cls): # type: ignore [arg-type] + validators = [] + validators.append(_create_type_validator(f)) + custom_validator = f.metadata.get("validator") + if custom_validator is not None: + if not isinstance(custom_validator, list): + custom_validator = [custom_validator] + for validator in custom_validator: + if not _is_validator(validator): + raise StrictDataclassDefinitionError( + f"Invalid validator for field '{f.name}': {validator}. Must be a callable taking a single argument." 
+ ) + validators.extend(custom_validator) + field_validators[f.name] = validators + cls.__validators__ = field_validators # type: ignore + + # Override __setattr__ to validate fields on assignment + original_setattr = cls.__setattr__ + + def __strict_setattr__(self: Any, name: str, value: Any) -> None: + """Custom __setattr__ method for strict dataclasses.""" + # Run all validators + for validator in self.__validators__.get(name, []): + try: + validator(value) + except (ValueError, TypeError) as e: + raise StrictDataclassFieldValidationError(field=name, cause=e) from e + + # If validation passed, set the attribute + original_setattr(self, name, value) + + cls.__setattr__ = __strict_setattr__ # type: ignore[method-assign] + + if accept_kwargs: + # (optional) Override __init__ to accept arbitrary keyword arguments + original_init = cls.__init__ + + @wraps(original_init) + def __init__(self, **kwargs: Any) -> None: + # Extract only the fields that are part of the dataclass + dataclass_fields = {f.name for f in fields(cls)} # type: ignore [arg-type] + standard_kwargs = {k: v for k, v in kwargs.items() if k in dataclass_fields} + + # Call the original __init__ with standard fields + original_init(self, **standard_kwargs) + + # Add any additional kwargs as attributes + for name, value in kwargs.items(): + if name not in dataclass_fields: + self.__setattr__(name, value) + + cls.__init__ = __init__ # type: ignore[method-assign] + + # (optional) Override __repr__ to include additional kwargs + original_repr = cls.__repr__ + + @wraps(original_repr) + def __repr__(self) -> str: + # Call the original __repr__ to get the standard fields + standard_repr = original_repr(self) + + # Get additional kwargs + additional_kwargs = [ + # add a '*' in front of additional kwargs to let the user know they are not part of the dataclass + f"*{k}={v!r}" + for k, v in self.__dict__.items() + if k not in cls.__dataclass_fields__ # type: ignore [attr-defined] + ] + additional_repr = ", ".join(additional_kwargs) + + # Combine both representations + return f"{standard_repr[:-1]}, {additional_repr})" if additional_kwargs else standard_repr + + cls.__repr__ = __repr__ # type: ignore [method-assign] + + # List all public methods starting with `validate_` => class validators. + class_validators = [] + + for name in dir(cls): + if not name.startswith("validate_"): + continue + method = getattr(cls, name) + if not callable(method): + continue + if len(inspect.signature(method).parameters) != 1: + raise StrictDataclassDefinitionError( + f"Class '{cls.__name__}' has a class validator '{name}' that takes more than one argument." + " Class validators must take only 'self' as an argument. Methods starting with 'validate_'" + " are considered to be class validators." 
+ ) + class_validators.append(method) + + cls.__class_validators__ = class_validators # type: ignore [attr-defined] + + # Add `validate` method to the class, but first check if it already exists + def validate(self: T) -> None: + """Run class validators on the instance.""" + for validator in cls.__class_validators__: # type: ignore [attr-defined] + try: + validator(self) + except (ValueError, TypeError) as e: + raise StrictDataclassClassValidationError(validator=validator.__name__, cause=e) from e + + # Hack to be able to raise if `.validate()` already exists except if it was created by this decorator on a parent class + # (in which case we just override it) + validate.__is_defined_by_strict_decorator__ = True # type: ignore [attr-defined] + + if hasattr(cls, "validate"): + if not getattr(cls.validate, "__is_defined_by_strict_decorator__", False): # type: ignore [attr-defined] + raise StrictDataclassDefinitionError( + f"Class '{cls.__name__}' already implements a method called 'validate'." + " This method name is reserved when using the @strict decorator on a dataclass." + " If you want to keep your own method, please rename it." + ) + + cls.validate = validate # type: ignore + + # Run class validators after initialization + initial_init = cls.__init__ + + @wraps(initial_init) + def init_with_validate(self, *args, **kwargs) -> None: + """Run class validators after initialization.""" + initial_init(self, *args, **kwargs) # type: ignore [call-arg] + cls.validate(self) # type: ignore [attr-defined] + + setattr(cls, "__init__", init_with_validate) + + return cls + + # Return wrapped class or the decorator itself + return wrap(cls) if cls is not None else wrap + + +def validate_typed_dict(schema: type[TypedDictType], data: dict) -> None: + """ + Validate that a dictionary conforms to the types defined in a TypedDict class. + + Under the hood, the typed dict is converted to a strict dataclass and validated using the `@strict` decorator. + + Args: + schema (`type[TypedDictType]`): + The TypedDict class defining the expected structure and types. + data (`dict`): + The dictionary to validate. + + Raises: + `StrictDataclassFieldValidationError`: + If any field in the dictionary does not conform to the expected type. + + Example: + ```py + >>> from typing import Annotated, TypedDict + >>> from huggingface_hub.dataclasses import validate_typed_dict + + >>> def positive_int(value: int): + ... if not value >= 0: + ... raise ValueError(f"Value must be positive, got {value}") + + >>> class User(TypedDict): + ... name: str + ... 
age: Annotated[int, positive_int] + + >>> # Valid data + >>> validate_typed_dict(User, {"name": "John", "age": 30}) + + >>> # Invalid type for age + >>> validate_typed_dict(User, {"name": "John", "age": "30"}) + huggingface_hub.errors.StrictDataclassFieldValidationError: Validation error for field 'age': + TypeError: Field 'age' expected int, got str (value: '30') + + >>> # Invalid value for age + >>> validate_typed_dict(User, {"name": "John", "age": -1}) + huggingface_hub.errors.StrictDataclassFieldValidationError: Validation error for field 'age': + ValueError: Value must be positive, got -1 + ``` + """ + # Convert typed dict to dataclass + strict_cls = _build_strict_cls_from_typed_dict(schema) + + # Validate the data by instantiating the strict dataclass + strict_cls(**data) # will raise if validation fails + + +@lru_cache +def _build_strict_cls_from_typed_dict(schema: type[TypedDictType]) -> Type: + # Extract type hints from the TypedDict class + type_hints = _get_typed_dict_annotations(schema) + + # If the TypedDict is not total, wrap fields as NotRequired (unless explicitly Required or NotRequired) + if not getattr(schema, "__total__", True): + for key, value in type_hints.items(): + origin = get_origin(value) + + if origin is Annotated: + base, *meta = get_args(value) + if not _is_required_or_notrequired(base): + base = NotRequired[base] + type_hints[key] = Annotated[tuple([base] + list(meta))] # type: ignore + elif not _is_required_or_notrequired(value): + type_hints[key] = NotRequired[value] + + # Convert type hints to dataclass fields + fields = [] + for key, value in type_hints.items(): + if get_origin(value) is Annotated: + base, *meta = get_args(value) + fields.append((key, base, field(default=_TYPED_DICT_DEFAULT_VALUE, metadata={"validator": meta[0]}))) + else: + fields.append((key, value, field(default=_TYPED_DICT_DEFAULT_VALUE))) + + # Create a strict dataclass from the TypedDict fields + return strict(make_dataclass(schema.__name__, fields)) + + +def _get_typed_dict_annotations(schema: type[TypedDictType]) -> dict[str, Any]: + """Extract type annotations from a TypedDict class.""" + try: + # Available in Python 3.14+ + import annotationlib + + return annotationlib.get_annotations(schema) + except ImportError: + return { + # We do not use `get_type_hints` here to avoid evaluating ForwardRefs (which might fail). + # ForwardRefs are not validated by @strict anyway. + name: value if value is not None else type(None) + for name, value in schema.__dict__.get("__annotations__", {}).items() + } + + +def validated_field( + validator: Union[list[Validator_T], Validator_T], + default: Union[Any, _MISSING_TYPE] = MISSING, + default_factory: Union[Callable[[], Any], _MISSING_TYPE] = MISSING, + init: bool = True, + repr: bool = True, + hash: Optional[bool] = None, + compare: bool = True, + metadata: Optional[dict] = None, + **kwargs: Any, +) -> Any: + """ + Create a dataclass field with a custom validator. + + Useful to apply several checks to a field. If only applying one rule, check out the [`as_validated_field`] decorator. + + Args: + validator (`Callable` or `list[Callable]`): + A method that takes a value as input and raises ValueError/TypeError if the value is invalid. + Can be a list of validators to apply multiple checks. + **kwargs: + Additional arguments to pass to `dataclasses.field()`. 
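Building on the signature above, here is a short usage sketch combining `@strict` with `validated_field` and two custom validators. It assumes the decorators behave as documented in this module; the class and validators are illustrative only:

```python
from dataclasses import dataclass

from huggingface_hub.dataclasses import strict, validated_field
from huggingface_hub.errors import StrictDataclassFieldValidationError

def non_empty(value: str) -> None:
    if not value:
        raise ValueError("value must not be empty")

def no_spaces(value: str) -> None:
    if " " in value:
        raise ValueError("value must not contain spaces")

@strict
@dataclass
class RepoName:
    # Type check plus both custom validators run at init time and on later assignment.
    name: str = validated_field([non_empty, no_spaces], default="model")

repo = RepoName(name="bert-base")      # passes
try:
    repo.name = "bad name"             # no_spaces raises ValueError, wrapped by @strict
except StrictDataclassFieldValidationError as e:
    print(e)
```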
+ + Returns: + A field with the validator attached in metadata + """ + if not isinstance(validator, list): + validator = [validator] + if metadata is None: + metadata = {} + metadata["validator"] = validator + return field( # type: ignore + default=default, # type: ignore [arg-type] + default_factory=default_factory, # type: ignore [arg-type] + init=init, + repr=repr, + hash=hash, + compare=compare, + metadata=metadata, + **kwargs, + ) + + +def as_validated_field(validator: Validator_T): + """ + Decorates a validator function as a [`validated_field`] (i.e. a dataclass field with a custom validator). + + Args: + validator (`Callable`): + A method that takes a value as input and raises ValueError/TypeError if the value is invalid. + """ + + def _inner( + default: Union[Any, _MISSING_TYPE] = MISSING, + default_factory: Union[Callable[[], Any], _MISSING_TYPE] = MISSING, + init: bool = True, + repr: bool = True, + hash: Optional[bool] = None, + compare: bool = True, + metadata: Optional[dict] = None, + **kwargs: Any, + ): + return validated_field( + validator, + default=default, + default_factory=default_factory, + init=init, + repr=repr, + hash=hash, + compare=compare, + metadata=metadata, + **kwargs, + ) + + return _inner + + +def type_validator(name: str, value: Any, expected_type: Any) -> None: + """Validate that 'value' matches 'expected_type'.""" + origin = get_origin(expected_type) + args = get_args(expected_type) + + if expected_type is Any: + return + elif validator := _BASIC_TYPE_VALIDATORS.get(origin): + validator(name, value, args) + elif isinstance(expected_type, type): # simple types + _validate_simple_type(name, value, expected_type) + elif isinstance(expected_type, ForwardRef) or isinstance(expected_type, str): + return + elif origin is Required: + if value is _TYPED_DICT_DEFAULT_VALUE: + raise TypeError(f"Field '{name}' is required but missing.") + type_validator(name, value, args[0]) + elif origin is NotRequired: + if value is _TYPED_DICT_DEFAULT_VALUE: + return + type_validator(name, value, args[0]) + else: + raise TypeError(f"Unsupported type for field '{name}': {expected_type}") + + +def _validate_union(name: str, value: Any, args: tuple[Any, ...]) -> None: + """Validate that value matches one of the types in a Union.""" + errors = [] + for t in args: + try: + type_validator(name, value, t) + return # Valid if any type matches + except TypeError as e: + errors.append(str(e)) + + raise TypeError( + f"Field '{name}' with value {repr(value)} doesn't match any type in {args}. 
Errors: {'; '.join(errors)}" + ) + + +def _validate_literal(name: str, value: Any, args: tuple[Any, ...]) -> None: + """Validate Literal type.""" + if value not in args: + raise TypeError(f"Field '{name}' expected one of {args}, got {value}") + + +def _validate_list(name: str, value: Any, args: tuple[Any, ...]) -> None: + """Validate list[T] type.""" + if not isinstance(value, list): + raise TypeError(f"Field '{name}' expected a list, got {type(value).__name__}") + + # Validate each item in the list + item_type = args[0] + for i, item in enumerate(value): + try: + type_validator(f"{name}[{i}]", item, item_type) + except TypeError as e: + raise TypeError(f"Invalid item at index {i} in list '{name}'") from e + + +def _validate_dict(name: str, value: Any, args: tuple[Any, ...]) -> None: + """Validate dict[K, V] type.""" + if not isinstance(value, dict): + raise TypeError(f"Field '{name}' expected a dict, got {type(value).__name__}") + + # Validate keys and values + key_type, value_type = args + for k, v in value.items(): + try: + type_validator(f"{name}.key", k, key_type) + type_validator(f"{name}[{k!r}]", v, value_type) + except TypeError as e: + raise TypeError(f"Invalid key or value in dict '{name}'") from e + + +def _validate_tuple(name: str, value: Any, args: tuple[Any, ...]) -> None: + """Validate Tuple type.""" + if not isinstance(value, tuple): + raise TypeError(f"Field '{name}' expected a tuple, got {type(value).__name__}") + + # Handle variable-length tuples: tuple[T, ...] + if len(args) == 2 and args[1] is Ellipsis: + for i, item in enumerate(value): + try: + type_validator(f"{name}[{i}]", item, args[0]) + except TypeError as e: + raise TypeError(f"Invalid item at index {i} in tuple '{name}'") from e + # Handle fixed-length tuples: tuple[T1, T2, ...] + elif len(args) != len(value): + raise TypeError(f"Field '{name}' expected a tuple of length {len(args)}, got {len(value)}") + else: + for i, (item, expected) in enumerate(zip(value, args)): + try: + type_validator(f"{name}[{i}]", item, expected) + except TypeError as e: + raise TypeError(f"Invalid item at index {i} in tuple '{name}'") from e + + +def _validate_set(name: str, value: Any, args: tuple[Any, ...]) -> None: + """Validate set[T] type.""" + if not isinstance(value, set): + raise TypeError(f"Field '{name}' expected a set, got {type(value).__name__}") + + # Validate each item in the set + item_type = args[0] + for i, item in enumerate(value): + try: + type_validator(f"{name} item", item, item_type) + except TypeError as e: + raise TypeError(f"Invalid item in set '{name}'") from e + + +def _validate_simple_type(name: str, value: Any, expected_type: type) -> None: + """Validate simple type (int, str, etc.).""" + if not isinstance(value, expected_type): + raise TypeError( + f"Field '{name}' expected {expected_type.__name__}, got {type(value).__name__} (value: {repr(value)})" + ) + + +def _create_type_validator(field: Field) -> Validator_T: + """Create a type validator function for a field.""" + # Hacky: we cannot use a lambda here because of reference issues + + def validator(value: Any) -> None: + type_validator(field.name, value, field.type) + + return validator + + +def _is_validator(validator: Any) -> bool: + """Check if a function is a validator. + + A validator is a Callable that can be called with a single positional argument. + The validator can have more arguments with default values. + + Basically, returns True if `validator(value)` is possible. 
+ """ + if not callable(validator): + return False + + signature = inspect.signature(validator) + parameters = list(signature.parameters.values()) + if len(parameters) == 0: + return False + if parameters[0].kind not in ( + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.VAR_POSITIONAL, + ): + return False + for parameter in parameters[1:]: + if parameter.default == inspect.Parameter.empty: + return False + return True + + +def _is_required_or_notrequired(type_hint: Any) -> bool: + """Helper to check if a type is Required/NotRequired.""" + return type_hint in (Required, NotRequired) or (get_origin(type_hint) in (Required, NotRequired)) + + +_BASIC_TYPE_VALIDATORS = { + Union: _validate_union, + Literal: _validate_literal, + list: _validate_list, + dict: _validate_dict, + tuple: _validate_tuple, + set: _validate_set, +} + + +__all__ = [ + "strict", + "validate_typed_dict", + "validated_field", + "Validator_T", + "StrictDataclassClassValidationError", + "StrictDataclassDefinitionError", + "StrictDataclassFieldValidationError", +] diff --git a/env/lib/python3.13/site-packages/huggingface_hub/hf_file_system.py b/env/lib/python3.13/site-packages/huggingface_hub/hf_file_system.py new file mode 100644 index 0000000000000000000000000000000000000000..d81721c8500dc90ec175a784741fe5947475291a --- /dev/null +++ b/env/lib/python3.13/site-packages/huggingface_hub/hf_file_system.py @@ -0,0 +1,1275 @@ +import os +import re +import tempfile +import threading +from collections import deque +from contextlib import ExitStack +from copy import deepcopy +from dataclasses import dataclass, field +from datetime import datetime +from itertools import chain +from pathlib import Path +from typing import Any, Iterator, NoReturn, Optional, Union +from urllib.parse import quote, unquote + +import fsspec +import httpx +from fsspec.callbacks import _DEFAULT_CALLBACK, NoOpCallback, TqdmCallback +from fsspec.utils import isfilelike + +from . import constants +from ._commit_api import CommitOperationCopy, CommitOperationDelete +from .errors import EntryNotFoundError, HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError +from .file_download import hf_hub_url, http_get +from .hf_api import HfApi, LastCommitInfo, RepoFile +from .utils import HFValidationError, hf_raise_for_status, http_backoff, http_stream_backoff +from .utils.insecure_hashlib import md5 + + +# Regex used to match special revisions with "/" in them (see #1710) +SPECIAL_REFS_REVISION_REGEX = re.compile( + r""" + (^refs\/convert\/\w+) # `refs/convert/parquet` revisions + | + (^refs\/pr\/\d+) # PR revisions + """, + re.VERBOSE, +) + + +@dataclass +class HfFileSystemResolvedPath: + """Data structure containing information about a resolved Hugging Face file system path.""" + + repo_type: str + repo_id: str + revision: str + path_in_repo: str + # The part placed after '@' in the initial path. It can be a quoted or unquoted refs revision. + # Used to reconstruct the unresolved path to return to the user. 
+ _raw_revision: Optional[str] = field(default=None, repr=False) + + def unresolve(self) -> str: + repo_path = constants.REPO_TYPES_URL_PREFIXES.get(self.repo_type, "") + self.repo_id + if self._raw_revision: + return f"{repo_path}@{self._raw_revision}/{self.path_in_repo}".rstrip("/") + elif self.revision != constants.DEFAULT_REVISION: + return f"{repo_path}@{safe_revision(self.revision)}/{self.path_in_repo}".rstrip("/") + else: + return f"{repo_path}/{self.path_in_repo}".rstrip("/") + + +# We need to improve fsspec.spec._Cached which is AbstractFileSystem's metaclass +_cached_base: Any = type(fsspec.AbstractFileSystem) + + +class _Cached(_cached_base): + """ + Metaclass for caching HfFileSystem instances according to the args. + + This creates an additional reference to the filesystem, which prevents the + filesystem from being garbage collected when all *user* references go away. + A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also* + be made for a filesystem instance to be garbage collected. + + This is a slightly modified version of `fsspec.spec._Cached` to improve it. + In particular in `_tokenize` the pid isn't taken into account for the + `fs_token` used to identify cached instances. The `fs_token` logic is also + robust to defaults values and the order of the args. Finally new instances + reuse the states from sister instances in the main thread. + """ + + def __init__(cls, *args, **kwargs): + # Hack: override https://github.com/fsspec/filesystem_spec/blob/dcb167e8f50e6273d4cfdfc4cab8fc5aa4c958bf/fsspec/spec.py#L53 + super().__init__(*args, **kwargs) + # Note: we intentionally create a reference here, to avoid garbage + # collecting instances when all other references are gone. To really + # delete a FileSystem, the cache must be cleared. + cls._cache = {} + + def __call__(cls, *args, **kwargs): + # Hack: override https://github.com/fsspec/filesystem_spec/blob/dcb167e8f50e6273d4cfdfc4cab8fc5aa4c958bf/fsspec/spec.py#L65 + skip = kwargs.pop("skip_instance_cache", False) + fs_token = cls._tokenize(cls, threading.get_ident(), *args, **kwargs) + fs_token_main_thread = cls._tokenize(cls, threading.main_thread().ident, *args, **kwargs) + if not skip and cls.cachable and fs_token in cls._cache: + # reuse cached instance + cls._latest = fs_token + return cls._cache[fs_token] + else: + # create new instance + obj = type.__call__(cls, *args, **kwargs) + if not skip and cls.cachable and fs_token_main_thread in cls._cache: + # reuse the cache from the main thread instance in the new instance + instance_state = cls._cache[fs_token_main_thread]._get_instance_state() + for attr, state_value in instance_state.items(): + setattr(obj, attr, state_value) + obj._fs_token_ = fs_token + obj.storage_args = args + obj.storage_options = kwargs + if cls.cachable and not skip: + cls._latest = fs_token + cls._cache[fs_token] = obj + return obj + + +class HfFileSystem(fsspec.AbstractFileSystem, metaclass=_Cached): + """ + Access a remote Hugging Face Hub repository as if were a local file system. + + > [!WARNING] + > [`HfFileSystem`] provides fsspec compatibility, which is useful for libraries that require it (e.g., reading + > Hugging Face datasets directly with `pandas`). However, it introduces additional overhead due to this compatibility + > layer. For better performance and reliability, it's recommended to use `HfApi` methods when possible. + + Args: + endpoint (`str`, *optional*): + Endpoint of the Hub. Defaults to . 
+ token (`bool` or `str`, *optional*): + A valid user access token (string). Defaults to the locally saved + token, which is the recommended method for authentication (see + https://huggingface.co/docs/huggingface_hub/quick-start#authentication). + To disable authentication, pass `False`. + block_size (`int`, *optional*): + Block size for reading and writing files. + expand_info (`bool`, *optional*): + Whether to expand the information of the files. + **storage_options (`dict`, *optional*): + Additional options for the filesystem. See [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.__init__). + + Usage: + + ```python + >>> from huggingface_hub import hffs + + >>> # List files + >>> hffs.glob("my-username/my-model/*.bin") + ['my-username/my-model/pytorch_model.bin'] + >>> hffs.ls("datasets/my-username/my-dataset", detail=False) + ['datasets/my-username/my-dataset/.gitattributes', 'datasets/my-username/my-dataset/README.md', 'datasets/my-username/my-dataset/data.json'] + + >>> # Read/write files + >>> with hffs.open("my-username/my-model/pytorch_model.bin") as f: + ... data = f.read() + >>> with hffs.open("my-username/my-model/pytorch_model.bin", "wb") as f: + ... f.write(data) + ``` + + Specify a token for authentication: + ```python + >>> from huggingface_hub import HfFileSystem + >>> hffs = HfFileSystem(token=token) + ``` + """ + + root_marker = "" + protocol = "hf" + + def __init__( + self, + *args, + endpoint: Optional[str] = None, + token: Union[bool, str, None] = None, + block_size: Optional[int] = None, + expand_info: Optional[bool] = None, + **storage_options, + ): + super().__init__(*args, **storage_options) + self.endpoint = endpoint or constants.ENDPOINT + self.token = token + self._api = HfApi(endpoint=endpoint, token=token) + self.block_size = block_size + self.expand_info = expand_info + # Maps (repo_type, repo_id, revision) to a 2-tuple with: + # * the 1st element indicating whether the repositoy and the revision exist + # * the 2nd element being the exception raised if the repository or revision doesn't exist + self._repo_and_revision_exists_cache: dict[ + tuple[str, str, Optional[str]], tuple[bool, Optional[Exception]] + ] = {} + # Maps parent directory path to path infos + self.dircache: dict[str, list[dict[str, Any]]] = {} + + @classmethod + def _tokenize(cls, threading_ident: int, *args, **kwargs) -> str: + """Deterministic token for caching""" + # make fs_token robust to default values and to kwargs order + kwargs["endpoint"] = kwargs.get("endpoint") or constants.ENDPOINT + kwargs["token"] = kwargs.get("token") + kwargs = {key: kwargs[key] for key in sorted(kwargs)} + # contrary to fsspec, we don't include pid here + tokenize_args = (cls, threading_ident, args, kwargs) + h = md5(str(tokenize_args).encode()) + return h.hexdigest() + + def _repo_and_revision_exist( + self, repo_type: str, repo_id: str, revision: Optional[str] + ) -> tuple[bool, Optional[Exception]]: + if (repo_type, repo_id, revision) not in self._repo_and_revision_exists_cache: + try: + self._api.repo_info( + repo_id, revision=revision, repo_type=repo_type, timeout=constants.HF_HUB_ETAG_TIMEOUT + ) + except (RepositoryNotFoundError, HFValidationError) as e: + self._repo_and_revision_exists_cache[(repo_type, repo_id, revision)] = False, e + self._repo_and_revision_exists_cache[(repo_type, repo_id, None)] = False, e + except RevisionNotFoundError as e: + self._repo_and_revision_exists_cache[(repo_type, repo_id, revision)] = False, e + 
self._repo_and_revision_exists_cache[(repo_type, repo_id, None)] = True, None + else: + self._repo_and_revision_exists_cache[(repo_type, repo_id, revision)] = True, None + self._repo_and_revision_exists_cache[(repo_type, repo_id, None)] = True, None + return self._repo_and_revision_exists_cache[(repo_type, repo_id, revision)] + + def resolve_path(self, path: str, revision: Optional[str] = None) -> HfFileSystemResolvedPath: + """ + Resolve a Hugging Face file system path into its components. + + Args: + path (`str`): + Path to resolve. + revision (`str`, *optional*): + The revision of the repo to resolve. Defaults to the revision specified in the path. + + Returns: + [`HfFileSystemResolvedPath`]: Resolved path information containing `repo_type`, `repo_id`, `revision` and `path_in_repo`. + + Raises: + `ValueError`: + If path contains conflicting revision information. + `NotImplementedError`: + If trying to list repositories. + """ + + def _align_revision_in_path_with_revision( + revision_in_path: Optional[str], revision: Optional[str] + ) -> Optional[str]: + if revision is not None: + if revision_in_path is not None and revision_in_path != revision: + raise ValueError( + f'Revision specified in path ("{revision_in_path}") and in `revision` argument ("{revision}")' + " are not the same." + ) + else: + revision = revision_in_path + return revision + + path = self._strip_protocol(path) + if not path: + # can't list repositories at root + raise NotImplementedError("Access to repositories lists is not implemented.") + elif path.split("/")[0] + "/" in constants.REPO_TYPES_URL_PREFIXES.values(): + if "/" not in path: + # can't list repositories at the repository type level + raise NotImplementedError("Access to repositories lists is not implemented.") + repo_type, path = path.split("/", 1) + repo_type = constants.REPO_TYPES_MAPPING[repo_type] + else: + repo_type = constants.REPO_TYPE_MODEL + if path.count("/") > 0: + if "@" in path: + repo_id, revision_in_path = path.split("@", 1) + if "/" in revision_in_path: + match = SPECIAL_REFS_REVISION_REGEX.search(revision_in_path) + if match is not None and revision in (None, match.group()): + # Handle `refs/convert/parquet` and PR revisions separately + path_in_repo = SPECIAL_REFS_REVISION_REGEX.sub("", revision_in_path).lstrip("/") + revision_in_path = match.group() + else: + revision_in_path, path_in_repo = revision_in_path.split("/", 1) + else: + path_in_repo = "" + revision = _align_revision_in_path_with_revision(unquote(revision_in_path), revision) + repo_and_revision_exist, err = self._repo_and_revision_exist(repo_type, repo_id, revision) + if not repo_and_revision_exist: + _raise_file_not_found(path, err) + else: + revision_in_path = None + repo_id_with_namespace = "/".join(path.split("/")[:2]) + path_in_repo_with_namespace = "/".join(path.split("/")[2:]) + repo_id_without_namespace = path.split("/")[0] + path_in_repo_without_namespace = "/".join(path.split("/")[1:]) + repo_id = repo_id_with_namespace + path_in_repo = path_in_repo_with_namespace + repo_and_revision_exist, err = self._repo_and_revision_exist(repo_type, repo_id, revision) + if not repo_and_revision_exist: + if isinstance(err, (RepositoryNotFoundError, HFValidationError)): + repo_id = repo_id_without_namespace + path_in_repo = path_in_repo_without_namespace + repo_and_revision_exist, _ = self._repo_and_revision_exist(repo_type, repo_id, revision) + if not repo_and_revision_exist: + _raise_file_not_found(path, err) + else: + _raise_file_not_found(path, err) + else: + repo_id = path + 
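+            # No "/" in the path: it is a bare repo id, possibly with an "@<revision>" suffix
+            # (e.g. the hypothetical "my-model" or "my-model@main"); there is no path_in_repo here.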
path_in_repo = "" + if "@" in path: + repo_id, revision_in_path = path.split("@", 1) + revision = _align_revision_in_path_with_revision(unquote(revision_in_path), revision) + else: + revision_in_path = None + repo_and_revision_exist, _ = self._repo_and_revision_exist(repo_type, repo_id, revision) + if not repo_and_revision_exist: + raise NotImplementedError("Access to repositories lists is not implemented.") + + revision = revision if revision is not None else constants.DEFAULT_REVISION + return HfFileSystemResolvedPath(repo_type, repo_id, revision, path_in_repo, _raw_revision=revision_in_path) + + def invalidate_cache(self, path: Optional[str] = None) -> None: + """ + Clear the cache for a given path. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.invalidate_cache). + + Args: + path (`str`, *optional*): + Path to clear from cache. If not provided, clear the entire cache. + + """ + if not path: + self.dircache.clear() + self._repo_and_revision_exists_cache.clear() + else: + resolved_path = self.resolve_path(path) + path = resolved_path.unresolve() + while path: + self.dircache.pop(path, None) + path = self._parent(path) + + # Only clear repo cache if path is to repo root + if not resolved_path.path_in_repo: + self._repo_and_revision_exists_cache.pop((resolved_path.repo_type, resolved_path.repo_id, None), None) + self._repo_and_revision_exists_cache.pop( + (resolved_path.repo_type, resolved_path.repo_id, resolved_path.revision), None + ) + + def _open( # type: ignore[override] + self, + path: str, + mode: str = "rb", + block_size: Optional[int] = None, + revision: Optional[str] = None, + **kwargs, + ) -> Union["HfFileSystemFile", "HfFileSystemStreamFile"]: + block_size = block_size if block_size is not None else self.block_size + if block_size is not None: + kwargs["block_size"] = block_size + if "a" in mode: + raise NotImplementedError("Appending to remote files is not yet supported.") + if block_size == 0: + return HfFileSystemStreamFile(self, path, mode=mode, revision=revision, **kwargs) + else: + return HfFileSystemFile(self, path, mode=mode, revision=revision, **kwargs) + + def _rm(self, path: str, revision: Optional[str] = None, **kwargs) -> None: + resolved_path = self.resolve_path(path, revision=revision) + self._api.delete_file( + path_in_repo=resolved_path.path_in_repo, + repo_id=resolved_path.repo_id, + token=self.token, + repo_type=resolved_path.repo_type, + revision=resolved_path.revision, + commit_message=kwargs.get("commit_message"), + commit_description=kwargs.get("commit_description"), + ) + self.invalidate_cache(path=resolved_path.unresolve()) + + def rm( + self, + path: str, + recursive: bool = False, + maxdepth: Optional[int] = None, + revision: Optional[str] = None, + **kwargs, + ) -> None: + """ + Delete files from a repository. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.rm). + + > [!WARNING] + > Note: When possible, use `HfApi.delete_file()` for better performance. + + Args: + path (`str`): + Path to delete. + recursive (`bool`, *optional*): + If True, delete directory and all its contents. Defaults to False. + maxdepth (`int`, *optional*): + Maximum number of subdirectories to visit when deleting recursively. + revision (`str`, *optional*): + The git revision to delete from. 
+ + """ + resolved_path = self.resolve_path(path, revision=revision) + paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth, revision=revision) + paths_in_repo = [self.resolve_path(path).path_in_repo for path in paths if not self.isdir(path)] + operations = [CommitOperationDelete(path_in_repo=path_in_repo) for path_in_repo in paths_in_repo] + commit_message = f"Delete {path} " + commit_message += "recursively " if recursive else "" + commit_message += f"up to depth {maxdepth} " if maxdepth is not None else "" + # TODO: use `commit_description` to list all the deleted paths? + self._api.create_commit( + repo_id=resolved_path.repo_id, + repo_type=resolved_path.repo_type, + token=self.token, + operations=operations, + revision=resolved_path.revision, + commit_message=kwargs.get("commit_message", commit_message), + commit_description=kwargs.get("commit_description"), + ) + self.invalidate_cache(path=resolved_path.unresolve()) + + def ls( + self, path: str, detail: bool = True, refresh: bool = False, revision: Optional[str] = None, **kwargs + ) -> list[Union[str, dict[str, Any]]]: + """ + List the contents of a directory. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.ls). + + > [!WARNING] + > Note: When possible, use `HfApi.list_repo_tree()` for better performance. + + Args: + path (`str`): + Path to the directory. + detail (`bool`, *optional*): + If True, returns a list of dictionaries containing file information. If False, + returns a list of file paths. Defaults to True. + refresh (`bool`, *optional*): + If True, bypass the cache and fetch the latest data. Defaults to False. + revision (`str`, *optional*): + The git revision to list from. + + Returns: + `list[Union[str, dict[str, Any]]]`: List of file paths (if detail=False) or list of file information + dictionaries (if detail=True). 
+ """ + resolved_path = self.resolve_path(path, revision=revision) + path = resolved_path.unresolve() + try: + out = self._ls_tree(path, refresh=refresh, revision=revision, **kwargs) + except EntryNotFoundError: + # Path could be a file + if not resolved_path.path_in_repo: + _raise_file_not_found(path, None) + out = self._ls_tree(self._parent(path), refresh=refresh, revision=revision, **kwargs) + out = [o for o in out if o["name"] == path] + if len(out) == 0: + _raise_file_not_found(path, None) + return out if detail else [o["name"] for o in out] + + def _ls_tree( + self, + path: str, + recursive: bool = False, + refresh: bool = False, + revision: Optional[str] = None, + expand_info: Optional[bool] = None, + maxdepth: Optional[int] = None, + ): + expand_info = ( + expand_info if expand_info is not None else (self.expand_info if self.expand_info is not None else False) + ) + resolved_path = self.resolve_path(path, revision=revision) + path = resolved_path.unresolve() + root_path = HfFileSystemResolvedPath( + resolved_path.repo_type, + resolved_path.repo_id, + resolved_path.revision, + path_in_repo="", + _raw_revision=resolved_path._raw_revision, + ).unresolve() + + out = [] + if path in self.dircache and not refresh: + cached_path_infos = self.dircache[path] + out.extend(cached_path_infos) + dirs_not_in_dircache = [] + if recursive: + # Use BFS to traverse the cache and build the "recursive "output + # (The Hub uses a so-called "tree first" strategy for the tree endpoint but we sort the output to follow the spec so the result is (eventually) the same) + depth = 2 + dirs_to_visit = deque( + [(depth, path_info) for path_info in cached_path_infos if path_info["type"] == "directory"] + ) + while dirs_to_visit: + depth, dir_info = dirs_to_visit.popleft() + if maxdepth is None or depth <= maxdepth: + if dir_info["name"] not in self.dircache: + dirs_not_in_dircache.append(dir_info["name"]) + else: + cached_path_infos = self.dircache[dir_info["name"]] + out.extend(cached_path_infos) + dirs_to_visit.extend( + [ + (depth + 1, path_info) + for path_info in cached_path_infos + if path_info["type"] == "directory" + ] + ) + + dirs_not_expanded = [] + if expand_info: + # Check if there are directories with non-expanded entries + dirs_not_expanded = [self._parent(o["name"]) for o in out if o["last_commit"] is None] + + if (recursive and dirs_not_in_dircache) or (expand_info and dirs_not_expanded): + # If the dircache is incomplete, find the common path of the missing and non-expanded entries + # and extend the output with the result of `_ls_tree(common_path, recursive=True)` + common_prefix = os.path.commonprefix(dirs_not_in_dircache + dirs_not_expanded) + # Get the parent directory if the common prefix itself is not a directory + common_path = ( + common_prefix.rstrip("/") + if common_prefix.endswith("/") + or common_prefix == root_path + or common_prefix in chain(dirs_not_in_dircache, dirs_not_expanded) + else self._parent(common_prefix) + ) + if maxdepth is not None: + common_path_depth = common_path[len(path) :].count("/") + maxdepth -= common_path_depth + out = [o for o in out if not o["name"].startswith(common_path + "/")] + for cached_path in list(self.dircache): + if cached_path.startswith(common_path + "/"): + self.dircache.pop(cached_path, None) + self.dircache.pop(common_path, None) + out.extend( + self._ls_tree( + common_path, + recursive=recursive, + refresh=True, + revision=revision, + expand_info=expand_info, + maxdepth=maxdepth, + ) + ) + else: + tree = self._api.list_repo_tree( + 
resolved_path.repo_id, + resolved_path.path_in_repo, + recursive=recursive, + expand=expand_info, + revision=resolved_path.revision, + repo_type=resolved_path.repo_type, + ) + for path_info in tree: + cache_path = root_path + "/" + path_info.path + if isinstance(path_info, RepoFile): + cache_path_info = { + "name": cache_path, + "size": path_info.size, + "type": "file", + "blob_id": path_info.blob_id, + "lfs": path_info.lfs, + "last_commit": path_info.last_commit, + "security": path_info.security, + } + else: + cache_path_info = { + "name": cache_path, + "size": 0, + "type": "directory", + "tree_id": path_info.tree_id, + "last_commit": path_info.last_commit, + } + parent_path = self._parent(cache_path_info["name"]) + self.dircache.setdefault(parent_path, []).append(cache_path_info) + depth = cache_path[len(path) :].count("/") + if maxdepth is None or depth <= maxdepth: + out.append(cache_path_info) + return out + + def walk(self, path: str, *args, **kwargs) -> Iterator[tuple[str, list[str], list[str]]]: + """ + Return all files below the given path. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.walk). + + Args: + path (`str`): + Root path to list files from. + + Returns: + `Iterator[tuple[str, list[str], list[str]]]`: An iterator of (path, list of directory names, list of file names) tuples. + """ + path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve() + yield from super().walk(path, *args, **kwargs) + + def glob(self, path: str, maxdepth: Optional[int] = None, **kwargs) -> list[str]: + """ + Find files by glob-matching. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.glob). + + Args: + path (`str`): + Path pattern to match. + + Returns: + `list[str]`: List of paths matching the pattern. + """ + path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve() + return super().glob(path, maxdepth=maxdepth, **kwargs) + + def find( + self, + path: str, + maxdepth: Optional[int] = None, + withdirs: bool = False, + detail: bool = False, + refresh: bool = False, + revision: Optional[str] = None, + **kwargs, + ) -> Union[list[str], dict[str, dict[str, Any]]]: + """ + List all files below path. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.find). + + Args: + path (`str`): + Root path to list files from. + maxdepth (`int`, *optional*): + Maximum depth to descend into subdirectories. + withdirs (`bool`, *optional*): + Include directory paths in the output. Defaults to False. + detail (`bool`, *optional*): + If True, returns a dict mapping paths to file information. Defaults to False. + refresh (`bool`, *optional*): + If True, bypass the cache and fetch the latest data. Defaults to False. + revision (`str`, *optional*): + The git revision to list from. + + Returns: + `Union[list[str], dict[str, dict[str, Any]]]`: List of paths or dict of file information. 
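+
+        Example (illustrative sketch; repo and file names are hypothetical):
+
+        ```python
+        >>> from huggingface_hub import HfFileSystem
+        >>> fs = HfFileSystem()
+        >>> fs.find("my-username/my-model", detail=False)
+        ['my-username/my-model/README.md', 'my-username/my-model/pytorch_model.bin']
+        ```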
+ """ + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + resolved_path = self.resolve_path(path, revision=revision) + path = resolved_path.unresolve() + try: + out = self._ls_tree( + path, recursive=True, refresh=refresh, revision=resolved_path.revision, maxdepth=maxdepth, **kwargs + ) + except EntryNotFoundError: + # Path could be a file + try: + if self.info(path, revision=revision, **kwargs)["type"] == "file": + out = {path: {}} + else: + out = {} + except FileNotFoundError: + out = {} + else: + if not withdirs: + out = [o for o in out if o["type"] != "directory"] + else: + # If `withdirs=True`, include the directory itself to be consistent with the spec + path_info = self.info(path, revision=resolved_path.revision, **kwargs) + out = [path_info] + out if path_info["type"] == "directory" else out + out = {o["name"]: o for o in out} + names = sorted(out) + if not detail: + return names + else: + return {name: out[name] for name in names} + + def cp_file(self, path1: str, path2: str, revision: Optional[str] = None, **kwargs) -> None: + """ + Copy a file within or between repositories. + + > [!WARNING] + > Note: When possible, use `HfApi.upload_file()` for better performance. + + Args: + path1 (`str`): + Source path to copy from. + path2 (`str`): + Destination path to copy to. + revision (`str`, *optional*): + The git revision to copy from. + + """ + resolved_path1 = self.resolve_path(path1, revision=revision) + resolved_path2 = self.resolve_path(path2, revision=revision) + + same_repo = ( + resolved_path1.repo_type == resolved_path2.repo_type and resolved_path1.repo_id == resolved_path2.repo_id + ) + + if same_repo: + commit_message = f"Copy {path1} to {path2}" + self._api.create_commit( + repo_id=resolved_path1.repo_id, + repo_type=resolved_path1.repo_type, + revision=resolved_path2.revision, + commit_message=kwargs.get("commit_message", commit_message), + commit_description=kwargs.get("commit_description", ""), + operations=[ + CommitOperationCopy( + src_path_in_repo=resolved_path1.path_in_repo, + path_in_repo=resolved_path2.path_in_repo, + src_revision=resolved_path1.revision, + ) + ], + ) + else: + with self.open(path1, "rb", revision=resolved_path1.revision) as f: + content = f.read() + commit_message = f"Copy {path1} to {path2}" + self._api.upload_file( + path_or_fileobj=content, + path_in_repo=resolved_path2.path_in_repo, + repo_id=resolved_path2.repo_id, + token=self.token, + repo_type=resolved_path2.repo_type, + revision=resolved_path2.revision, + commit_message=kwargs.get("commit_message", commit_message), + commit_description=kwargs.get("commit_description"), + ) + self.invalidate_cache(path=resolved_path1.unresolve()) + self.invalidate_cache(path=resolved_path2.unresolve()) + + def modified(self, path: str, **kwargs) -> datetime: + """ + Get the last modified time of a file. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.modified). + + Args: + path (`str`): + Path to the file. + + Returns: + `datetime`: Last commit date of the file. + """ + info = self.info(path, **{**kwargs, "expand_info": True}) # type: ignore + return info["last_commit"]["date"] + + def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, **kwargs) -> dict[str, Any]: + """ + Get information about a file or directory. 
+ + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.info). + + > [!WARNING] + > Note: When possible, use `HfApi.get_paths_info()` or `HfApi.repo_info()` for better performance. + + Args: + path (`str`): + Path to get info for. + refresh (`bool`, *optional*): + If True, bypass the cache and fetch the latest data. Defaults to False. + revision (`str`, *optional*): + The git revision to get info from. + + Returns: + `dict[str, Any]`: Dictionary containing file information (type, size, commit info, etc.). + + """ + resolved_path = self.resolve_path(path, revision=revision) + path = resolved_path.unresolve() + expand_info = kwargs.get( + "expand_info", self.expand_info if self.expand_info is not None else False + ) # don't expose it as a parameter in the public API to follow the spec + if not resolved_path.path_in_repo: + # Path is the root directory + out = { + "name": path, + "size": 0, + "type": "directory", + "last_commit": None, + } + if expand_info: + last_commit = self._api.list_repo_commits( + resolved_path.repo_id, repo_type=resolved_path.repo_type, revision=resolved_path.revision + )[-1] + out = { + **out, + "tree_id": None, # TODO: tree_id of the root directory? + "last_commit": LastCommitInfo( + oid=last_commit.commit_id, title=last_commit.title, date=last_commit.created_at + ), + } + else: + out = None + parent_path = self._parent(path) + if not expand_info and parent_path not in self.dircache: + # Fill the cache with cheap call + self.ls(parent_path) + if parent_path in self.dircache: + # Check if the path is in the cache + out1 = [o for o in self.dircache[parent_path] if o["name"] == path] + if not out1: + _raise_file_not_found(path, None) + out = out1[0] + if refresh or out is None or (expand_info and out and out["last_commit"] is None): + paths_info = self._api.get_paths_info( + resolved_path.repo_id, + resolved_path.path_in_repo, + expand=expand_info, + revision=resolved_path.revision, + repo_type=resolved_path.repo_type, + ) + if not paths_info: + _raise_file_not_found(path, None) + path_info = paths_info[0] + root_path = HfFileSystemResolvedPath( + resolved_path.repo_type, + resolved_path.repo_id, + resolved_path.revision, + path_in_repo="", + _raw_revision=resolved_path._raw_revision, + ).unresolve() + if isinstance(path_info, RepoFile): + out = { + "name": root_path + "/" + path_info.path, + "size": path_info.size, + "type": "file", + "blob_id": path_info.blob_id, + "lfs": path_info.lfs, + "last_commit": path_info.last_commit, + "security": path_info.security, + } + else: + out = { + "name": root_path + "/" + path_info.path, + "size": 0, + "type": "directory", + "tree_id": path_info.tree_id, + "last_commit": path_info.last_commit, + } + if not expand_info: + out = {k: out[k] for k in ["name", "size", "type"]} + assert out is not None + return out + + def exists(self, path, **kwargs): + """ + Check if a file exists. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.exists). + + > [!WARNING] + > Note: When possible, use `HfApi.file_exists()` for better performance. + + Args: + path (`str`): + Path to check. + + Returns: + `bool`: True if file exists, False otherwise. + """ + try: + if kwargs.get("refresh", False): + self.invalidate_cache(path) + + self.info(path, **kwargs) + return True + except: # noqa: E722 + return False + + def isdir(self, path): + """ + Check if a path is a directory. 
+ + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.isdir). + + Args: + path (`str`): + Path to check. + + Returns: + `bool`: True if path is a directory, False otherwise. + """ + try: + return self.info(path)["type"] == "directory" + except OSError: + return False + + def isfile(self, path): + """ + Check if a path is a file. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.isfile). + + Args: + path (`str`): + Path to check. + + Returns: + `bool`: True if path is a file, False otherwise. + """ + try: + return self.info(path)["type"] == "file" + except: # noqa: E722 + return False + + def url(self, path: str) -> str: + """ + Get the HTTP URL of the given path. + + Args: + path (`str`): + Path to get URL for. + + Returns: + `str`: HTTP URL to access the file or directory on the Hub. + """ + resolved_path = self.resolve_path(path) + url = hf_hub_url( + resolved_path.repo_id, + resolved_path.path_in_repo, + repo_type=resolved_path.repo_type, + revision=resolved_path.revision, + endpoint=self.endpoint, + ) + if self.isdir(path): + url = url.replace("/resolve/", "/tree/", 1) + return url + + def get_file(self, rpath, lpath, callback=_DEFAULT_CALLBACK, outfile=None, **kwargs) -> None: + """ + Copy single remote file to local. + + > [!WARNING] + > Note: When possible, use `HfApi.hf_hub_download()` for better performance. + + Args: + rpath (`str`): + Remote path to download from. + lpath (`str`): + Local path to download to. + callback (`Callback`, *optional*): + Optional callback to track download progress. Defaults to no callback. + outfile (`IO`, *optional*): + Optional file-like object to write to. If provided, `lpath` is ignored. + + """ + revision = kwargs.get("revision") + unhandled_kwargs = set(kwargs.keys()) - {"revision"} + if not isinstance(callback, (NoOpCallback, TqdmCallback)) or len(unhandled_kwargs) > 0: + # for now, let's not handle custom callbacks + # and let's not handle custom kwargs + return super().get_file(rpath, lpath, callback=callback, outfile=outfile, **kwargs) + + # Taken from https://github.com/fsspec/filesystem_spec/blob/47b445ae4c284a82dd15e0287b1ffc410e8fc470/fsspec/spec.py#L883 + if isfilelike(lpath): + outfile = lpath + elif self.isdir(rpath): + os.makedirs(lpath, exist_ok=True) + return None + + if isinstance(lpath, (str, Path)): # otherwise, let's assume it's a file-like object + os.makedirs(os.path.dirname(lpath), exist_ok=True) + + # Open file if not already open + close_file = False + if outfile is None: + outfile = open(lpath, "wb") + close_file = True + initial_pos = outfile.tell() + + # Custom implementation of `get_file` to use `http_get`. 
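+        # The code below resolves the remote path, pre-sizes the progress callback using `info()`,
+        # streams the file with `http_get` into `outfile`, then restores the original file position.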
+ resolve_remote_path = self.resolve_path(rpath, revision=revision) + expected_size = self.info(rpath, revision=revision)["size"] + callback.set_size(expected_size) + try: + http_get( + url=hf_hub_url( + repo_id=resolve_remote_path.repo_id, + revision=resolve_remote_path.revision, + filename=resolve_remote_path.path_in_repo, + repo_type=resolve_remote_path.repo_type, + endpoint=self.endpoint, + ), + temp_file=outfile, # type: ignore[arg-type] + displayed_filename=rpath, + expected_size=expected_size, + resume_size=0, + headers=self._api._build_hf_headers(), + _tqdm_bar=callback.tqdm if isinstance(callback, TqdmCallback) else None, + ) + outfile.seek(initial_pos) + finally: + # Close file only if we opened it ourselves + if close_file: + outfile.close() + + @property + def transaction(self): + """A context within which files are committed together upon exit + + Requires the file class to implement `.commit()` and `.discard()` + for the normal and exception cases. + """ + # Taken from https://github.com/fsspec/filesystem_spec/blob/3fbb6fee33b46cccb015607630843dea049d3243/fsspec/spec.py#L231 + # See https://github.com/huggingface/huggingface_hub/issues/1733 + raise NotImplementedError("Transactional commits are not supported.") + + def start_transaction(self): + """Begin write transaction for deferring files, non-context version""" + # Taken from https://github.com/fsspec/filesystem_spec/blob/3fbb6fee33b46cccb015607630843dea049d3243/fsspec/spec.py#L241 + # See https://github.com/huggingface/huggingface_hub/issues/1733 + raise NotImplementedError("Transactional commits are not supported.") + + def __reduce__(self): + # re-populate the instance cache at HfFileSystem._cache and re-populate the state of every instance + return make_instance, ( + type(self), + self.storage_args, + self.storage_options, + self._get_instance_state(), + ) + + def _get_instance_state(self): + return { + "dircache": deepcopy(self.dircache), + "_repo_and_revision_exists_cache": deepcopy(self._repo_and_revision_exists_cache), + } + + +class HfFileSystemFile(fsspec.spec.AbstractBufferedFile): + def __init__(self, fs: HfFileSystem, path: str, revision: Optional[str] = None, **kwargs): + try: + self.resolved_path = fs.resolve_path(path, revision=revision) + except FileNotFoundError as e: + if "w" in kwargs.get("mode", ""): + raise FileNotFoundError( + f"{e}.\nMake sure the repository and revision exist before writing data." + ) from e + raise + super().__init__(fs, self.resolved_path.unresolve(), **kwargs) + self.fs: HfFileSystem + + def __del__(self): + if not hasattr(self, "resolved_path"): + # Means that the constructor failed. Nothing to do. 
+ return + return super().__del__() + + def _fetch_range(self, start: int, end: int) -> bytes: + headers = { + "range": f"bytes={start}-{end - 1}", + **self.fs._api._build_hf_headers(), + } + url = hf_hub_url( + repo_id=self.resolved_path.repo_id, + revision=self.resolved_path.revision, + filename=self.resolved_path.path_in_repo, + repo_type=self.resolved_path.repo_type, + endpoint=self.fs.endpoint, + ) + r = http_backoff("GET", url, headers=headers, timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT) + hf_raise_for_status(r) + return r.content + + def _initiate_upload(self) -> None: + self.temp_file = tempfile.NamedTemporaryFile(prefix="hffs-", delete=False) + + def _upload_chunk(self, final: bool = False) -> None: + self.buffer.seek(0) + block = self.buffer.read() + self.temp_file.write(block) + if final: + self.temp_file.close() + self.fs._api.upload_file( + path_or_fileobj=self.temp_file.name, + path_in_repo=self.resolved_path.path_in_repo, + repo_id=self.resolved_path.repo_id, + token=self.fs.token, + repo_type=self.resolved_path.repo_type, + revision=self.resolved_path.revision, + commit_message=self.kwargs.get("commit_message"), + commit_description=self.kwargs.get("commit_description"), + ) + os.remove(self.temp_file.name) + self.fs.invalidate_cache( + path=self.resolved_path.unresolve(), + ) + + def read(self, length=-1): + """Read remote file. + + If `length` is not provided or is -1, the entire file is downloaded and read. On POSIX systems the file is + loaded in memory directly. Otherwise, the file is downloaded to a temporary file and read from there. + """ + if self.mode == "rb" and (length is None or length == -1) and self.loc == 0: + with self.fs.open(self.path, "rb", block_size=0) as f: # block_size=0 enables fast streaming + out = f.read() + self.loc += len(out) + return out + return super().read(length) + + def url(self) -> str: + return self.fs.url(self.path) + + +class HfFileSystemStreamFile(fsspec.spec.AbstractBufferedFile): + def __init__( + self, + fs: HfFileSystem, + path: str, + mode: str = "rb", + revision: Optional[str] = None, + block_size: int = 0, + cache_type: str = "none", + **kwargs, + ): + if block_size != 0: + raise ValueError(f"HfFileSystemStreamFile only supports block_size=0 but got {block_size}") + if cache_type != "none": + raise ValueError(f"HfFileSystemStreamFile only supports cache_type='none' but got {cache_type}") + if "w" in mode: + raise ValueError(f"HfFileSystemStreamFile only supports reading but got mode='{mode}'") + try: + self.resolved_path = fs.resolve_path(path, revision=revision) + except FileNotFoundError as e: + if "w" in kwargs.get("mode", ""): + raise FileNotFoundError( + f"{e}.\nMake sure the repository and revision exist before writing data." + ) from e + # avoid an unnecessary .info() call to instantiate .details + self.details = {"name": self.resolved_path.unresolve(), "size": None} + super().__init__( + fs, self.resolved_path.unresolve(), mode=mode, block_size=block_size, cache_type=cache_type, **kwargs + ) + self.response: Optional[httpx.Response] = None + self.fs: HfFileSystem + self._exit_stack = ExitStack() + + def seek(self, loc: int, whence: int = 0): + if loc == 0 and whence == 1: + return + if loc == self.loc and whence == 0: + return + raise ValueError("Cannot seek streaming HF file") + + def read(self, length: int = -1): + """Read the remote file. + + If the file is already open, we reuse the connection. + Otherwise, open a new connection and read from it. + + If reading the stream fails, we retry with a new connection. 
+ """ + if self.response is None: + self._open_connection() + + retried_once = False + while True: + try: + if self.response is None: + return b"" # Already read the entire file + out = _partial_read(self.response, length) + self.loc += len(out) + return out + except Exception: + if self.response is not None: + self.response.close() + if retried_once: # Already retried once, give up + raise + # First failure, retry with range header + self._open_connection() + retried_once = True + + def url(self) -> str: + return self.fs.url(self.path) + + def __del__(self): + if not hasattr(self, "resolved_path"): + # Means that the constructor failed. Nothing to do. + return + self._exit_stack.close() + return super().__del__() + + def __reduce__(self): + return reopen, (self.fs, self.path, self.mode, self.blocksize, self.cache.name) + + def _open_connection(self): + """Open a connection to the remote file.""" + url = hf_hub_url( + repo_id=self.resolved_path.repo_id, + revision=self.resolved_path.revision, + filename=self.resolved_path.path_in_repo, + repo_type=self.resolved_path.repo_type, + endpoint=self.fs.endpoint, + ) + headers = self.fs._api._build_hf_headers() + if self.loc > 0: + headers["Range"] = f"bytes={self.loc}-" + self.response = self._exit_stack.enter_context( + http_stream_backoff( + "GET", + url, + headers=headers, + timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT, + ) + ) + + try: + hf_raise_for_status(self.response) + except HfHubHTTPError as e: + if e.response.status_code == 416: + # Range not satisfiable => means that we have already read the entire file + self.response = None + return + raise + + +def safe_revision(revision: str) -> str: + return revision if SPECIAL_REFS_REVISION_REGEX.match(revision) else safe_quote(revision) + + +def safe_quote(s: str) -> str: + return quote(s, safe="") + + +def _raise_file_not_found(path: str, err: Optional[Exception]) -> NoReturn: + msg = path + if isinstance(err, RepositoryNotFoundError): + msg = f"{path} (repository not found)" + elif isinstance(err, RevisionNotFoundError): + msg = f"{path} (revision not found)" + elif isinstance(err, HFValidationError): + msg = f"{path} (invalid repository id)" + raise FileNotFoundError(msg) from err + + +def reopen(fs: HfFileSystem, path: str, mode: str, block_size: int, cache_type: str): + return fs.open(path, mode=mode, block_size=block_size, cache_type=cache_type) + + +def _partial_read(response: httpx.Response, length: int = -1) -> bytes: + """ + Read up to `length` bytes from a streamed response. + If length == -1, read until EOF. 
+ """ + buf = bytearray() + if length < -1: + raise ValueError("length must be -1 or >= 0") + if length == 0: + return b"" + if length == -1: + for chunk in response.iter_bytes(): + buf.extend(chunk) + return bytes(buf) + + for chunk in response.iter_bytes(chunk_size=length): + buf.extend(chunk) + if len(buf) >= length: + return bytes(buf[:length]) + + return bytes(buf) # may be < length if response ended + + +def make_instance(cls, args, kwargs, instance_state): + fs = cls(*args, **kwargs) + for attr, state_value in instance_state.items(): + setattr(fs, attr, state_value) + return fs + + +hffs = HfFileSystem() diff --git a/env/lib/python3.13/site-packages/huggingface_hub/repocard.py b/env/lib/python3.13/site-packages/huggingface_hub/repocard.py new file mode 100644 index 0000000000000000000000000000000000000000..683162c9a666659e8c35c7a9e3c57c824c0e3e83 --- /dev/null +++ b/env/lib/python3.13/site-packages/huggingface_hub/repocard.py @@ -0,0 +1,826 @@ +import os +import re +from pathlib import Path +from typing import Any, Literal, Optional, Union + +import yaml + +from huggingface_hub.file_download import hf_hub_download +from huggingface_hub.hf_api import upload_file +from huggingface_hub.repocard_data import ( + CardData, + DatasetCardData, + EvalResult, + ModelCardData, + SpaceCardData, + eval_results_to_model_index, + model_index_to_eval_results, +) +from huggingface_hub.utils import HfHubHTTPError, get_session, hf_raise_for_status, is_jinja_available, yaml_dump + +from . import constants +from .errors import EntryNotFoundError +from .utils import SoftTemporaryDirectory, logging, validate_hf_hub_args + + +logger = logging.get_logger(__name__) + + +TEMPLATE_MODELCARD_PATH = Path(__file__).parent / "templates" / "modelcard_template.md" +TEMPLATE_DATASETCARD_PATH = Path(__file__).parent / "templates" / "datasetcard_template.md" + +# exact same regex as in the Hub server. Please keep in sync. +# See https://github.com/huggingface/moon-landing/blob/main/server/lib/ViewMarkdown.ts#L18 +REGEX_YAML_BLOCK = re.compile(r"^(\s*---[\r\n]+)([\S\s]*?)([\r\n]+---(\r\n|\n|$))") + + +class RepoCard: + card_data_class = CardData + default_template_path = TEMPLATE_MODELCARD_PATH + repo_type = "model" + + def __init__(self, content: str, ignore_metadata_errors: bool = False): + """Initialize a RepoCard from string content. The content should be a + Markdown file with a YAML block at the beginning and a Markdown body. + + Args: + content (`str`): The content of the Markdown file. + + Example: + ```python + >>> from huggingface_hub.repocard import RepoCard + >>> text = ''' + ... --- + ... language: en + ... license: mit + ... --- + ... + ... # My repo + ... ''' + >>> card = RepoCard(text) + >>> card.data.to_dict() + {'language': 'en', 'license': 'mit'} + >>> card.text + '\\n# My repo\\n' + + ``` + > [!TIP] + > Raises the following error: + > + > - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) + > when the content of the repo card metadata is not a dictionary. + """ + + # Set the content of the RepoCard, as well as underlying .data and .text attributes. + # See the `content` property setter for more details. 
+ self.ignore_metadata_errors = ignore_metadata_errors + self.content = content + + @property + def content(self): + """The content of the RepoCard, including the YAML block and the Markdown body.""" + line_break = _detect_line_ending(self._content) or "\n" + return f"---{line_break}{self.data.to_yaml(line_break=line_break, original_order=self._original_order)}{line_break}---{line_break}{self.text}" + + @content.setter + def content(self, content: str): + """Set the content of the RepoCard.""" + self._content = content + + match = REGEX_YAML_BLOCK.search(content) + if match: + # Metadata found in the YAML block + yaml_block = match.group(2) + self.text = content[match.end() :] + data_dict = yaml.safe_load(yaml_block) + + if data_dict is None: + data_dict = {} + + # The YAML block's data should be a dictionary + if not isinstance(data_dict, dict): + raise ValueError("repo card metadata block should be a dict") + else: + # Model card without metadata... create empty metadata + logger.warning("Repo card metadata block was not found. Setting CardData to empty.") + data_dict = {} + self.text = content + + self.data = self.card_data_class(**data_dict, ignore_metadata_errors=self.ignore_metadata_errors) + self._original_order = list(data_dict.keys()) + + def __str__(self): + return self.content + + def save(self, filepath: Union[Path, str]): + r"""Save a RepoCard to a file. + + Args: + filepath (`Union[Path, str]`): Filepath to the markdown file to save. + + Example: + ```python + >>> from huggingface_hub.repocard import RepoCard + >>> card = RepoCard("---\nlanguage: en\n---\n# This is a test repo card") + >>> card.save("/tmp/test.md") + + ``` + """ + filepath = Path(filepath) + filepath.parent.mkdir(parents=True, exist_ok=True) + # Preserve newlines as in the existing file. + with open(filepath, mode="w", newline="", encoding="utf-8") as f: + f.write(str(self)) + + @classmethod + def load( + cls, + repo_id_or_path: Union[str, Path], + repo_type: Optional[str] = None, + token: Optional[str] = None, + ignore_metadata_errors: bool = False, + ): + """Initialize a RepoCard from a Hugging Face Hub repo's README.md or a local filepath. + + Args: + repo_id_or_path (`Union[str, Path]`): + The repo ID associated with a Hugging Face Hub repo or a local filepath. + repo_type (`str`, *optional*): + The type of Hugging Face repo to push to. Defaults to None, which will use "model". Other options + are "dataset" and "space". Not used when loading from a local filepath. If this is called from a child + class, the default value will be the child class's `repo_type`. + token (`str`, *optional*): + Authentication token, obtained with `huggingface_hub.HfApi.login` method. Will default to the stored token. + ignore_metadata_errors (`str`): + If True, errors while parsing the metadata section will be ignored. Some information might be lost during + the process. Use it at your own risk. + + Returns: + [`huggingface_hub.repocard.RepoCard`]: The RepoCard (or subclass) initialized from the repo's + README.md file or filepath. 
+ + Example: + ```python + >>> from huggingface_hub.repocard import RepoCard + >>> card = RepoCard.load("nateraw/food") + >>> assert card.data.tags == ["generated_from_trainer", "image-classification", "pytorch"] + + ``` + """ + + if Path(repo_id_or_path).is_file(): + card_path = Path(repo_id_or_path) + elif isinstance(repo_id_or_path, str): + card_path = Path( + hf_hub_download( + repo_id_or_path, + constants.REPOCARD_NAME, + repo_type=repo_type or cls.repo_type, + token=token, + ) + ) + else: + raise ValueError(f"Cannot load RepoCard: path not found on disk ({repo_id_or_path}).") + + # Preserve newlines in the existing file. + with card_path.open(mode="r", newline="", encoding="utf-8") as f: + return cls(f.read(), ignore_metadata_errors=ignore_metadata_errors) + + def validate(self, repo_type: Optional[str] = None): + """Validates card against Hugging Face Hub's card validation logic. + Using this function requires access to the internet, so it is only called + internally by [`huggingface_hub.repocard.RepoCard.push_to_hub`]. + + Args: + repo_type (`str`, *optional*, defaults to "model"): + The type of Hugging Face repo to push to. Options are "model", "dataset", and "space". + If this function is called from a child class, the default will be the child class's `repo_type`. + + > [!TIP] + > Raises the following errors: + > + > - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) + > if the card fails validation checks. + > - [`HTTPError`](https://requests.readthedocs.io/en/latest/api/#requests.HTTPError) + > if the request to the Hub API fails for any other reason. + """ + + # If repo type is provided, otherwise, use the repo type of the card. + repo_type = repo_type or self.repo_type + + body = { + "repoType": repo_type, + "content": str(self), + } + headers = {"Accept": "text/plain"} + + try: + response = get_session().post("https://huggingface.co/api/validate-yaml", json=body, headers=headers) + hf_raise_for_status(response) + except HfHubHTTPError as exc: + if response.status_code == 400: + raise ValueError(response.text) + else: + raise exc + + def push_to_hub( + self, + repo_id: str, + token: Optional[str] = None, + repo_type: Optional[str] = None, + commit_message: Optional[str] = None, + commit_description: Optional[str] = None, + revision: Optional[str] = None, + create_pr: Optional[bool] = None, + parent_commit: Optional[str] = None, + ): + """Push a RepoCard to a Hugging Face Hub repo. + + Args: + repo_id (`str`): + The repo ID of the Hugging Face Hub repo to push to. Example: "nateraw/food". + token (`str`, *optional*): + Authentication token, obtained with `huggingface_hub.HfApi.login` method. Will default to + the stored token. + repo_type (`str`, *optional*, defaults to "model"): + The type of Hugging Face repo to push to. Options are "model", "dataset", and "space". If this + function is called by a child class, it will default to the child class's `repo_type`. + commit_message (`str`, *optional*): + The summary / title / first line of the generated commit. + commit_description (`str`, *optional*) + The description of the generated commit. + revision (`str`, *optional*): + The git revision to commit from. Defaults to the head of the `"main"` branch. + create_pr (`bool`, *optional*): + Whether or not to create a Pull Request with this commit. Defaults to `False`. + parent_commit (`str`, *optional*): + The OID / SHA of the parent commit, as a hexadecimal string. Shorthands (7 first characters) are also supported. 
+ If specified and `create_pr` is `False`, the commit will fail if `revision` does not point to `parent_commit`. + If specified and `create_pr` is `True`, the pull request will be created from `parent_commit`. + Specifying `parent_commit` ensures the repo has not changed before committing the changes, and can be + especially useful if the repo is updated / committed too concurrently. + Returns: + `str`: URL of the commit which updated the card metadata. + """ + + # If repo type is provided, otherwise, use the repo type of the card. + repo_type = repo_type or self.repo_type + + # Validate card before pushing to hub + self.validate(repo_type=repo_type) + + with SoftTemporaryDirectory() as tmpdir: + tmp_path = Path(tmpdir) / constants.REPOCARD_NAME + tmp_path.write_text(str(self), encoding="utf-8") + url = upload_file( + path_or_fileobj=str(tmp_path), + path_in_repo=constants.REPOCARD_NAME, + repo_id=repo_id, + token=token, + repo_type=repo_type, + commit_message=commit_message, + commit_description=commit_description, + create_pr=create_pr, + revision=revision, + parent_commit=parent_commit, + ) + return url + + @classmethod + def from_template( + cls, + card_data: CardData, + template_path: Optional[str] = None, + template_str: Optional[str] = None, + **template_kwargs, + ): + """Initialize a RepoCard from a template. By default, it uses the default template. + + Templates are Jinja2 templates that can be customized by passing keyword arguments. + + Args: + card_data (`huggingface_hub.CardData`): + A huggingface_hub.CardData instance containing the metadata you want to include in the YAML + header of the repo card on the Hugging Face Hub. + template_path (`str`, *optional*): + A path to a markdown file with optional Jinja template variables that can be filled + in with `template_kwargs`. Defaults to the default template. + + Returns: + [`huggingface_hub.repocard.RepoCard`]: A RepoCard instance with the specified card data and content from the + template. + """ + if is_jinja_available(): + import jinja2 + else: + raise ImportError( + "Using RepoCard.from_template requires Jinja2 to be installed. Please" + " install it with `pip install Jinja2`." + ) + + kwargs = card_data.to_dict().copy() + kwargs.update(template_kwargs) # Template_kwargs have priority + + if template_path is not None: + template_str = Path(template_path).read_text() + if template_str is None: + template_str = Path(cls.default_template_path).read_text() + template = jinja2.Template(template_str) + content = template.render(card_data=card_data.to_yaml(), **kwargs) + return cls(content) + + +class ModelCard(RepoCard): + card_data_class = ModelCardData # type: ignore[assignment] + default_template_path = TEMPLATE_MODELCARD_PATH + repo_type = "model" + + @classmethod + def from_template( # type: ignore # violates Liskov property but easier to use + cls, + card_data: ModelCardData, + template_path: Optional[str] = None, + template_str: Optional[str] = None, + **template_kwargs, + ): + """Initialize a ModelCard from a template. By default, it uses the default template, which can be found here: + https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/templates/modelcard_template.md + + Templates are Jinja2 templates that can be customized by passing keyword arguments. + + Args: + card_data (`huggingface_hub.ModelCardData`): + A huggingface_hub.ModelCardData instance containing the metadata you want to include in the YAML + header of the model card on the Hugging Face Hub. 
+ template_path (`str`, *optional*): + A path to a markdown file with optional Jinja template variables that can be filled + in with `template_kwargs`. Defaults to the default template. + + Returns: + [`huggingface_hub.ModelCard`]: A ModelCard instance with the specified card data and content from the + template. + + Example: + ```python + >>> from huggingface_hub import ModelCard, ModelCardData, EvalResult + + >>> # Using the Default Template + >>> card_data = ModelCardData( + ... language='en', + ... license='mit', + ... library_name='timm', + ... tags=['image-classification', 'resnet'], + ... datasets=['beans'], + ... metrics=['accuracy'], + ... ) + >>> card = ModelCard.from_template( + ... card_data, + ... model_description='This model does x + y...' + ... ) + + >>> # Including Evaluation Results + >>> card_data = ModelCardData( + ... language='en', + ... tags=['image-classification', 'resnet'], + ... eval_results=[ + ... EvalResult( + ... task_type='image-classification', + ... dataset_type='beans', + ... dataset_name='Beans', + ... metric_type='accuracy', + ... metric_value=0.9, + ... ), + ... ], + ... model_name='my-cool-model', + ... ) + >>> card = ModelCard.from_template(card_data) + + >>> # Using a Custom Template + >>> card_data = ModelCardData( + ... language='en', + ... tags=['image-classification', 'resnet'] + ... ) + >>> card = ModelCard.from_template( + ... card_data=card_data, + ... template_path='./src/huggingface_hub/templates/modelcard_template.md', + ... custom_template_var='custom value', # will be replaced in template if it exists + ... ) + + ``` + """ + return super().from_template(card_data, template_path, template_str, **template_kwargs) + + +class DatasetCard(RepoCard): + card_data_class = DatasetCardData # type: ignore[assignment] + default_template_path = TEMPLATE_DATASETCARD_PATH + repo_type = "dataset" + + @classmethod + def from_template( # type: ignore # violates Liskov property but easier to use + cls, + card_data: DatasetCardData, + template_path: Optional[str] = None, + template_str: Optional[str] = None, + **template_kwargs, + ): + """Initialize a DatasetCard from a template. By default, it uses the default template, which can be found here: + https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/templates/datasetcard_template.md + + Templates are Jinja2 templates that can be customized by passing keyword arguments. + + Args: + card_data (`huggingface_hub.DatasetCardData`): + A huggingface_hub.DatasetCardData instance containing the metadata you want to include in the YAML + header of the dataset card on the Hugging Face Hub. + template_path (`str`, *optional*): + A path to a markdown file with optional Jinja template variables that can be filled + in with `template_kwargs`. Defaults to the default template. + + Returns: + [`huggingface_hub.DatasetCard`]: A DatasetCard instance with the specified card data and content from the + template. + + Example: + ```python + >>> from huggingface_hub import DatasetCard, DatasetCardData + + >>> # Using the Default Template + >>> card_data = DatasetCardData( + ... language='en', + ... license='mit', + ... annotations_creators='crowdsourced', + ... task_categories=['text-classification'], + ... task_ids=['sentiment-classification', 'text-scoring'], + ... multilinguality='monolingual', + ... pretty_name='My Text Classification Dataset', + ... ) + >>> card = DatasetCard.from_template( + ... card_data, + ... pretty_name=card_data.pretty_name, + ... 
) + + >>> # Using a Custom Template + >>> card_data = DatasetCardData( + ... language='en', + ... license='mit', + ... ) + >>> card = DatasetCard.from_template( + ... card_data=card_data, + ... template_path='./src/huggingface_hub/templates/datasetcard_template.md', + ... custom_template_var='custom value', # will be replaced in template if it exists + ... ) + + ``` + """ + return super().from_template(card_data, template_path, template_str, **template_kwargs) + + +class SpaceCard(RepoCard): + card_data_class = SpaceCardData # type: ignore[assignment] + default_template_path = TEMPLATE_MODELCARD_PATH + repo_type = "space" + + +def _detect_line_ending(content: str) -> Literal["\r", "\n", "\r\n", None]: # noqa: F722 + """Detect the line ending of a string. Used by RepoCard to avoid making huge diff on newlines. + + Uses same implementation as in Hub server, keep it in sync. + + Returns: + str: The detected line ending of the string. + """ + cr = content.count("\r") + lf = content.count("\n") + crlf = content.count("\r\n") + if cr + lf == 0: + return None + if crlf == cr and crlf == lf: + return "\r\n" + if cr > lf: + return "\r" + else: + return "\n" + + +def metadata_load(local_path: Union[str, Path]) -> Optional[dict]: + content = Path(local_path).read_text() + match = REGEX_YAML_BLOCK.search(content) + if match: + yaml_block = match.group(2) + data = yaml.safe_load(yaml_block) + if data is None or isinstance(data, dict): + return data + raise ValueError("repo card metadata block should be a dict") + else: + return None + + +def metadata_save(local_path: Union[str, Path], data: dict) -> None: + """ + Save the metadata dict in the upper YAML part Trying to preserve newlines as + in the existing file. Docs about open() with newline="" parameter: + https://docs.python.org/3/library/functions.html?highlight=open#open Does + not work with "^M" linebreaks, which are replaced by \n + """ + line_break = "\n" + content = "" + # try to detect existing newline character + if os.path.exists(local_path): + with open(local_path, "r", newline="", encoding="utf8") as readme: + content = readme.read() + if isinstance(readme.newlines, tuple): + line_break = readme.newlines[0] + elif isinstance(readme.newlines, str): + line_break = readme.newlines + + # creates a new file if it not + with open(local_path, "w", newline="", encoding="utf8") as readme: + data_yaml = yaml_dump(data, sort_keys=False, line_break=line_break) + # sort_keys: keep dict order + match = REGEX_YAML_BLOCK.search(content) + if match: + output = content[: match.start()] + f"---{line_break}{data_yaml}---{line_break}" + content[match.end() :] + else: + output = f"---{line_break}{data_yaml}---{line_break}{content}" + + readme.write(output) + readme.close() + + +def metadata_eval_result( + *, + model_pretty_name: str, + task_pretty_name: str, + task_id: str, + metrics_pretty_name: str, + metrics_id: str, + metrics_value: Any, + dataset_pretty_name: str, + dataset_id: str, + metrics_config: Optional[str] = None, + metrics_verified: bool = False, + dataset_config: Optional[str] = None, + dataset_split: Optional[str] = None, + dataset_revision: Optional[str] = None, + metrics_verification_token: Optional[str] = None, +) -> dict: + """ + Creates a metadata dict with the result from a model evaluated on a dataset. + + Args: + model_pretty_name (`str`): + The name of the model in natural language. + task_pretty_name (`str`): + The name of a task in natural language. + task_id (`str`): + Example: automatic-speech-recognition. A task id. 
+ metrics_pretty_name (`str`): + A name for the metric in natural language. Example: Test WER. + metrics_id (`str`): + Example: wer. A metric id from https://hf.co/metrics. + metrics_value (`Any`): + The value from the metric. Example: 20.0 or "20.0 ± 1.2". + dataset_pretty_name (`str`): + The name of the dataset in natural language. + dataset_id (`str`): + Example: common_voice. A dataset id from https://hf.co/datasets. + metrics_config (`str`, *optional*): + The name of the metric configuration used in `load_metric()`. + Example: bleurt-large-512 in `load_metric("bleurt", "bleurt-large-512")`. + metrics_verified (`bool`, *optional*, defaults to `False`): + Indicates whether the metrics originate from Hugging Face's [evaluation service](https://huggingface.co/spaces/autoevaluate/model-evaluator) or not. Automatically computed by Hugging Face, do not set. + dataset_config (`str`, *optional*): + Example: fr. The name of the dataset configuration used in `load_dataset()`. + dataset_split (`str`, *optional*): + Example: test. The name of the dataset split used in `load_dataset()`. + dataset_revision (`str`, *optional*): + Example: 5503434ddd753f426f4b38109466949a1217c2bb. The name of the dataset dataset revision + used in `load_dataset()`. + metrics_verification_token (`bool`, *optional*): + A JSON Web Token that is used to verify whether the metrics originate from Hugging Face's [evaluation service](https://huggingface.co/spaces/autoevaluate/model-evaluator) or not. + + Returns: + `dict`: a metadata dict with the result from a model evaluated on a dataset. + + Example: + ```python + >>> from huggingface_hub import metadata_eval_result + >>> results = metadata_eval_result( + ... model_pretty_name="RoBERTa fine-tuned on ReactionGIF", + ... task_pretty_name="Text Classification", + ... task_id="text-classification", + ... metrics_pretty_name="Accuracy", + ... metrics_id="accuracy", + ... metrics_value=0.2662102282047272, + ... dataset_pretty_name="ReactionJPEG", + ... dataset_id="julien-c/reactionjpeg", + ... dataset_config="default", + ... dataset_split="test", + ... ) + >>> results == { + ... 'model-index': [ + ... { + ... 'name': 'RoBERTa fine-tuned on ReactionGIF', + ... 'results': [ + ... { + ... 'task': { + ... 'type': 'text-classification', + ... 'name': 'Text Classification' + ... }, + ... 'dataset': { + ... 'name': 'ReactionJPEG', + ... 'type': 'julien-c/reactionjpeg', + ... 'config': 'default', + ... 'split': 'test' + ... }, + ... 'metrics': [ + ... { + ... 'type': 'accuracy', + ... 'value': 0.2662102282047272, + ... 'name': 'Accuracy', + ... 'verified': False + ... } + ... ] + ... } + ... ] + ... } + ... ] + ... 
} + True + + ``` + """ + + return { + "model-index": eval_results_to_model_index( + model_name=model_pretty_name, + eval_results=[ + EvalResult( + task_name=task_pretty_name, + task_type=task_id, + metric_name=metrics_pretty_name, + metric_type=metrics_id, + metric_value=metrics_value, + dataset_name=dataset_pretty_name, + dataset_type=dataset_id, + metric_config=metrics_config, + verified=metrics_verified, + verify_token=metrics_verification_token, + dataset_config=dataset_config, + dataset_split=dataset_split, + dataset_revision=dataset_revision, + ) + ], + ) + } + + +@validate_hf_hub_args +def metadata_update( + repo_id: str, + metadata: dict, + *, + repo_type: Optional[str] = None, + overwrite: bool = False, + token: Optional[str] = None, + commit_message: Optional[str] = None, + commit_description: Optional[str] = None, + revision: Optional[str] = None, + create_pr: bool = False, + parent_commit: Optional[str] = None, +) -> str: + """ + Updates the metadata in the README.md of a repository on the Hugging Face Hub. + If the README.md file doesn't exist yet, a new one is created with metadata and + the default ModelCard or DatasetCard template. For `space` repo, an error is thrown + as a Space cannot exist without a `README.md` file. + + Args: + repo_id (`str`): + The name of the repository. + metadata (`dict`): + A dictionary containing the metadata to be updated. + repo_type (`str`, *optional*): + Set to `"dataset"` or `"space"` if updating to a dataset or space, + `None` or `"model"` if updating to a model. Default is `None`. + overwrite (`bool`, *optional*, defaults to `False`): + If set to `True` an existing field can be overwritten, otherwise + attempting to overwrite an existing field will cause an error. + token (`str`, *optional*): + The Hugging Face authentication token. + commit_message (`str`, *optional*): + The summary / title / first line of the generated commit. Defaults to + `f"Update metadata with huggingface_hub"` + commit_description (`str` *optional*) + The description of the generated commit + revision (`str`, *optional*): + The git revision to commit from. Defaults to the head of the + `"main"` branch. + create_pr (`boolean`, *optional*): + Whether or not to create a Pull Request from `revision` with that commit. + Defaults to `False`. + parent_commit (`str`, *optional*): + The OID / SHA of the parent commit, as a hexadecimal string. Shorthands (7 first characters) are also supported. + If specified and `create_pr` is `False`, the commit will fail if `revision` does not point to `parent_commit`. + If specified and `create_pr` is `True`, the pull request will be created from `parent_commit`. + Specifying `parent_commit` ensures the repo has not changed before committing the changes, and can be + especially useful if the repo is updated / committed too concurrently. + Returns: + `str`: URL of the commit which updated the card metadata. + + Example: + ```python + >>> from huggingface_hub import metadata_update + >>> metadata = {'model-index': [{'name': 'RoBERTa fine-tuned on ReactionGIF', + ... 'results': [{'dataset': {'name': 'ReactionGIF', + ... 'type': 'julien-c/reactiongif'}, + ... 'metrics': [{'name': 'Recall', + ... 'type': 'recall', + ... 'value': 0.7762102282047272}], + ... 'task': {'name': 'Text Classification', + ... 
'type': 'text-classification'}}]}]} + >>> url = metadata_update("hf-internal-testing/reactiongif-roberta-card", metadata) + + ``` + """ + commit_message = commit_message if commit_message is not None else "Update metadata with huggingface_hub" + + # Card class given repo_type + card_class: type[RepoCard] + if repo_type is None or repo_type == "model": + card_class = ModelCard + elif repo_type == "dataset": + card_class = DatasetCard + elif repo_type == "space": + card_class = RepoCard + else: + raise ValueError(f"Unknown repo_type: {repo_type}") + + # Either load repo_card from the Hub or create an empty one. + # NOTE: Will not create the repo if it doesn't exist. + try: + card = card_class.load(repo_id, token=token, repo_type=repo_type) + except EntryNotFoundError: + if repo_type == "space": + raise ValueError("Cannot update metadata on a Space that doesn't contain a `README.md` file.") + + # Initialize a ModelCard or DatasetCard from default template and no data. + # Cast to the concrete expected card type to satisfy type checkers. + card = card_class.from_template(CardData()) # type: ignore[return-value] + + for key, value in metadata.items(): + if key == "model-index": + # if the new metadata doesn't include a name, either use existing one or repo name + if "name" not in value[0]: + value[0]["name"] = getattr(card, "model_name", repo_id) + model_name, new_results = model_index_to_eval_results(value) + if card.data.eval_results is None: + card.data.eval_results = new_results + card.data.model_name = model_name + else: + existing_results = card.data.eval_results + + # Iterate over new results + # Iterate over existing results + # If both results describe the same metric but value is different: + # If overwrite=True: overwrite the metric value + # Else: raise ValueError + # Else: append new result to existing ones. + for new_result in new_results: + result_found = False + for existing_result in existing_results: + if new_result.is_equal_except_value(existing_result): + if new_result != existing_result and not overwrite: + raise ValueError( + "You passed a new value for the existing metric" + f" 'name: {new_result.metric_name}, type: " + f"{new_result.metric_type}'. Set `overwrite=True`" + " to overwrite existing metrics." + ) + result_found = True + existing_result.metric_value = new_result.metric_value + if existing_result.verified is True: + existing_result.verify_token = new_result.verify_token + if not result_found: + card.data.eval_results.append(new_result) + else: + # Any metadata that is not a result metric + if card.data.get(key) is not None and not overwrite and card.data.get(key) != value: + raise ValueError( + f"You passed a new value for the existing meta data field '{key}'." + " Set `overwrite=True` to overwrite existing metadata." 
+ ) + else: + card.data[key] = value + + return card.push_to_hub( + repo_id, + token=token, + repo_type=repo_type, + commit_message=commit_message, + commit_description=commit_description, + create_pr=create_pr, + revision=revision, + parent_commit=parent_commit, + ) diff --git a/env/lib/python3.13/site-packages/tqdm/__init__.py b/env/lib/python3.13/site-packages/tqdm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8081f77b8812f3b42d7949daa4195d2c35dc70ac --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/__init__.py @@ -0,0 +1,38 @@ +from ._monitor import TMonitor, TqdmSynchronisationWarning +from ._tqdm_pandas import tqdm_pandas +from .cli import main # TODO: remove in v5.0.0 +from .gui import tqdm as tqdm_gui # TODO: remove in v5.0.0 +from .gui import trange as tgrange # TODO: remove in v5.0.0 +from .std import ( + TqdmDeprecationWarning, TqdmExperimentalWarning, TqdmKeyError, TqdmMonitorWarning, + TqdmTypeError, TqdmWarning, tqdm, trange) +from .version import __version__ + +__all__ = ['tqdm', 'tqdm_gui', 'trange', 'tgrange', 'tqdm_pandas', + 'tqdm_notebook', 'tnrange', 'main', 'TMonitor', + 'TqdmTypeError', 'TqdmKeyError', + 'TqdmWarning', 'TqdmDeprecationWarning', + 'TqdmExperimentalWarning', + 'TqdmMonitorWarning', 'TqdmSynchronisationWarning', + '__version__'] + + +def tqdm_notebook(*args, **kwargs): # pragma: no cover + """See tqdm.notebook.tqdm for full documentation""" + from warnings import warn + + from .notebook import tqdm as _tqdm_notebook + warn("This function will be removed in tqdm==5.0.0\n" + "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`", + TqdmDeprecationWarning, stacklevel=2) + return _tqdm_notebook(*args, **kwargs) + + +def tnrange(*args, **kwargs): # pragma: no cover + """Shortcut for `tqdm.notebook.tqdm(range(*args), **kwargs)`.""" + from warnings import warn + + from .notebook import trange as _tnrange + warn("Please use `tqdm.notebook.trange` instead of `tqdm.tnrange`", + TqdmDeprecationWarning, stacklevel=2) + return _tnrange(*args, **kwargs) diff --git a/env/lib/python3.13/site-packages/tqdm/__main__.py b/env/lib/python3.13/site-packages/tqdm/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..4e28416e104515e90fca4b69cc60d0c61fd15d61 --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/__main__.py @@ -0,0 +1,3 @@ +from .cli import main + +main() diff --git a/env/lib/python3.13/site-packages/tqdm/_dist_ver.py b/env/lib/python3.13/site-packages/tqdm/_dist_ver.py new file mode 100644 index 0000000000000000000000000000000000000000..61af7d5bb0b25d8dc934b45b18ea35bd32dbb465 --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/_dist_ver.py @@ -0,0 +1 @@ +__version__ = '4.67.1' diff --git a/env/lib/python3.13/site-packages/tqdm/_main.py b/env/lib/python3.13/site-packages/tqdm/_main.py new file mode 100644 index 0000000000000000000000000000000000000000..04fdeeff17b5cc84b210f445b54b87d5b99e3748 --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/_main.py @@ -0,0 +1,9 @@ +from warnings import warn + +from .cli import * # NOQA +from .cli import __all__ # NOQA +from .std import TqdmDeprecationWarning + +warn("This function will be removed in tqdm==5.0.0\n" + "Please use `tqdm.cli.*` instead of `tqdm._main.*`", + TqdmDeprecationWarning, stacklevel=2) diff --git a/env/lib/python3.13/site-packages/tqdm/_monitor.py b/env/lib/python3.13/site-packages/tqdm/_monitor.py new file mode 100644 index 0000000000000000000000000000000000000000..f71aa56817ca77eba5df4a2dd11cb0c4a9a7ea1c 
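As a quick illustration of the metric-merge semantics implemented in `metadata_update` above: a differing value for an already-reported metric raises `ValueError` unless `overwrite=True` is passed. The repo id and token below are placeholders, not real resources.

```python
>>> from huggingface_hub import metadata_update

>>> # "user/my-cool-model" and the token are placeholders for illustration.
>>> metadata = {
...     "model-index": [{
...         "name": "my-cool-model",
...         "results": [{
...             "task": {"name": "Text Classification", "type": "text-classification"},
...             "dataset": {"name": "ReactionGIF", "type": "julien-c/reactiongif"},
...             "metrics": [{"name": "Recall", "type": "recall", "value": 0.8}],
...         }],
...     }]
... }
>>> # If the card already reports a different value for this metric,
>>> # the call raises ValueError unless overwrite=True is passed.
>>> url = metadata_update("user/my-cool-model", metadata, overwrite=True, token="hf_...")
```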
--- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/_monitor.py @@ -0,0 +1,95 @@ +import atexit +from threading import Event, Thread, current_thread +from time import time +from warnings import warn + +__all__ = ["TMonitor", "TqdmSynchronisationWarning"] + + +class TqdmSynchronisationWarning(RuntimeWarning): + """tqdm multi-thread/-process errors which may cause incorrect nesting + but otherwise no adverse effects""" + pass + + +class TMonitor(Thread): + """ + Monitoring thread for tqdm bars. + Monitors if tqdm bars are taking too much time to display + and readjusts miniters automatically if necessary. + + Parameters + ---------- + tqdm_cls : class + tqdm class to use (can be core tqdm or a submodule). + sleep_interval : float + Time to sleep between monitoring checks. + """ + _test = {} # internal vars for unit testing + + def __init__(self, tqdm_cls, sleep_interval): + Thread.__init__(self) + self.daemon = True # kill thread when main killed (KeyboardInterrupt) + self.woken = 0 # last time woken up, to sync with monitor + self.tqdm_cls = tqdm_cls + self.sleep_interval = sleep_interval + self._time = self._test.get("time", time) + self.was_killed = self._test.get("Event", Event)() + atexit.register(self.exit) + self.start() + + def exit(self): + self.was_killed.set() + if self is not current_thread(): + self.join() + return self.report() + + def get_instances(self): + # returns a copy of started `tqdm_cls` instances + return [i for i in self.tqdm_cls._instances.copy() + # Avoid race by checking that the instance started + if hasattr(i, 'start_t')] + + def run(self): + cur_t = self._time() + while True: + # After processing and before sleeping, notify that we woke + # Need to be done just before sleeping + self.woken = cur_t + # Sleep some time... + self.was_killed.wait(self.sleep_interval) + # Quit if killed + if self.was_killed.is_set(): + return + # Then monitor! + # Acquire lock (to access _instances) + with self.tqdm_cls.get_lock(): + cur_t = self._time() + # Check tqdm instances are waiting too long to print + instances = self.get_instances() + for instance in instances: + # Check event in loop to reduce blocking time on exit + if self.was_killed.is_set(): + return + # Only if mininterval > 1 (else iterations are just slow) + # and last refresh exceeded maxinterval + if ( + instance.miniters > 1 + and (cur_t - instance.last_print_t) >= instance.maxinterval + ): + # force bypassing miniters on next iteration + # (dynamic_miniters adjusts mininterval automatically) + instance.miniters = 1 + # Refresh now! 
(works only for manual tqdm) + instance.refresh(nolock=True) + # Remove accidental long-lived strong reference + del instance + if instances != self.get_instances(): # pragma: nocover + warn("Set changed size during iteration" + + " (see https://github.com/tqdm/tqdm/issues/481)", + TqdmSynchronisationWarning, stacklevel=2) + # Remove accidental long-lived strong references + del instances + + def report(self): + return not self.was_killed.is_set() diff --git a/env/lib/python3.13/site-packages/tqdm/_tqdm.py b/env/lib/python3.13/site-packages/tqdm/_tqdm.py new file mode 100644 index 0000000000000000000000000000000000000000..7fc4962774a4651db7a739a3f143633b6215a9bd --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/_tqdm.py @@ -0,0 +1,9 @@ +from warnings import warn + +from .std import * # NOQA +from .std import __all__ # NOQA +from .std import TqdmDeprecationWarning + +warn("This function will be removed in tqdm==5.0.0\n" + "Please use `tqdm.std.*` instead of `tqdm._tqdm.*`", + TqdmDeprecationWarning, stacklevel=2) diff --git a/env/lib/python3.13/site-packages/tqdm/_tqdm_gui.py b/env/lib/python3.13/site-packages/tqdm/_tqdm_gui.py new file mode 100644 index 0000000000000000000000000000000000000000..f32aa894f54b3a5b47a0fbf4263c2fd20df56c9d --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/_tqdm_gui.py @@ -0,0 +1,9 @@ +from warnings import warn + +from .gui import * # NOQA +from .gui import __all__ # NOQA +from .std import TqdmDeprecationWarning + +warn("This function will be removed in tqdm==5.0.0\n" + "Please use `tqdm.gui.*` instead of `tqdm._tqdm_gui.*`", + TqdmDeprecationWarning, stacklevel=2) diff --git a/env/lib/python3.13/site-packages/tqdm/_tqdm_notebook.py b/env/lib/python3.13/site-packages/tqdm/_tqdm_notebook.py new file mode 100644 index 0000000000000000000000000000000000000000..f225fbf5b52d04987ccf68f4d5ee4b735e3158b0 --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/_tqdm_notebook.py @@ -0,0 +1,9 @@ +from warnings import warn + +from .notebook import * # NOQA +from .notebook import __all__ # NOQA +from .std import TqdmDeprecationWarning + +warn("This function will be removed in tqdm==5.0.0\n" + "Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`", + TqdmDeprecationWarning, stacklevel=2) diff --git a/env/lib/python3.13/site-packages/tqdm/_tqdm_pandas.py b/env/lib/python3.13/site-packages/tqdm/_tqdm_pandas.py new file mode 100644 index 0000000000000000000000000000000000000000..c4fe6efdc603579e7f8acfa27ac10dccdf3e94ce --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/_tqdm_pandas.py @@ -0,0 +1,24 @@ +import sys + +__author__ = "github.com/casperdcl" +__all__ = ['tqdm_pandas'] + + +def tqdm_pandas(tclass, **tqdm_kwargs): + """ + Registers the given `tqdm` instance with + `pandas.core.groupby.DataFrameGroupBy.progress_apply`. 
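Since this shim redirects users to `tqdm.pandas(...)`, here is a minimal sketch of the modern registration pattern; the DataFrame contents are made up and pandas must be installed.

```python
>>> import pandas as pd
>>> from tqdm import tqdm

>>> tqdm.pandas(desc="apply")  # registers .progress_apply() on pandas objects
>>> df = pd.DataFrame({"value": range(1000)})
>>> df["value"].progress_apply(lambda v: v * 2)  # same as .apply(), plus a progress bar
```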
+ """ + from tqdm import TqdmDeprecationWarning + + if isinstance(tclass, type) or (getattr(tclass, '__name__', '').startswith( + 'tqdm_')): # delayed adapter case + TqdmDeprecationWarning( + "Please use `tqdm.pandas(...)` instead of `tqdm_pandas(tqdm, ...)`.", + fp_write=getattr(tqdm_kwargs.get('file', None), 'write', sys.stderr.write)) + tclass.pandas(**tqdm_kwargs) + else: + TqdmDeprecationWarning( + "Please use `tqdm.pandas(...)` instead of `tqdm_pandas(tqdm(...))`.", + fp_write=getattr(tclass.fp, 'write', sys.stderr.write)) + type(tclass).pandas(deprecated_t=tclass) diff --git a/env/lib/python3.13/site-packages/tqdm/_utils.py b/env/lib/python3.13/site-packages/tqdm/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..385e849e106d1319fe21045f14eb0aa6552fb153 --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/_utils.py @@ -0,0 +1,11 @@ +from warnings import warn + +from .std import TqdmDeprecationWarning +from .utils import ( # NOQA, pylint: disable=unused-import + CUR_OS, IS_NIX, IS_WIN, RE_ANSI, Comparable, FormatReplace, SimpleTextIOWrapper, + _environ_cols_wrapper, _is_ascii, _is_utf, _screen_shape_linux, _screen_shape_tput, + _screen_shape_windows, _screen_shape_wrapper, _supports_unicode, _term_move_up, colorama) + +warn("This function will be removed in tqdm==5.0.0\n" + "Please use `tqdm.utils.*` instead of `tqdm._utils.*`", + TqdmDeprecationWarning, stacklevel=2) diff --git a/env/lib/python3.13/site-packages/tqdm/asyncio.py b/env/lib/python3.13/site-packages/tqdm/asyncio.py new file mode 100644 index 0000000000000000000000000000000000000000..2d00a0a2e755f36068d079ccc12ca84d86ff42be --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/asyncio.py @@ -0,0 +1,93 @@ +""" +Asynchronous progressbar decorator for iterators. +Includes a default `range` iterator printing to `stderr`. + +Usage: +>>> from tqdm.asyncio import trange, tqdm +>>> async for i in trange(10): +... ... +""" +import asyncio +from sys import version_info + +from .std import tqdm as std_tqdm + +__author__ = {"github.com/": ["casperdcl"]} +__all__ = ['tqdm_asyncio', 'tarange', 'tqdm', 'trange'] + + +class tqdm_asyncio(std_tqdm): + """ + Asynchronous-friendly version of tqdm. + """ + def __init__(self, iterable=None, *args, **kwargs): + super().__init__(iterable, *args, **kwargs) + self.iterable_awaitable = False + if iterable is not None: + if hasattr(iterable, "__anext__"): + self.iterable_next = iterable.__anext__ + self.iterable_awaitable = True + elif hasattr(iterable, "__next__"): + self.iterable_next = iterable.__next__ + else: + self.iterable_iterator = iter(iterable) + self.iterable_next = self.iterable_iterator.__next__ + + def __aiter__(self): + return self + + async def __anext__(self): + try: + if self.iterable_awaitable: + res = await self.iterable_next() + else: + res = self.iterable_next() + self.update() + return res + except StopIteration: + self.close() + raise StopAsyncIteration + except BaseException: + self.close() + raise + + def send(self, *args, **kwargs): + return self.iterable.send(*args, **kwargs) + + @classmethod + def as_completed(cls, fs, *, loop=None, timeout=None, total=None, **tqdm_kwargs): + """ + Wrapper for `asyncio.as_completed`. 
+ """ + if total is None: + total = len(fs) + kwargs = {} + if version_info[:2] < (3, 10): + kwargs['loop'] = loop + yield from cls(asyncio.as_completed(fs, timeout=timeout, **kwargs), + total=total, **tqdm_kwargs) + + @classmethod + async def gather(cls, *fs, loop=None, timeout=None, total=None, **tqdm_kwargs): + """ + Wrapper for `asyncio.gather`. + """ + async def wrap_awaitable(i, f): + return i, await f + + ifs = [wrap_awaitable(i, f) for i, f in enumerate(fs)] + res = [await f for f in cls.as_completed(ifs, loop=loop, timeout=timeout, + total=total, **tqdm_kwargs)] + return [i for _, i in sorted(res)] + + +def tarange(*args, **kwargs): + """ + A shortcut for `tqdm.asyncio.tqdm(range(*args), **kwargs)`. + """ + return tqdm_asyncio(range(*args), **kwargs) + + +# Aliases +tqdm = tqdm_asyncio +trange = tarange diff --git a/env/lib/python3.13/site-packages/tqdm/auto.py b/env/lib/python3.13/site-packages/tqdm/auto.py new file mode 100644 index 0000000000000000000000000000000000000000..206c4409d5269594bdbab3a092ef6e09e7c01947 --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/auto.py @@ -0,0 +1,40 @@ +""" +Enables multiple commonly used features. + +Method resolution order: + +- `tqdm.autonotebook` without import warnings +- `tqdm.asyncio` +- `tqdm.std` base class + +Usage: +>>> from tqdm.auto import trange, tqdm +>>> for i in trange(10): +... ... +""" +import warnings + +from .std import TqdmExperimentalWarning + +with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=TqdmExperimentalWarning) + from .autonotebook import tqdm as notebook_tqdm + +from .asyncio import tqdm as asyncio_tqdm +from .std import tqdm as std_tqdm + +if notebook_tqdm != std_tqdm: + class tqdm(notebook_tqdm, asyncio_tqdm): # pylint: disable=inconsistent-mro + pass +else: + tqdm = asyncio_tqdm + + +def trange(*args, **kwargs): + """ + A shortcut for `tqdm.auto.tqdm(range(*args), **kwargs)`. + """ + return tqdm(range(*args), **kwargs) + + +__all__ = ["tqdm", "trange"] diff --git a/env/lib/python3.13/site-packages/tqdm/autonotebook.py b/env/lib/python3.13/site-packages/tqdm/autonotebook.py new file mode 100644 index 0000000000000000000000000000000000000000..a09f2ec4b8c95f12b8c7b7774f84d5ec55826334 --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/autonotebook.py @@ -0,0 +1,29 @@ +""" +Automatically choose between `tqdm.notebook` and `tqdm.std`. + +Usage: +>>> from tqdm.autonotebook import trange, tqdm +>>> for i in trange(10): +... ... +""" +import sys +from warnings import warn + +try: + get_ipython = sys.modules['IPython'].get_ipython + if 'IPKernelApp' not in get_ipython().config: # pragma: no cover + raise ImportError("console") + from .notebook import WARN_NOIPYW, IProgress + if IProgress is None: + from .std import TqdmWarning + warn(WARN_NOIPYW, TqdmWarning, stacklevel=2) + raise ImportError('ipywidgets') +except Exception: + from .std import tqdm, trange +else: # pragma: no cover + from .notebook import tqdm, trange + from .std import TqdmExperimentalWarning + warn("Using `tqdm.autonotebook.tqdm` in notebook mode." + " Use `tqdm.tqdm` instead to force console mode" + " (e.g. 
in jupyter console)", TqdmExperimentalWarning, stacklevel=2) +__all__ = ["tqdm", "trange"] diff --git a/env/lib/python3.13/site-packages/tqdm/cli.py b/env/lib/python3.13/site-packages/tqdm/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..e54a7fc8599fe0dfef12cd53b76b27ae51b68b4b --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/cli.py @@ -0,0 +1,324 @@ +""" +Module version for monitoring CLI pipes (`... | python -m tqdm | ...`). +""" +import logging +import re +import sys +from ast import literal_eval as numeric +from textwrap import indent + +from .std import TqdmKeyError, TqdmTypeError, tqdm +from .version import __version__ + +__all__ = ["main"] +log = logging.getLogger(__name__) + + +def cast(val, typ): + log.debug((val, typ)) + if " or " in typ: + for t in typ.split(" or "): + try: + return cast(val, t) + except TqdmTypeError: + pass + raise TqdmTypeError(f"{val} : {typ}") + + # sys.stderr.write('\ndebug | `val:type`: `' + val + ':' + typ + '`.\n') + if typ == 'bool': + if (val == 'True') or (val == ''): + return True + if val == 'False': + return False + raise TqdmTypeError(val + ' : ' + typ) + if typ == 'chr': + if len(val) == 1: + return val.encode() + if re.match(r"^\\\w+$", val): + return eval(f'"{val}"').encode() + raise TqdmTypeError(f"{val} : {typ}") + if typ == 'str': + return val + if typ == 'int': + try: + return int(val) + except ValueError as exc: + raise TqdmTypeError(f"{val} : {typ}") from exc + if typ == 'float': + try: + return float(val) + except ValueError as exc: + raise TqdmTypeError(f"{val} : {typ}") from exc + raise TqdmTypeError(f"{val} : {typ}") + + +def posix_pipe(fin, fout, delim=b'\\n', buf_size=256, + callback=lambda float: None, callback_len=True): + """ + Params + ------ + fin : binary file with `read(buf_size : int)` method + fout : binary file with `write` (and optionally `flush`) methods. + callback : function(float), e.g.: `tqdm.update` + callback_len : If (default: True) do `callback(len(buffer))`. + Otherwise, do `callback(data) for data in buffer.split(delim)`. + """ + fp_write = fout.write + + if not delim: + while True: + tmp = fin.read(buf_size) + + # flush at EOF + if not tmp: + getattr(fout, 'flush', lambda: None)() + return + + fp_write(tmp) + callback(len(tmp)) + # return + + buf = b'' + len_delim = len(delim) + # n = 0 + while True: + tmp = fin.read(buf_size) + + # flush at EOF + if not tmp: + if buf: + fp_write(buf) + if callback_len: + # n += 1 + buf.count(delim) + callback(1 + buf.count(delim)) + else: + for i in buf.split(delim): + callback(i) + getattr(fout, 'flush', lambda: None)() + return # n + + while True: + i = tmp.find(delim) + if i < 0: + buf += tmp + break + fp_write(buf + tmp[:i + len(delim)]) + # n += 1 + callback(1 if callback_len else (buf + tmp[:i])) + buf = b'' + tmp = tmp[i + len_delim:] + + +# ((opt, type), ... ) +RE_OPTS = re.compile(r'\n {4}(\S+)\s{2,}:\s*([^,]+)') +# better split method assuming no positional args +RE_SHLEX = re.compile(r'\s*(? : \2', d) + split = RE_OPTS.split(d) + opt_types_desc = zip(split[1::3], split[2::3], split[3::3]) + d = ''.join(('\n --{0} : {2}{3}' if otd[1] == 'bool' else + '\n --{0}=<{1}> : {2}{3}').format( + otd[0].replace('_', '-'), otd[0], *otd[1:]) + for otd in opt_types_desc if otd[0] not in UNSUPPORTED_OPTS) + + help_short = "Usage:\n tqdm [--help | options]\n" + d = help_short + """ +Options: + -h, --help Print this help and exit. + -v, --version Print version and exit. 
+""" + d.strip('\n') + '\n' + + # opts = docopt(d, version=__version__) + if any(v in argv for v in ('-v', '--version')): + sys.stdout.write(__version__ + '\n') + sys.exit(0) + elif any(v in argv for v in ('-h', '--help')): + sys.stdout.write(d + '\n') + sys.exit(0) + elif argv and argv[0][:2] != '--': + sys.stderr.write(f"Error:Unknown argument:{argv[0]}\n{help_short}") + + argv = RE_SHLEX.split(' '.join(["tqdm"] + argv)) + opts = dict(zip(argv[1::3], argv[3::3])) + + log.debug(opts) + opts.pop('log', True) + + tqdm_args = {'file': fp} + try: + for (o, v) in opts.items(): + o = o.replace('-', '_') + try: + tqdm_args[o] = cast(v, opt_types[o]) + except KeyError as e: + raise TqdmKeyError(str(e)) + log.debug('args:' + str(tqdm_args)) + + delim_per_char = tqdm_args.pop('bytes', False) + update = tqdm_args.pop('update', False) + update_to = tqdm_args.pop('update_to', False) + if sum((delim_per_char, update, update_to)) > 1: + raise TqdmKeyError("Can only have one of --bytes --update --update_to") + except Exception: + fp.write("\nError:\n" + help_short) + stdin, stdout_write = sys.stdin, sys.stdout.write + for i in stdin: + stdout_write(i) + raise + else: + buf_size = tqdm_args.pop('buf_size', 256) + delim = tqdm_args.pop('delim', b'\\n') + tee = tqdm_args.pop('tee', False) + manpath = tqdm_args.pop('manpath', None) + comppath = tqdm_args.pop('comppath', None) + if tqdm_args.pop('null', False): + class stdout(object): + @staticmethod + def write(_): + pass + else: + stdout = sys.stdout + stdout = getattr(stdout, 'buffer', stdout) + stdin = getattr(sys.stdin, 'buffer', sys.stdin) + if manpath or comppath: + try: # py<3.9 + import importlib_resources as resources + except ImportError: + from importlib import resources + from pathlib import Path + + def cp(name, dst): + """copy resource `name` to `dst`""" + fi = resources.files('tqdm') / name + dst.write_bytes(fi.read_bytes()) + log.info("written:%s", dst) + if manpath is not None: + cp('tqdm.1', Path(manpath) / 'tqdm.1') + if comppath is not None: + cp('completion.sh', Path(comppath) / 'tqdm_completion.sh') + sys.exit(0) + if tee: + stdout_write = stdout.write + fp_write = getattr(fp, 'buffer', fp).write + + class stdout(object): # pylint: disable=function-redefined + @staticmethod + def write(x): + with tqdm.external_write_mode(file=fp): + fp_write(x) + stdout_write(x) + if delim_per_char: + tqdm_args.setdefault('unit', 'B') + tqdm_args.setdefault('unit_scale', True) + tqdm_args.setdefault('unit_divisor', 1024) + log.debug(tqdm_args) + with tqdm(**tqdm_args) as t: + posix_pipe(stdin, stdout, '', buf_size, t.update) + elif delim == b'\\n': + log.debug(tqdm_args) + write = stdout.write + if update or update_to: + with tqdm(**tqdm_args) as t: + if update: + def callback(i): + t.update(numeric(i.decode())) + else: # update_to + def callback(i): + t.update(numeric(i.decode()) - t.n) + for i in stdin: + write(i) + callback(i) + else: + for i in tqdm(stdin, **tqdm_args): + write(i) + else: + log.debug(tqdm_args) + with tqdm(**tqdm_args) as t: + callback_len = False + if update: + def callback(i): + t.update(numeric(i.decode())) + elif update_to: + def callback(i): + t.update(numeric(i.decode()) - t.n) + else: + callback = t.update + callback_len = True + posix_pipe(stdin, stdout, delim, buf_size, callback, callback_len) diff --git a/env/lib/python3.13/site-packages/tqdm/completion.sh b/env/lib/python3.13/site-packages/tqdm/completion.sh new file mode 100644 index 0000000000000000000000000000000000000000..9f61c7f14bb8c1f6099b9eb75dce28ece6a7ae96 --- 
/dev/null +++ b/env/lib/python3.13/site-packages/tqdm/completion.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +_tqdm(){ + local cur prv + cur="${COMP_WORDS[COMP_CWORD]}" + prv="${COMP_WORDS[COMP_CWORD - 1]}" + + case ${prv} in + --bar_format|--buf_size|--colour|--comppath|--delay|--delim|--desc|--initial|--lock_args|--manpath|--maxinterval|--mininterval|--miniters|--ncols|--nrows|--position|--postfix|--smoothing|--total|--unit|--unit_divisor) + # await user input + ;; + "--log") + COMPREPLY=($(compgen -W 'CRITICAL FATAL ERROR WARN WARNING INFO DEBUG NOTSET' -- ${cur})) + ;; + *) + COMPREPLY=($(compgen -W '--ascii --bar_format --buf_size --bytes --colour --comppath --delay --delim --desc --disable --dynamic_ncols --help --initial --leave --lock_args --log --manpath --maxinterval --mininterval --miniters --ncols --nrows --null --position --postfix --smoothing --tee --total --unit --unit_divisor --unit_scale --update --update_to --version --write_bytes -h -v' -- ${cur})) + ;; + esac +} +complete -F _tqdm tqdm diff --git a/env/lib/python3.13/site-packages/tqdm/dask.py b/env/lib/python3.13/site-packages/tqdm/dask.py new file mode 100644 index 0000000000000000000000000000000000000000..57f1b668f59dc5991019eee34c7df3232a2c2cd7 --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/dask.py @@ -0,0 +1,44 @@ +from functools import partial + +from dask.callbacks import Callback + +from .auto import tqdm as tqdm_auto + +__author__ = {"github.com/": ["casperdcl"]} +__all__ = ['TqdmCallback'] + + +class TqdmCallback(Callback): + """Dask callback for task progress.""" + def __init__(self, start=None, pretask=None, tqdm_class=tqdm_auto, + **tqdm_kwargs): + """ + Parameters + ---------- + tqdm_class : optional + `tqdm` class to use for bars [default: `tqdm.auto.tqdm`]. + tqdm_kwargs : optional + Any other arguments used for all bars. + """ + super().__init__(start=start, pretask=pretask) + if tqdm_kwargs: + tqdm_class = partial(tqdm_class, **tqdm_kwargs) + self.tqdm_class = tqdm_class + + def _start_state(self, _, state): + self.pbar = self.tqdm_class(total=sum( + len(state[k]) for k in ['ready', 'waiting', 'running', 'finished'])) + + def _posttask(self, *_, **__): + self.pbar.update() + + def _finish(self, *_, **__): + self.pbar.close() + + def display(self): + """Displays in the current cell in Notebooks.""" + container = getattr(self.bar, 'container', None) + if container is None: + return + from .notebook import display + display(container) diff --git a/env/lib/python3.13/site-packages/tqdm/gui.py b/env/lib/python3.13/site-packages/tqdm/gui.py new file mode 100644 index 0000000000000000000000000000000000000000..cb52fb91a8661f4c73edd352bbc6f21b877dcfee --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/gui.py @@ -0,0 +1,179 @@ +""" +Matplotlib GUI progressbar decorator for iterators. + +Usage: +>>> from tqdm.gui import trange, tqdm +>>> for i in trange(10): +... ... +""" +# future division is important to divide integers and get as +# a result precise floating numbers (instead of truncated int) +import re +from warnings import warn + +# to inherit from the tqdm class +from .std import TqdmExperimentalWarning +from .std import tqdm as std_tqdm + +# import compatibility functions and utilities + +__author__ = {"github.com/": ["casperdcl", "lrq3000"]} +__all__ = ['tqdm_gui', 'tgrange', 'tqdm', 'trange'] + + +class tqdm_gui(std_tqdm): # pragma: no cover + """Experimental Matplotlib GUI version of tqdm!""" + # TODO: @classmethod: write() on GUI? 
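For the dask `TqdmCallback` above, the usual pattern is to register it as a context manager around a `.compute()` call; a sketch assuming `dask` is installed, with an arbitrary array shape and description.

```python
import dask.array as da
from tqdm.dask import TqdmCallback

# arbitrary chunked array; each finished dask task advances the bar
x = da.random.random((10_000, 10_000), chunks=(1_000, 1_000))

with TqdmCallback(desc="mean"):   # dask callbacks register via context manager
    result = x.mean().compute()
```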
+ def __init__(self, *args, **kwargs): + from collections import deque + + import matplotlib as mpl + import matplotlib.pyplot as plt + kwargs = kwargs.copy() + kwargs['gui'] = True + colour = kwargs.pop('colour', 'g') + super().__init__(*args, **kwargs) + + if self.disable: + return + + warn("GUI is experimental/alpha", TqdmExperimentalWarning, stacklevel=2) + self.mpl = mpl + self.plt = plt + + # Remember if external environment uses toolbars + self.toolbar = self.mpl.rcParams['toolbar'] + self.mpl.rcParams['toolbar'] = 'None' + + self.mininterval = max(self.mininterval, 0.5) + self.fig, ax = plt.subplots(figsize=(9, 2.2)) + # self.fig.subplots_adjust(bottom=0.2) + total = self.__len__() # avoids TypeError on None #971 + if total is not None: + self.xdata = [] + self.ydata = [] + self.zdata = [] + else: + self.xdata = deque([]) + self.ydata = deque([]) + self.zdata = deque([]) + self.line1, = ax.plot(self.xdata, self.ydata, color='b') + self.line2, = ax.plot(self.xdata, self.zdata, color='k') + ax.set_ylim(0, 0.001) + if total is not None: + ax.set_xlim(0, 100) + ax.set_xlabel("percent") + self.fig.legend((self.line1, self.line2), ("cur", "est"), + loc='center right') + # progressbar + self.hspan = plt.axhspan(0, 0.001, xmin=0, xmax=0, color=colour) + else: + # ax.set_xlim(-60, 0) + ax.set_xlim(0, 60) + ax.invert_xaxis() + ax.set_xlabel("seconds") + ax.legend(("cur", "est"), loc='lower left') + ax.grid() + # ax.set_xlabel('seconds') + ax.set_ylabel((self.unit if self.unit else "it") + "/s") + if self.unit_scale: + plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0)) + ax.yaxis.get_offset_text().set_x(-0.15) + + # Remember if external environment is interactive + self.wasion = plt.isinteractive() + plt.ion() + self.ax = ax + + def close(self): + if self.disable: + return + + self.disable = True + + with self.get_lock(): + self._instances.remove(self) + + # Restore toolbars + self.mpl.rcParams['toolbar'] = self.toolbar + # Return to non-interactive mode + if not self.wasion: + self.plt.ioff() + if self.leave: + self.display() + else: + self.plt.close(self.fig) + + def clear(self, *_, **__): + pass + + def display(self, *_, **__): + n = self.n + cur_t = self._time() + elapsed = cur_t - self.start_t + delta_it = n - self.last_print_n + delta_t = cur_t - self.last_print_t + + # Inline due to multiple calls + total = self.total + xdata = self.xdata + ydata = self.ydata + zdata = self.zdata + ax = self.ax + line1 = self.line1 + line2 = self.line2 + hspan = getattr(self, 'hspan', None) + # instantaneous rate + y = delta_it / delta_t + # overall rate + z = n / elapsed + # update line data + xdata.append(n * 100.0 / total if total else cur_t) + ydata.append(y) + zdata.append(z) + + # Discard old values + # xmin, xmax = ax.get_xlim() + # if (not total) and elapsed > xmin * 1.1: + if (not total) and elapsed > 66: + xdata.popleft() + ydata.popleft() + zdata.popleft() + + ymin, ymax = ax.get_ylim() + if y > ymax or z > ymax: + ymax = 1.1 * y + ax.set_ylim(ymin, ymax) + ax.figure.canvas.draw() + + if total: + line1.set_data(xdata, ydata) + line2.set_data(xdata, zdata) + if hspan: + hspan.set_xy((0, ymin)) + hspan.set_height(ymax - ymin) + hspan.set_width(n / total) + else: + t_ago = [cur_t - i for i in xdata] + line1.set_data(t_ago, ydata) + line2.set_data(t_ago, zdata) + + d = self.format_dict + # remove {bar} + d['bar_format'] = (d['bar_format'] or "{l_bar}{r_bar}").replace( + "{bar}", "") + msg = self.format_meter(**d) + if '' in msg: + msg = "".join(re.split(r'\|?\|?', msg, maxsplit=1)) + 
ax.set_title(msg, fontname="DejaVu Sans Mono", fontsize=11) + self.plt.pause(1e-9) + + +def tgrange(*args, **kwargs): + """Shortcut for `tqdm.gui.tqdm(range(*args), **kwargs)`.""" + return tqdm_gui(range(*args), **kwargs) + + +# Aliases +tqdm = tqdm_gui +trange = tgrange diff --git a/env/lib/python3.13/site-packages/tqdm/keras.py b/env/lib/python3.13/site-packages/tqdm/keras.py new file mode 100644 index 0000000000000000000000000000000000000000..cce9467c51a95388aaa502d1da9a42f3ebf0af24 --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/keras.py @@ -0,0 +1,122 @@ +from copy import copy +from functools import partial + +from .auto import tqdm as tqdm_auto + +try: + import keras +except (ImportError, AttributeError) as e: + try: + from tensorflow import keras + except ImportError: + raise e +__author__ = {"github.com/": ["casperdcl"]} +__all__ = ['TqdmCallback'] + + +class TqdmCallback(keras.callbacks.Callback): + """Keras callback for epoch and batch progress.""" + @staticmethod + def bar2callback(bar, pop=None, delta=(lambda logs: 1)): + def callback(_, logs=None): + n = delta(logs) + if logs: + if pop: + logs = copy(logs) + [logs.pop(i, 0) for i in pop] + bar.set_postfix(logs, refresh=False) + bar.update(n) + + return callback + + def __init__(self, epochs=None, data_size=None, batch_size=None, verbose=1, + tqdm_class=tqdm_auto, **tqdm_kwargs): + """ + Parameters + ---------- + epochs : int, optional + data_size : int, optional + Number of training pairs. + batch_size : int, optional + Number of training pairs per batch. + verbose : int + 0: epoch, 1: batch (transient), 2: batch. [default: 1]. + Will be set to `0` unless both `data_size` and `batch_size` + are given. + tqdm_class : optional + `tqdm` class to use for bars [default: `tqdm.auto.tqdm`]. + tqdm_kwargs : optional + Any other arguments used for all bars. 
+ """ + if tqdm_kwargs: + tqdm_class = partial(tqdm_class, **tqdm_kwargs) + self.tqdm_class = tqdm_class + self.epoch_bar = tqdm_class(total=epochs, unit='epoch') + self.on_epoch_end = self.bar2callback(self.epoch_bar) + if data_size and batch_size: + self.batches = batches = (data_size + batch_size - 1) // batch_size + else: + self.batches = batches = None + self.verbose = verbose + if verbose == 1: + self.batch_bar = tqdm_class(total=batches, unit='batch', leave=False) + self.on_batch_end = self.bar2callback( + self.batch_bar, pop=['batch', 'size'], + delta=lambda logs: logs.get('size', 1)) + + def on_train_begin(self, *_, **__): + params = self.params.get + auto_total = params('epochs', params('nb_epoch', None)) + if auto_total is not None and auto_total != self.epoch_bar.total: + self.epoch_bar.reset(total=auto_total) + + def on_epoch_begin(self, epoch, *_, **__): + if self.epoch_bar.n < epoch: + ebar = self.epoch_bar + ebar.n = ebar.last_print_n = ebar.initial = epoch + if self.verbose: + params = self.params.get + total = params('samples', params( + 'nb_sample', params('steps', None))) or self.batches + if self.verbose == 2: + if hasattr(self, 'batch_bar'): + self.batch_bar.close() + self.batch_bar = self.tqdm_class( + total=total, unit='batch', leave=True, + unit_scale=1 / (params('batch_size', 1) or 1)) + self.on_batch_end = self.bar2callback( + self.batch_bar, pop=['batch', 'size'], + delta=lambda logs: logs.get('size', 1)) + elif self.verbose == 1: + self.batch_bar.unit_scale = 1 / (params('batch_size', 1) or 1) + self.batch_bar.reset(total=total) + else: + raise KeyError('Unknown verbosity') + + def on_train_end(self, *_, **__): + if hasattr(self, 'batch_bar'): + self.batch_bar.close() + self.epoch_bar.close() + + def display(self): + """Displays in the current cell in Notebooks.""" + container = getattr(self.epoch_bar, 'container', None) + if container is None: + return + from .notebook import display + display(container) + batch_bar = getattr(self, 'batch_bar', None) + if batch_bar is not None: + display(batch_bar.container) + + @staticmethod + def _implements_train_batch_hooks(): + return True + + @staticmethod + def _implements_test_batch_hooks(): + return True + + @staticmethod + def _implements_predict_batch_hooks(): + return True diff --git a/env/lib/python3.13/site-packages/tqdm/notebook.py b/env/lib/python3.13/site-packages/tqdm/notebook.py new file mode 100644 index 0000000000000000000000000000000000000000..77b91bdd43183998fcb99e92dd4597ff7fc6c3fb --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/notebook.py @@ -0,0 +1,317 @@ +""" +IPython/Jupyter Notebook progressbar decorator for iterators. +Includes a default `range` iterator printing to `stderr`. + +Usage: +>>> from tqdm.notebook import trange, tqdm +>>> for i in trange(10): +... ... 
+""" +# import compatibility functions and utilities +import re +import sys +from html import escape +from weakref import proxy + +# to inherit from the tqdm class +from .std import tqdm as std_tqdm + +if True: # pragma: no cover + # import IPython/Jupyter base widget and display utilities + IPY = 0 + try: # IPython 4.x + import ipywidgets + IPY = 4 + except ImportError: # IPython 3.x / 2.x + IPY = 32 + import warnings + with warnings.catch_warnings(): + warnings.filterwarnings( + 'ignore', message=".*The `IPython.html` package has been deprecated.*") + try: + import IPython.html.widgets as ipywidgets # NOQA: F401 + except ImportError: + pass + + try: # IPython 4.x / 3.x + if IPY == 32: + from IPython.html.widgets import HTML + from IPython.html.widgets import FloatProgress as IProgress + from IPython.html.widgets import HBox + IPY = 3 + else: + from ipywidgets import HTML + from ipywidgets import FloatProgress as IProgress + from ipywidgets import HBox + except ImportError: + try: # IPython 2.x + from IPython.html.widgets import HTML + from IPython.html.widgets import ContainerWidget as HBox + from IPython.html.widgets import FloatProgressWidget as IProgress + IPY = 2 + except ImportError: + IPY = 0 + IProgress = None + HBox = object + + try: + from IPython.display import display # , clear_output + except ImportError: + pass + +__author__ = {"github.com/": ["lrq3000", "casperdcl", "alexanderkuk"]} +__all__ = ['tqdm_notebook', 'tnrange', 'tqdm', 'trange'] +WARN_NOIPYW = ("IProgress not found. Please update jupyter and ipywidgets." + " See https://ipywidgets.readthedocs.io/en/stable" + "/user_install.html") + + +class TqdmHBox(HBox): + """`ipywidgets.HBox` with a pretty representation""" + def _json_(self, pretty=None): + pbar = getattr(self, 'pbar', None) + if pbar is None: + return {} + d = pbar.format_dict + if pretty is not None: + d["ascii"] = not pretty + return d + + def __repr__(self, pretty=False): + pbar = getattr(self, 'pbar', None) + if pbar is None: + return super().__repr__() + return pbar.format_meter(**self._json_(pretty)) + + def _repr_pretty_(self, pp, *_, **__): + pp.text(self.__repr__(True)) + + +class tqdm_notebook(std_tqdm): + """ + Experimental IPython/Jupyter Notebook widget using tqdm! + """ + @staticmethod + def status_printer(_, total=None, desc=None, ncols=None): + """ + Manage the printing of an IPython/Jupyter Notebook progress bar widget. + """ + # Fallback to text bar if there's no total + # DEPRECATED: replaced with an 'info' style bar + # if not total: + # return super(tqdm_notebook, tqdm_notebook).status_printer(file) + + # fp = file + + # Prepare IPython progress bar + if IProgress is None: # #187 #451 #558 #872 + raise ImportError(WARN_NOIPYW) + if total: + pbar = IProgress(min=0, max=total) + else: # No total? 
Show info style bar with no progress tqdm status + pbar = IProgress(min=0, max=1) + pbar.value = 1 + pbar.bar_style = 'info' + if ncols is None: + pbar.layout.width = "20px" + + ltext = HTML() + rtext = HTML() + if desc: + ltext.value = desc + container = TqdmHBox(children=[ltext, pbar, rtext]) + # Prepare layout + if ncols is not None: # use default style of ipywidgets + # ncols could be 100, "100px", "100%" + ncols = str(ncols) # ipywidgets only accepts string + try: + if int(ncols) > 0: # isnumeric and positive + ncols += 'px' + except ValueError: + pass + pbar.layout.flex = '2' + container.layout.width = ncols + container.layout.display = 'inline-flex' + container.layout.flex_flow = 'row wrap' + + return container + + def display(self, msg=None, pos=None, + # additional signals + close=False, bar_style=None, check_delay=True): + # Note: contrary to native tqdm, msg='' does NOT clear bar + # goal is to keep all infos if error happens so user knows + # at which iteration the loop failed. + + # Clear previous output (really necessary?) + # clear_output(wait=1) + + if not msg and not close: + d = self.format_dict + # remove {bar} + d['bar_format'] = (d['bar_format'] or "{l_bar}{r_bar}").replace( + "{bar}", "") + msg = self.format_meter(**d) + + ltext, pbar, rtext = self.container.children + pbar.value = self.n + + if msg: + msg = msg.replace(' ', u'\u2007') # fix html space padding + # html escape special characters (like '&') + if '' in msg: + left, right = map(escape, re.split(r'\|?\|?', msg, maxsplit=1)) + else: + left, right = '', escape(msg) + + # Update description + ltext.value = left + # never clear the bar (signal: msg='') + if right: + rtext.value = right + + # Change bar style + if bar_style: + # Hack-ish way to avoid the danger bar_style being overridden by + # success because the bar gets closed after the error... + if pbar.bar_style != 'danger' or bar_style != 'success': + pbar.bar_style = bar_style + + # Special signal to close the bar + if close and pbar.bar_style != 'danger': # hide only if no error + try: + self.container.close() + except AttributeError: + self.container.visible = False + self.container.layout.visibility = 'hidden' # IPYW>=8 + + if check_delay and self.delay > 0 and not self.displayed: + display(self.container) + self.displayed = True + + @property + def colour(self): + if hasattr(self, 'container'): + return self.container.children[-2].style.bar_color + + @colour.setter + def colour(self, bar_color): + if hasattr(self, 'container'): + self.container.children[-2].style.bar_color = bar_color + + def __init__(self, *args, **kwargs): + """ + Supports the usual `tqdm.tqdm` parameters as well as those listed below. + + Parameters + ---------- + display : Whether to call `display(self.container)` immediately + [default: True]. 
+ """ + kwargs = kwargs.copy() + # Setup default output + file_kwarg = kwargs.get('file', sys.stderr) + if file_kwarg is sys.stderr or file_kwarg is None: + kwargs['file'] = sys.stdout # avoid the red block in IPython + + # Initialize parent class + avoid printing by using gui=True + kwargs['gui'] = True + # convert disable = None to False + kwargs['disable'] = bool(kwargs.get('disable', False)) + colour = kwargs.pop('colour', None) + display_here = kwargs.pop('display', True) + super().__init__(*args, **kwargs) + if self.disable or not kwargs['gui']: + self.disp = lambda *_, **__: None + return + + # Get bar width + self.ncols = '100%' if self.dynamic_ncols else kwargs.get("ncols", None) + + # Replace with IPython progress bar display (with correct total) + unit_scale = 1 if self.unit_scale is True else self.unit_scale or 1 + total = self.total * unit_scale if self.total else self.total + self.container = self.status_printer(self.fp, total, self.desc, self.ncols) + self.container.pbar = proxy(self) + self.displayed = False + if display_here and self.delay <= 0: + display(self.container) + self.displayed = True + self.disp = self.display + self.colour = colour + + # Print initial bar state + if not self.disable: + self.display(check_delay=False) + + def __iter__(self): + try: + it = super().__iter__() + for obj in it: + # return super(tqdm...) will not catch exception + yield obj + # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt + except: # NOQA + self.disp(bar_style='danger') + raise + # NB: don't `finally: close()` + # since this could be a shared bar which the user will `reset()` + + def update(self, n=1): + try: + return super().update(n=n) + # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt + except: # NOQA + # cannot catch KeyboardInterrupt when using manual tqdm + # as the interrupt will most likely happen on another statement + self.disp(bar_style='danger') + raise + # NB: don't `finally: close()` + # since this could be a shared bar which the user will `reset()` + + def close(self): + if self.disable: + return + super().close() + # Try to detect if there was an error or KeyboardInterrupt + # in manual mode: if n < total, things probably got wrong + if self.total and self.n < self.total: + self.disp(bar_style='danger', check_delay=False) + else: + if self.leave: + self.disp(bar_style='success', check_delay=False) + else: + self.disp(close=True, check_delay=False) + + def clear(self, *_, **__): + pass + + def reset(self, total=None): + """ + Resets to 0 iterations for repeated use. + + Consider combining with `leave=True`. + + Parameters + ---------- + total : int or float, optional. Total to use for the new bar. 
+ """ + if self.disable: + return super().reset(total=total) + _, pbar, _ = self.container.children + pbar.bar_style = '' + if total is not None: + pbar.max = total + if not self.total and self.ncols is None: # no longer unknown total + pbar.layout.width = None # reset width + return super().reset(total=total) + + +def tnrange(*args, **kwargs): + """Shortcut for `tqdm.notebook.tqdm(range(*args), **kwargs)`.""" + return tqdm_notebook(range(*args), **kwargs) + + +# Aliases +tqdm = tqdm_notebook +trange = tnrange diff --git a/env/lib/python3.13/site-packages/tqdm/rich.py b/env/lib/python3.13/site-packages/tqdm/rich.py new file mode 100644 index 0000000000000000000000000000000000000000..3d392edaf115a93f7c145de52cbe8978dcf1ede8 --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/rich.py @@ -0,0 +1,151 @@ +""" +`rich.progress` decorator for iterators. + +Usage: +>>> from tqdm.rich import trange, tqdm +>>> for i in trange(10): +... ... +""" +from warnings import warn + +from rich.progress import ( + BarColumn, Progress, ProgressColumn, Text, TimeElapsedColumn, TimeRemainingColumn, filesize) + +from .std import TqdmExperimentalWarning +from .std import tqdm as std_tqdm + +__author__ = {"github.com/": ["casperdcl"]} +__all__ = ['tqdm_rich', 'trrange', 'tqdm', 'trange'] + + +class FractionColumn(ProgressColumn): + """Renders completed/total, e.g. '0.5/2.3 G'.""" + def __init__(self, unit_scale=False, unit_divisor=1000): + self.unit_scale = unit_scale + self.unit_divisor = unit_divisor + super().__init__() + + def render(self, task): + """Calculate common unit for completed and total.""" + completed = int(task.completed) + total = int(task.total) + if self.unit_scale: + unit, suffix = filesize.pick_unit_and_suffix( + total, + ["", "K", "M", "G", "T", "P", "E", "Z", "Y"], + self.unit_divisor, + ) + else: + unit, suffix = filesize.pick_unit_and_suffix(total, [""], 1) + precision = 0 if unit == 1 else 1 + return Text( + f"{completed/unit:,.{precision}f}/{total/unit:,.{precision}f} {suffix}", + style="progress.download") + + +class RateColumn(ProgressColumn): + """Renders human readable transfer speed.""" + def __init__(self, unit="", unit_scale=False, unit_divisor=1000): + self.unit = unit + self.unit_scale = unit_scale + self.unit_divisor = unit_divisor + super().__init__() + + def render(self, task): + """Show data transfer speed.""" + speed = task.speed + if speed is None: + return Text(f"? {self.unit}/s", style="progress.data.speed") + if self.unit_scale: + unit, suffix = filesize.pick_unit_and_suffix( + speed, + ["", "K", "M", "G", "T", "P", "E", "Z", "Y"], + self.unit_divisor, + ) + else: + unit, suffix = filesize.pick_unit_and_suffix(speed, [""], 1) + precision = 0 if unit == 1 else 1 + return Text(f"{speed/unit:,.{precision}f} {suffix}{self.unit}/s", + style="progress.data.speed") + + +class tqdm_rich(std_tqdm): # pragma: no cover + """Experimental rich.progress GUI version of tqdm!""" + # TODO: @classmethod: write()? + def __init__(self, *args, **kwargs): + """ + This class accepts the following parameters *in addition* to + the parameters accepted by `tqdm`. + + Parameters + ---------- + progress : tuple, optional + arguments for `rich.progress.Progress()`. + options : dict, optional + keyword arguments for `rich.progress.Progress()`. 
+ """ + kwargs = kwargs.copy() + kwargs['gui'] = True + # convert disable = None to False + kwargs['disable'] = bool(kwargs.get('disable', False)) + progress = kwargs.pop('progress', None) + options = kwargs.pop('options', {}).copy() + super().__init__(*args, **kwargs) + + if self.disable: + return + + warn("rich is experimental/alpha", TqdmExperimentalWarning, stacklevel=2) + d = self.format_dict + if progress is None: + progress = ( + "[progress.description]{task.description}" + "[progress.percentage]{task.percentage:>4.0f}%", + BarColumn(bar_width=None), + FractionColumn( + unit_scale=d['unit_scale'], unit_divisor=d['unit_divisor']), + "[", TimeElapsedColumn(), "<", TimeRemainingColumn(), + ",", RateColumn(unit=d['unit'], unit_scale=d['unit_scale'], + unit_divisor=d['unit_divisor']), "]" + ) + options.setdefault('transient', not self.leave) + self._prog = Progress(*progress, **options) + self._prog.__enter__() + self._task_id = self._prog.add_task(self.desc or "", **d) + + def close(self): + if self.disable: + return + self.display() # print 100%, vis #1306 + super().close() + self._prog.__exit__(None, None, None) + + def clear(self, *_, **__): + pass + + def display(self, *_, **__): + if not hasattr(self, '_prog'): + return + self._prog.update(self._task_id, completed=self.n, description=self.desc) + + def reset(self, total=None): + """ + Resets to 0 iterations for repeated use. + + Parameters + ---------- + total : int or float, optional. Total to use for the new bar. + """ + if hasattr(self, '_prog'): + self._prog.reset(total=total) + super().reset(total=total) + + +def trrange(*args, **kwargs): + """Shortcut for `tqdm.rich.tqdm(range(*args), **kwargs)`.""" + return tqdm_rich(range(*args), **kwargs) + + +# Aliases +tqdm = tqdm_rich +trange = trrange diff --git a/env/lib/python3.13/site-packages/tqdm/std.py b/env/lib/python3.13/site-packages/tqdm/std.py new file mode 100644 index 0000000000000000000000000000000000000000..e91ad3090392916fc2bc1e34bc471e43212fe699 --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/std.py @@ -0,0 +1,1524 @@ +""" +Customisable progressbar decorator for iterators. +Includes a default `range` iterator printing to `stderr`. + +Usage: +>>> from tqdm import trange, tqdm +>>> for i in trange(10): +... ... +""" +import sys +from collections import OrderedDict, defaultdict +from contextlib import contextmanager +from datetime import datetime, timedelta, timezone +from numbers import Number +from time import time +from warnings import warn +from weakref import WeakSet + +from ._monitor import TMonitor +from .utils import ( + CallbackIOWrapper, Comparable, DisableOnWriteError, FormatReplace, SimpleTextIOWrapper, + _is_ascii, _screen_shape_wrapper, _supports_unicode, _term_move_up, disp_len, disp_trim, + envwrap) + +__author__ = "https://github.com/tqdm/tqdm#contributions" +__all__ = ['tqdm', 'trange', + 'TqdmTypeError', 'TqdmKeyError', 'TqdmWarning', + 'TqdmExperimentalWarning', 'TqdmDeprecationWarning', + 'TqdmMonitorWarning'] + + +class TqdmTypeError(TypeError): + pass + + +class TqdmKeyError(KeyError): + pass + + +class TqdmWarning(Warning): + """base class for all tqdm warnings. + + Used for non-external-code-breaking errors, such as garbled printing. 
+ """ + def __init__(self, msg, fp_write=None, *a, **k): + if fp_write is not None: + fp_write("\n" + self.__class__.__name__ + ": " + str(msg).rstrip() + '\n') + else: + super().__init__(msg, *a, **k) + + +class TqdmExperimentalWarning(TqdmWarning, FutureWarning): + """beta feature, unstable API and behaviour""" + pass + + +class TqdmDeprecationWarning(TqdmWarning, DeprecationWarning): + # not suppressed if raised + pass + + +class TqdmMonitorWarning(TqdmWarning, RuntimeWarning): + """tqdm monitor errors which do not affect external functionality""" + pass + + +def TRLock(*args, **kwargs): + """threading RLock""" + try: + from threading import RLock + return RLock(*args, **kwargs) + except (ImportError, OSError): # pragma: no cover + pass + + +class TqdmDefaultWriteLock(object): + """ + Provide a default write lock for thread and multiprocessing safety. + Works only on platforms supporting `fork` (so Windows is excluded). + You must initialise a `tqdm` or `TqdmDefaultWriteLock` instance + before forking in order for the write lock to work. + On Windows, you need to supply the lock from the parent to the children as + an argument to joblib or the parallelism lib you use. + """ + # global thread lock so no setup required for multithreading. + # NB: Do not create multiprocessing lock as it sets the multiprocessing + # context, disallowing `spawn()`/`forkserver()` + th_lock = TRLock() + + def __init__(self): + # Create global parallelism locks to avoid racing issues with parallel + # bars works only if fork available (Linux/MacOSX, but not Windows) + cls = type(self) + root_lock = cls.th_lock + if root_lock is not None: + root_lock.acquire() + cls.create_mp_lock() + self.locks = [lk for lk in [cls.mp_lock, cls.th_lock] if lk is not None] + if root_lock is not None: + root_lock.release() + + def acquire(self, *a, **k): + for lock in self.locks: + lock.acquire(*a, **k) + + def release(self): + for lock in self.locks[::-1]: # Release in inverse order of acquisition + lock.release() + + def __enter__(self): + self.acquire() + + def __exit__(self, *exc): + self.release() + + @classmethod + def create_mp_lock(cls): + if not hasattr(cls, 'mp_lock'): + try: + from multiprocessing import RLock + cls.mp_lock = RLock() + except (ImportError, OSError): # pragma: no cover + cls.mp_lock = None + + @classmethod + def create_th_lock(cls): + assert hasattr(cls, 'th_lock') + warn("create_th_lock not needed anymore", TqdmDeprecationWarning, stacklevel=2) + + +class Bar(object): + """ + `str.format`-able bar with format specifiers: `[width][type]` + + - `width` + + unspecified (default): use `self.default_len` + + `int >= 0`: overrides `self.default_len` + + `int < 0`: subtract from `self.default_len` + - `type` + + `a`: ascii (`charset=self.ASCII` override) + + `u`: unicode (`charset=self.UTF` override) + + `b`: blank (`charset=" "` override) + """ + ASCII = " 123456789#" + UTF = u" " + u''.join(map(chr, range(0x258F, 0x2587, -1))) + BLANK = " " + COLOUR_RESET = '\x1b[0m' + COLOUR_RGB = '\x1b[38;2;%d;%d;%dm' + COLOURS = {'BLACK': '\x1b[30m', 'RED': '\x1b[31m', 'GREEN': '\x1b[32m', + 'YELLOW': '\x1b[33m', 'BLUE': '\x1b[34m', 'MAGENTA': '\x1b[35m', + 'CYAN': '\x1b[36m', 'WHITE': '\x1b[37m'} + + def __init__(self, frac, default_len=10, charset=UTF, colour=None): + if not 0 <= frac <= 1: + warn("clamping frac to range [0, 1]", TqdmWarning, stacklevel=2) + frac = max(0, min(1, frac)) + assert default_len > 0 + self.frac = frac + self.default_len = default_len + self.charset = charset + self.colour = colour + + 
@property + def colour(self): + return self._colour + + @colour.setter + def colour(self, value): + if not value: + self._colour = None + return + try: + if value.upper() in self.COLOURS: + self._colour = self.COLOURS[value.upper()] + elif value[0] == '#' and len(value) == 7: + self._colour = self.COLOUR_RGB % tuple( + int(i, 16) for i in (value[1:3], value[3:5], value[5:7])) + else: + raise KeyError + except (KeyError, AttributeError): + warn("Unknown colour (%s); valid choices: [hex (#00ff00), %s]" % ( + value, ", ".join(self.COLOURS)), + TqdmWarning, stacklevel=2) + self._colour = None + + def __format__(self, format_spec): + if format_spec: + _type = format_spec[-1].lower() + try: + charset = {'a': self.ASCII, 'u': self.UTF, 'b': self.BLANK}[_type] + except KeyError: + charset = self.charset + else: + format_spec = format_spec[:-1] + if format_spec: + N_BARS = int(format_spec) + if N_BARS < 0: + N_BARS += self.default_len + else: + N_BARS = self.default_len + else: + charset = self.charset + N_BARS = self.default_len + + nsyms = len(charset) - 1 + bar_length, frac_bar_length = divmod(int(self.frac * N_BARS * nsyms), nsyms) + + res = charset[-1] * bar_length + if bar_length < N_BARS: # whitespace padding + res = res + charset[frac_bar_length] + charset[0] * (N_BARS - bar_length - 1) + return self.colour + res + self.COLOUR_RESET if self.colour else res + + +class EMA(object): + """ + Exponential moving average: smoothing to give progressively lower + weights to older values. + + Parameters + ---------- + smoothing : float, optional + Smoothing factor in range [0, 1], [default: 0.3]. + Increase to give more weight to recent values. + Ranges from 0 (yields old value) to 1 (yields new value). + """ + def __init__(self, smoothing=0.3): + self.alpha = smoothing + self.last = 0 + self.calls = 0 + + def __call__(self, x=None): + """ + Parameters + ---------- + x : float + New value to include in EMA. + """ + beta = 1 - self.alpha + if x is not None: + self.last = self.alpha * x + beta * self.last + self.calls += 1 + return self.last / (1 - beta ** self.calls) if self.calls else self.last + + +class tqdm(Comparable): + """ + Decorate an iterable object, returning an iterator which acts exactly + like the original iterable, but prints a dynamically updating + progressbar every time a value is requested. + + Parameters + ---------- + iterable : iterable, optional + Iterable to decorate with a progressbar. + Leave blank to manually manage the updates. + desc : str, optional + Prefix for the progressbar. + total : int or float, optional + The number of expected iterations. If unspecified, + len(iterable) is used if possible. If float("inf") or as a last + resort, only basic progress statistics are displayed + (no ETA, no progressbar). + If `gui` is True and this parameter needs subsequent updating, + specify an initial arbitrary large positive number, + e.g. 9e9. + leave : bool, optional + If [default: True], keeps all traces of the progressbar + upon termination of iteration. + If `None`, will leave only if `position` is `0`. + file : `io.TextIOWrapper` or `io.StringIO`, optional + Specifies where to output the progress messages + (default: sys.stderr). Uses `file.write(str)` and `file.flush()` + methods. For encoding, see `write_bytes`. + ncols : int, optional + The width of the entire output message. If specified, + dynamically resizes the progressbar to stay within this bound. + If unspecified, attempts to use environment width. 
The + fallback is a meter width of 10 and no limit for the counter and + statistics. If 0, will not print any meter (only stats). + mininterval : float, optional + Minimum progress display update interval [default: 0.1] seconds. + maxinterval : float, optional + Maximum progress display update interval [default: 10] seconds. + Automatically adjusts `miniters` to correspond to `mininterval` + after long display update lag. Only works if `dynamic_miniters` + or monitor thread is enabled. + miniters : int or float, optional + Minimum progress display update interval, in iterations. + If 0 and `dynamic_miniters`, will automatically adjust to equal + `mininterval` (more CPU efficient, good for tight loops). + If > 0, will skip display of specified number of iterations. + Tweak this and `mininterval` to get very efficient loops. + If your progress is erratic with both fast and slow iterations + (network, skipping items, etc) you should set miniters=1. + ascii : bool or str, optional + If unspecified or False, use unicode (smooth blocks) to fill + the meter. The fallback is to use ASCII characters " 123456789#". + disable : bool, optional + Whether to disable the entire progressbar wrapper + [default: False]. If set to None, disable on non-TTY. + unit : str, optional + String that will be used to define the unit of each iteration + [default: it]. + unit_scale : bool or int or float, optional + If 1 or True, the number of iterations will be reduced/scaled + automatically and a metric prefix following the + International System of Units standard will be added + (kilo, mega, etc.) [default: False]. If any other non-zero + number, will scale `total` and `n`. + dynamic_ncols : bool, optional + If set, constantly alters `ncols` and `nrows` to the + environment (allowing for window resizes) [default: False]. + smoothing : float, optional + Exponential moving average smoothing factor for speed estimates + (ignored in GUI mode). Ranges from 0 (average speed) to 1 + (current/instantaneous speed) [default: 0.3]. + bar_format : str, optional + Specify a custom bar string formatting. May impact performance. + [default: '{l_bar}{bar}{r_bar}'], where + l_bar='{desc}: {percentage:3.0f}%|' and + r_bar='| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, ' + '{rate_fmt}{postfix}]' + Possible vars: l_bar, bar, r_bar, n, n_fmt, total, total_fmt, + percentage, elapsed, elapsed_s, ncols, nrows, desc, unit, + rate, rate_fmt, rate_noinv, rate_noinv_fmt, + rate_inv, rate_inv_fmt, postfix, unit_divisor, + remaining, remaining_s, eta. + Note that a trailing ": " is automatically removed after {desc} + if the latter is empty. + initial : int or float, optional + The initial counter value. Useful when restarting a progress + bar [default: 0]. If using float, consider specifying `{n:.3f}` + or similar in `bar_format`, or specifying `unit_scale`. + position : int, optional + Specify the line offset to print this bar (starting from 0) + Automatic if unspecified. + Useful to manage multiple bars at once (eg, from threads). + postfix : dict or *, optional + Specify additional stats to display at the end of the bar. + Calls `set_postfix(**postfix)` if possible (dict). + unit_divisor : float, optional + [default: 1000], ignored unless `unit_scale` is True. + write_bytes : bool, optional + Whether to write bytes. If (default: False) will write unicode. + lock_args : tuple, optional + Passed to `refresh` for intermediate output + (initialisation, iterating, and updating). + nrows : int, optional + The screen height. 
If specified, hides nested bars outside this + bound. If unspecified, attempts to use environment height. + The fallback is 20. + colour : str, optional + Bar colour (e.g. 'green', '#00ff00'). + delay : float, optional + Don't display until [default: 0] seconds have elapsed. + gui : bool, optional + WARNING: internal parameter - do not use. + Use tqdm.gui.tqdm(...) instead. If set, will attempt to use + matplotlib animations for a graphical output [default: False]. + + Returns + ------- + out : decorated iterator. + """ + + monitor_interval = 10 # set to 0 to disable the thread + monitor = None + _instances = WeakSet() + + @staticmethod + def format_sizeof(num, suffix='', divisor=1000): + """ + Formats a number (greater than unity) with SI Order of Magnitude + prefixes. + + Parameters + ---------- + num : float + Number ( >= 1) to format. + suffix : str, optional + Post-postfix [default: '']. + divisor : float, optional + Divisor between prefixes [default: 1000]. + + Returns + ------- + out : str + Number with Order of Magnitude SI unit postfix. + """ + for unit in ['', 'k', 'M', 'G', 'T', 'P', 'E', 'Z']: + if abs(num) < 999.5: + if abs(num) < 99.95: + if abs(num) < 9.995: + return f'{num:1.2f}{unit}{suffix}' + return f'{num:2.1f}{unit}{suffix}' + return f'{num:3.0f}{unit}{suffix}' + num /= divisor + return f'{num:3.1f}Y{suffix}' + + @staticmethod + def format_interval(t): + """ + Formats a number of seconds as a clock time, [H:]MM:SS + + Parameters + ---------- + t : int + Number of seconds. + + Returns + ------- + out : str + [H:]MM:SS + """ + mins, s = divmod(int(t), 60) + h, m = divmod(mins, 60) + return f'{h:d}:{m:02d}:{s:02d}' if h else f'{m:02d}:{s:02d}' + + @staticmethod + def format_num(n): + """ + Intelligent scientific notation (.3g). + + Parameters + ---------- + n : int or float or Numeric + A Number. + + Returns + ------- + out : str + Formatted number. + """ + f = f'{n:.3g}'.replace('e+0', 'e+').replace('e-0', 'e-') + n = str(n) + return f if len(f) < len(n) else n + + @staticmethod + def status_printer(file): + """ + Manage the printing and in-place updating of a line of characters. + Note that if the string is longer than a line, then in-place + updating may not work (it will print a new line at each refresh). + """ + fp = file + fp_flush = getattr(fp, 'flush', lambda: None) # pragma: no cover + if fp in (sys.stderr, sys.stdout): + getattr(sys.stderr, 'flush', lambda: None)() + getattr(sys.stdout, 'flush', lambda: None)() + + def fp_write(s): + fp.write(str(s)) + fp_flush() + + last_len = [0] + + def print_status(s): + len_s = disp_len(s) + fp_write('\r' + s + (' ' * max(last_len[0] - len_s, 0))) + last_len[0] = len_s + + return print_status + + @staticmethod + def format_meter(n, total, elapsed, ncols=None, prefix='', ascii=False, unit='it', + unit_scale=False, rate=None, bar_format=None, postfix=None, + unit_divisor=1000, initial=0, colour=None, **extra_kwargs): + """ + Return a string-based progress bar given some parameters + + Parameters + ---------- + n : int or float + Number of finished iterations. + total : int or float + The expected total number of iterations. If meaningless (None), + only basic progress statistics are displayed (no ETA). + elapsed : float + Number of seconds passed since start. + ncols : int, optional + The width of the entire output message. If specified, + dynamically resizes `{bar}` to stay within this bound + [default: None]. If `0`, will not print any bar (only stats). + The fallback is `{bar:10}`. 
+ prefix : str, optional + Prefix message (included in total width) [default: '']. + Use as {desc} in bar_format string. + ascii : bool, optional or str, optional + If not set, use unicode (smooth blocks) to fill the meter + [default: False]. The fallback is to use ASCII characters + " 123456789#". + unit : str, optional + The iteration unit [default: 'it']. + unit_scale : bool or int or float, optional + If 1 or True, the number of iterations will be printed with an + appropriate SI metric prefix (k = 10^3, M = 10^6, etc.) + [default: False]. If any other non-zero number, will scale + `total` and `n`. + rate : float, optional + Manual override for iteration rate. + If [default: None], uses n/elapsed. + bar_format : str, optional + Specify a custom bar string formatting. May impact performance. + [default: '{l_bar}{bar}{r_bar}'], where + l_bar='{desc}: {percentage:3.0f}%|' and + r_bar='| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, ' + '{rate_fmt}{postfix}]' + Possible vars: l_bar, bar, r_bar, n, n_fmt, total, total_fmt, + percentage, elapsed, elapsed_s, ncols, nrows, desc, unit, + rate, rate_fmt, rate_noinv, rate_noinv_fmt, + rate_inv, rate_inv_fmt, postfix, unit_divisor, + remaining, remaining_s, eta. + Note that a trailing ": " is automatically removed after {desc} + if the latter is empty. + postfix : *, optional + Similar to `prefix`, but placed at the end + (e.g. for additional stats). + Note: postfix is usually a string (not a dict) for this method, + and will if possible be set to postfix = ', ' + postfix. + However other types are supported (#382). + unit_divisor : float, optional + [default: 1000], ignored unless `unit_scale` is True. + initial : int or float, optional + The initial counter value [default: 0]. + colour : str, optional + Bar colour (e.g. 'green', '#00ff00'). + + Returns + ------- + out : Formatted meter and stats, ready to display. + """ + + # sanity check: total + if total and n >= (total + 0.5): # allow float imprecision (#849) + total = None + + # apply custom scale if necessary + if unit_scale and unit_scale not in (True, 1): + if total: + total *= unit_scale + n *= unit_scale + if rate: + rate *= unit_scale # by default rate = self.avg_dn / self.avg_dt + unit_scale = False + + elapsed_str = tqdm.format_interval(elapsed) + + # if unspecified, attempt to use rate = average speed + # (we allow manual override since predicting time is an arcane art) + if rate is None and elapsed: + rate = (n - initial) / elapsed + inv_rate = 1 / rate if rate else None + format_sizeof = tqdm.format_sizeof + rate_noinv_fmt = ((format_sizeof(rate) if unit_scale else f'{rate:5.2f}') + if rate else '?') + unit + '/s' + rate_inv_fmt = ( + (format_sizeof(inv_rate) if unit_scale else f'{inv_rate:5.2f}') + if inv_rate else '?') + 's/' + unit + rate_fmt = rate_inv_fmt if inv_rate and inv_rate > 1 else rate_noinv_fmt + + if unit_scale: + n_fmt = format_sizeof(n, divisor=unit_divisor) + total_fmt = format_sizeof(total, divisor=unit_divisor) if total is not None else '?' + else: + n_fmt = str(n) + total_fmt = str(total) if total is not None else '?' + + try: + postfix = ', ' + postfix if postfix else '' + except TypeError: + pass + + remaining = (total - n) / rate if rate and total else 0 + remaining_str = tqdm.format_interval(remaining) if rate else '?' 
+ try: + eta_dt = (datetime.now() + timedelta(seconds=remaining) + if rate and total else datetime.fromtimestamp(0, timezone.utc)) + except OverflowError: + eta_dt = datetime.max + + # format the stats displayed to the left and right sides of the bar + if prefix: + # old prefix setup work around + bool_prefix_colon_already = (prefix[-2:] == ": ") + l_bar = prefix if bool_prefix_colon_already else prefix + ": " + else: + l_bar = '' + + r_bar = f'| {n_fmt}/{total_fmt} [{elapsed_str}<{remaining_str}, {rate_fmt}{postfix}]' + + # Custom bar formatting + # Populate a dict with all available progress indicators + format_dict = { + # slight extension of self.format_dict + 'n': n, 'n_fmt': n_fmt, 'total': total, 'total_fmt': total_fmt, + 'elapsed': elapsed_str, 'elapsed_s': elapsed, + 'ncols': ncols, 'desc': prefix or '', 'unit': unit, + 'rate': inv_rate if inv_rate and inv_rate > 1 else rate, + 'rate_fmt': rate_fmt, 'rate_noinv': rate, + 'rate_noinv_fmt': rate_noinv_fmt, 'rate_inv': inv_rate, + 'rate_inv_fmt': rate_inv_fmt, + 'postfix': postfix, 'unit_divisor': unit_divisor, + 'colour': colour, + # plus more useful definitions + 'remaining': remaining_str, 'remaining_s': remaining, + 'l_bar': l_bar, 'r_bar': r_bar, 'eta': eta_dt, + **extra_kwargs} + + # total is known: we can predict some stats + if total: + # fractional and percentage progress + frac = n / total + percentage = frac * 100 + + l_bar += f'{percentage:3.0f}%|' + + if ncols == 0: + return l_bar[:-1] + r_bar[1:] + + format_dict.update(l_bar=l_bar) + if bar_format: + format_dict.update(percentage=percentage) + + # auto-remove colon for empty `{desc}` + if not prefix: + bar_format = bar_format.replace("{desc}: ", '') + else: + bar_format = "{l_bar}{bar}{r_bar}" + + full_bar = FormatReplace() + nobar = bar_format.format(bar=full_bar, **format_dict) + if not full_bar.format_called: + return nobar # no `{bar}`; nothing else to do + + # Formatting progress bar space available for bar's display + full_bar = Bar(frac, + max(1, ncols - disp_len(nobar)) if ncols else 10, + charset=Bar.ASCII if ascii is True else ascii or Bar.UTF, + colour=colour) + if not _is_ascii(full_bar.charset) and _is_ascii(bar_format): + bar_format = str(bar_format) + res = bar_format.format(bar=full_bar, **format_dict) + return disp_trim(res, ncols) if ncols else res + + elif bar_format: + # user-specified bar_format but no total + l_bar += '|' + format_dict.update(l_bar=l_bar, percentage=0) + full_bar = FormatReplace() + nobar = bar_format.format(bar=full_bar, **format_dict) + if not full_bar.format_called: + return nobar + full_bar = Bar(0, + max(1, ncols - disp_len(nobar)) if ncols else 10, + charset=Bar.BLANK, colour=colour) + res = bar_format.format(bar=full_bar, **format_dict) + return disp_trim(res, ncols) if ncols else res + else: + # no total: no progressbar, ETA, just progress stats + return (f'{(prefix + ": ") if prefix else ""}' + f'{n_fmt}{unit} [{elapsed_str}, {rate_fmt}{postfix}]') + + def __new__(cls, *_, **__): + instance = object.__new__(cls) + with cls.get_lock(): # also constructs lock if non-existent + cls._instances.add(instance) + # create monitoring thread + if cls.monitor_interval and (cls.monitor is None + or not cls.monitor.report()): + try: + cls.monitor = TMonitor(cls, cls.monitor_interval) + except Exception as e: # pragma: nocover + warn("tqdm:disabling monitor support" + " (monitor_interval = 0) due to:\n" + str(e), + TqdmMonitorWarning, stacklevel=2) + cls.monitor_interval = 0 + return instance + + @classmethod + def _get_free_pos(cls, 
instance=None): + """Skips specified instance.""" + positions = {abs(inst.pos) for inst in cls._instances + if inst is not instance and hasattr(inst, "pos")} + return min(set(range(len(positions) + 1)).difference(positions)) + + @classmethod + def _decr_instances(cls, instance): + """ + Remove from list and reposition another unfixed bar + to fill the new gap. + + This means that by default (where all nested bars are unfixed), + order is not maintained but screen flicker/blank space is minimised. + (tqdm<=4.44.1 moved ALL subsequent unfixed bars up.) + """ + with cls._lock: + try: + cls._instances.remove(instance) + except KeyError: + # if not instance.gui: # pragma: no cover + # raise + pass # py2: maybe magically removed already + # else: + if not instance.gui: + last = (instance.nrows or 20) - 1 + # find unfixed (`pos >= 0`) overflow (`pos >= nrows - 1`) + instances = list(filter( + lambda i: hasattr(i, "pos") and last <= i.pos, + cls._instances)) + # set first found to current `pos` + if instances: + inst = min(instances, key=lambda i: i.pos) + inst.clear(nolock=True) + inst.pos = abs(instance.pos) + + @classmethod + def write(cls, s, file=None, end="\n", nolock=False): + """Print a message via tqdm (without overlap with bars).""" + fp = file if file is not None else sys.stdout + with cls.external_write_mode(file=file, nolock=nolock): + # Write the message + fp.write(s) + fp.write(end) + + @classmethod + @contextmanager + def external_write_mode(cls, file=None, nolock=False): + """ + Disable tqdm within context and refresh tqdm when exits. + Useful when writing to standard output stream + """ + fp = file if file is not None else sys.stdout + + try: + if not nolock: + cls.get_lock().acquire() + # Clear all bars + inst_cleared = [] + for inst in getattr(cls, '_instances', []): + # Clear instance if in the target output file + # or if write output + tqdm output are both either + # sys.stdout or sys.stderr (because both are mixed in terminal) + if hasattr(inst, "start_t") and (inst.fp == fp or all( + f in (sys.stdout, sys.stderr) for f in (fp, inst.fp))): + inst.clear(nolock=True) + inst_cleared.append(inst) + yield + # Force refresh display of bars we cleared + for inst in inst_cleared: + inst.refresh(nolock=True) + finally: + if not nolock: + cls._lock.release() + + @classmethod + def set_lock(cls, lock): + """Set the global lock.""" + cls._lock = lock + + @classmethod + def get_lock(cls): + """Get the global lock. Construct it if it does not exist.""" + if not hasattr(cls, '_lock'): + cls._lock = TqdmDefaultWriteLock() + return cls._lock + + @classmethod + def pandas(cls, **tqdm_kwargs): + """ + Registers the current `tqdm` class with + pandas.core. + ( frame.DataFrame + | series.Series + | groupby.(generic.)DataFrameGroupBy + | groupby.(generic.)SeriesGroupBy + ).progress_apply + + A new instance will be created every time `progress_apply` is called, + and each instance will automatically `close()` upon completion. 
+ + Parameters + ---------- + tqdm_kwargs : arguments for the tqdm instance + + Examples + -------- + >>> import pandas as pd + >>> import numpy as np + >>> from tqdm import tqdm + >>> from tqdm.gui import tqdm as tqdm_gui + >>> + >>> df = pd.DataFrame(np.random.randint(0, 100, (100000, 6))) + >>> tqdm.pandas(ncols=50) # can use tqdm_gui, optional kwargs, etc + >>> # Now you can use `progress_apply` instead of `apply` + >>> df.groupby(0).progress_apply(lambda x: x**2) + + References + ---------- + + """ + from warnings import catch_warnings, simplefilter + + from pandas.core.frame import DataFrame + from pandas.core.series import Series + try: + with catch_warnings(): + simplefilter("ignore", category=FutureWarning) + from pandas import Panel + except ImportError: # pandas>=1.2.0 + Panel = None + Rolling, Expanding = None, None + try: # pandas>=1.0.0 + from pandas.core.window.rolling import _Rolling_and_Expanding + except ImportError: + try: # pandas>=0.18.0 + from pandas.core.window import _Rolling_and_Expanding + except ImportError: # pandas>=1.2.0 + try: # pandas>=1.2.0 + from pandas.core.window.expanding import Expanding + from pandas.core.window.rolling import Rolling + _Rolling_and_Expanding = Rolling, Expanding + except ImportError: # pragma: no cover + _Rolling_and_Expanding = None + try: # pandas>=0.25.0 + from pandas.core.groupby.generic import SeriesGroupBy # , NDFrameGroupBy + from pandas.core.groupby.generic import DataFrameGroupBy + except ImportError: # pragma: no cover + try: # pandas>=0.23.0 + from pandas.core.groupby.groupby import DataFrameGroupBy, SeriesGroupBy + except ImportError: + from pandas.core.groupby import DataFrameGroupBy, SeriesGroupBy + try: # pandas>=0.23.0 + from pandas.core.groupby.groupby import GroupBy + except ImportError: # pragma: no cover + from pandas.core.groupby import GroupBy + + try: # pandas>=0.23.0 + from pandas.core.groupby.groupby import PanelGroupBy + except ImportError: + try: + from pandas.core.groupby import PanelGroupBy + except ImportError: # pandas>=0.25.0 + PanelGroupBy = None + + tqdm_kwargs = tqdm_kwargs.copy() + deprecated_t = [tqdm_kwargs.pop('deprecated_t', None)] + + def inner_generator(df_function='apply'): + def inner(df, func, *args, **kwargs): + """ + Parameters + ---------- + df : (DataFrame|Series)[GroupBy] + Data (may be grouped). + func : function + To be applied on the (grouped) data. + **kwargs : optional + Transmitted to `df.apply()`. + """ + + # Precompute total iterations + total = tqdm_kwargs.pop("total", getattr(df, 'ngroups', None)) + if total is None: # not grouped + if df_function == 'applymap': + total = df.size + elif isinstance(df, Series): + total = len(df) + elif (_Rolling_and_Expanding is None or + not isinstance(df, _Rolling_and_Expanding)): + # DataFrame or Panel + axis = kwargs.get('axis', 0) + if axis == 'index': + axis = 0 + elif axis == 'columns': + axis = 1 + # when axis=0, total is shape[axis1] + total = df.size // df.shape[axis] + + # Init bar + if deprecated_t[0] is not None: + t = deprecated_t[0] + deprecated_t[0] = None + else: + t = cls(total=total, **tqdm_kwargs) + + if len(args) > 0: + # *args intentionally not supported (see #244, #299) + TqdmDeprecationWarning( + "Except func, normal arguments are intentionally" + + " not supported by" + + " `(DataFrame|Series|GroupBy).progress_apply`." 
+ + " Use keyword arguments instead.", + fp_write=getattr(t.fp, 'write', sys.stderr.write)) + + try: # pandas>=1.3.0 + from pandas.core.common import is_builtin_func + except ImportError: + is_builtin_func = df._is_builtin_func + try: + func = is_builtin_func(func) + except TypeError: + pass + + # Define bar updating wrapper + def wrapper(*args, **kwargs): + # update tbar correctly + # it seems `pandas apply` calls `func` twice + # on the first column/row to decide whether it can + # take a fast or slow code path; so stop when t.total==t.n + t.update(n=1 if not t.total or t.n < t.total else 0) + return func(*args, **kwargs) + + # Apply the provided function (in **kwargs) + # on the df using our wrapper (which provides bar updating) + try: + return getattr(df, df_function)(wrapper, **kwargs) + finally: + t.close() + + return inner + + # Monkeypatch pandas to provide easy methods + # Enable custom tqdm progress in pandas! + Series.progress_apply = inner_generator() + SeriesGroupBy.progress_apply = inner_generator() + Series.progress_map = inner_generator('map') + SeriesGroupBy.progress_map = inner_generator('map') + + DataFrame.progress_apply = inner_generator() + DataFrameGroupBy.progress_apply = inner_generator() + DataFrame.progress_applymap = inner_generator('applymap') + DataFrame.progress_map = inner_generator('map') + DataFrameGroupBy.progress_map = inner_generator('map') + + if Panel is not None: + Panel.progress_apply = inner_generator() + if PanelGroupBy is not None: + PanelGroupBy.progress_apply = inner_generator() + + GroupBy.progress_apply = inner_generator() + GroupBy.progress_aggregate = inner_generator('aggregate') + GroupBy.progress_transform = inner_generator('transform') + + if Rolling is not None and Expanding is not None: + Rolling.progress_apply = inner_generator() + Expanding.progress_apply = inner_generator() + elif _Rolling_and_Expanding is not None: + _Rolling_and_Expanding.progress_apply = inner_generator() + + # override defaults via env vars + @envwrap("TQDM_", is_method=True, types={'total': float, 'ncols': int, 'miniters': float, + 'position': int, 'nrows': int}) + def __init__(self, iterable=None, desc=None, total=None, leave=True, file=None, + ncols=None, mininterval=0.1, maxinterval=10.0, miniters=None, + ascii=None, disable=False, unit='it', unit_scale=False, + dynamic_ncols=False, smoothing=0.3, bar_format=None, initial=0, + position=None, postfix=None, unit_divisor=1000, write_bytes=False, + lock_args=None, nrows=None, colour=None, delay=0.0, gui=False, + **kwargs): + """see tqdm.tqdm for arguments""" + if file is None: + file = sys.stderr + + if write_bytes: + # Despite coercing unicode into bytes, py2 sys.std* streams + # should have bytes written to them. 
+ file = SimpleTextIOWrapper( + file, encoding=getattr(file, 'encoding', None) or 'utf-8') + + file = DisableOnWriteError(file, tqdm_instance=self) + + if disable is None and hasattr(file, "isatty") and not file.isatty(): + disable = True + + if total is None and iterable is not None: + try: + total = len(iterable) + except (TypeError, AttributeError): + total = None + if total == float("inf"): + # Infinite iterations, behave same as unknown + total = None + + if disable: + self.iterable = iterable + self.disable = disable + with self._lock: + self.pos = self._get_free_pos(self) + self._instances.remove(self) + self.n = initial + self.total = total + self.leave = leave + return + + if kwargs: + self.disable = True + with self._lock: + self.pos = self._get_free_pos(self) + self._instances.remove(self) + raise ( + TqdmDeprecationWarning( + "`nested` is deprecated and automated.\n" + "Use `position` instead for manual control.\n", + fp_write=getattr(file, 'write', sys.stderr.write)) + if "nested" in kwargs else + TqdmKeyError("Unknown argument(s): " + str(kwargs))) + + # Preprocess the arguments + if ( + (ncols is None or nrows is None) and (file in (sys.stderr, sys.stdout)) + ) or dynamic_ncols: # pragma: no cover + if dynamic_ncols: + dynamic_ncols = _screen_shape_wrapper() + if dynamic_ncols: + ncols, nrows = dynamic_ncols(file) + else: + _dynamic_ncols = _screen_shape_wrapper() + if _dynamic_ncols: + _ncols, _nrows = _dynamic_ncols(file) + if ncols is None: + ncols = _ncols + if nrows is None: + nrows = _nrows + + if miniters is None: + miniters = 0 + dynamic_miniters = True + else: + dynamic_miniters = False + + if mininterval is None: + mininterval = 0 + + if maxinterval is None: + maxinterval = 0 + + if ascii is None: + ascii = not _supports_unicode(file) + + if bar_format and ascii is not True and not _is_ascii(ascii): + # Convert bar format into unicode since terminal uses unicode + bar_format = str(bar_format) + + if smoothing is None: + smoothing = 0 + + # Store the arguments + self.iterable = iterable + self.desc = desc or '' + self.total = total + self.leave = leave + self.fp = file + self.ncols = ncols + self.nrows = nrows + self.mininterval = mininterval + self.maxinterval = maxinterval + self.miniters = miniters + self.dynamic_miniters = dynamic_miniters + self.ascii = ascii + self.disable = disable + self.unit = unit + self.unit_scale = unit_scale + self.unit_divisor = unit_divisor + self.initial = initial + self.lock_args = lock_args + self.delay = delay + self.gui = gui + self.dynamic_ncols = dynamic_ncols + self.smoothing = smoothing + self._ema_dn = EMA(smoothing) + self._ema_dt = EMA(smoothing) + self._ema_miniters = EMA(smoothing) + self.bar_format = bar_format + self.postfix = None + self.colour = colour + self._time = time + if postfix: + try: + self.set_postfix(refresh=False, **postfix) + except TypeError: + self.postfix = postfix + + # Init the iterations counters + self.last_print_n = initial + self.n = initial + + # if nested, at initial sp() call we replace '\r' by '\n' to + # not overwrite the outer progress bar + with self._lock: + # mark fixed positions as negative + self.pos = self._get_free_pos(self) if position is None else -position + + if not gui: + # Initialize the screen printer + self.sp = self.status_printer(self.fp) + if delay <= 0: + self.refresh(lock_args=self.lock_args) + + # Init the time counter + self.last_print_t = self._time() + # NB: Avoid race conditions by setting start_t at the very end of init + self.start_t = self.last_print_t + + def 
__bool__(self): + if self.total is not None: + return self.total > 0 + if self.iterable is None: + raise TypeError('bool() undefined when iterable == total == None') + return bool(self.iterable) + + def __len__(self): + return ( + self.total if self.iterable is None + else self.iterable.shape[0] if hasattr(self.iterable, "shape") + else len(self.iterable) if hasattr(self.iterable, "__len__") + else self.iterable.__length_hint__() if hasattr(self.iterable, "__length_hint__") + else getattr(self, "total", None)) + + def __reversed__(self): + try: + orig = self.iterable + except AttributeError: + raise TypeError("'tqdm' object is not reversible") + else: + self.iterable = reversed(self.iterable) + return self.__iter__() + finally: + self.iterable = orig + + def __contains__(self, item): + contains = getattr(self.iterable, '__contains__', None) + return contains(item) if contains is not None else item in self.__iter__() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + try: + self.close() + except AttributeError: + # maybe eager thread cleanup upon external error + if (exc_type, exc_value, traceback) == (None, None, None): + raise + warn("AttributeError ignored", TqdmWarning, stacklevel=2) + + def __del__(self): + self.close() + + def __str__(self): + return self.format_meter(**self.format_dict) + + @property + def _comparable(self): + return abs(getattr(self, "pos", 1 << 31)) + + def __hash__(self): + return id(self) + + def __iter__(self): + """Backward-compatibility to use: for x in tqdm(iterable)""" + + # Inlining instance variables as locals (speed optimisation) + iterable = self.iterable + + # If the bar is disabled, then just walk the iterable + # (note: keep this check outside the loop for performance) + if self.disable: + for obj in iterable: + yield obj + return + + mininterval = self.mininterval + last_print_t = self.last_print_t + last_print_n = self.last_print_n + min_start_t = self.start_t + self.delay + n = self.n + time = self._time + + try: + for obj in iterable: + yield obj + # Update and possibly print the progressbar. + # Note: does not call self.update(1) for speed optimisation. + n += 1 + + if n - last_print_n >= self.miniters: + cur_t = time() + dt = cur_t - last_print_t + if dt >= mininterval and cur_t >= min_start_t: + self.update(n - last_print_n) + last_print_n = self.last_print_n + last_print_t = self.last_print_t + finally: + self.n = n + self.close() + + def update(self, n=1): + """ + Manually update the progress bar, useful for streams + such as reading files. + E.g.: + >>> t = tqdm(total=filesize) # Initialise + >>> for current_buffer in stream: + ... ... + ... t.update(len(current_buffer)) + >>> t.close() + The last line is highly recommended, but possibly not necessary if + `t.update()` will be called in such a way that `filesize` will be + exactly reached and printed. + + Parameters + ---------- + n : int or float, optional + Increment to add to the internal counter of iterations + [default: 1]. If using float, consider specifying `{n:.3f}` + or similar in `bar_format`, or specifying `unit_scale`. + + Returns + ------- + out : bool or None + True if a `display()` was triggered. 
+ """ + if self.disable: + return + + if n < 0: + self.last_print_n += n # for auto-refresh logic to work + self.n += n + + # check counter first to reduce calls to time() + if self.n - self.last_print_n >= self.miniters: + cur_t = self._time() + dt = cur_t - self.last_print_t + if dt >= self.mininterval and cur_t >= self.start_t + self.delay: + cur_t = self._time() + dn = self.n - self.last_print_n # >= n + if self.smoothing and dt and dn: + # EMA (not just overall average) + self._ema_dn(dn) + self._ema_dt(dt) + self.refresh(lock_args=self.lock_args) + if self.dynamic_miniters: + # If no `miniters` was specified, adjust automatically to the + # maximum iteration rate seen so far between two prints. + # e.g.: After running `tqdm.update(5)`, subsequent + # calls to `tqdm.update()` will only cause an update after + # at least 5 more iterations. + if self.maxinterval and dt >= self.maxinterval: + self.miniters = dn * (self.mininterval or self.maxinterval) / dt + elif self.smoothing: + # EMA miniters update + self.miniters = self._ema_miniters( + dn * (self.mininterval / dt if self.mininterval and dt + else 1)) + else: + # max iters between two prints + self.miniters = max(self.miniters, dn) + + # Store old values for next call + self.last_print_n = self.n + self.last_print_t = cur_t + return True + + def close(self): + """Cleanup and (if leave=False) close the progressbar.""" + if self.disable: + return + + # Prevent multiple closures + self.disable = True + + # decrement instance pos and remove from internal set + pos = abs(self.pos) + self._decr_instances(self) + + if self.last_print_t < self.start_t + self.delay: + # haven't ever displayed; nothing to clear + return + + # GUI mode + if getattr(self, 'sp', None) is None: + return + + # annoyingly, _supports_unicode isn't good enough + def fp_write(s): + self.fp.write(str(s)) + + try: + fp_write('') + except ValueError as e: + if 'closed' in str(e): + return + raise # pragma: no cover + + leave = pos == 0 if self.leave is None else self.leave + + with self._lock: + if leave: + # stats for overall rate (no weighted average) + self._ema_dt = lambda: None + self.display(pos=0) + fp_write('\n') + else: + # clear previous display + if self.display(msg='', pos=pos) and not pos: + fp_write('\r') + + def clear(self, nolock=False): + """Clear current bar display.""" + if self.disable: + return + + if not nolock: + self._lock.acquire() + pos = abs(self.pos) + if pos < (self.nrows or 20): + self.moveto(pos) + self.sp('') + self.fp.write('\r') # place cursor back at the beginning of line + self.moveto(-pos) + if not nolock: + self._lock.release() + + def refresh(self, nolock=False, lock_args=None): + """ + Force refresh the display of this bar. + + Parameters + ---------- + nolock : bool, optional + If `True`, does not lock. + If [default: `False`]: calls `acquire()` on internal lock. + lock_args : tuple, optional + Passed to internal lock's `acquire()`. + If specified, will only `display()` if `acquire()` returns `True`. + """ + if self.disable: + return + + if not nolock: + if lock_args: + if not self._lock.acquire(*lock_args): + return False + else: + self._lock.acquire() + self.display() + if not nolock: + self._lock.release() + return True + + def unpause(self): + """Restart tqdm timer from last print time.""" + if self.disable: + return + cur_t = self._time() + self.start_t += cur_t - self.last_print_t + self.last_print_t = cur_t + + def reset(self, total=None): + """ + Resets to 0 iterations for repeated use. 
+ + Consider combining with `leave=True`. + + Parameters + ---------- + total : int or float, optional. Total to use for the new bar. + """ + self.n = 0 + if total is not None: + self.total = total + if self.disable: + return + self.last_print_n = 0 + self.last_print_t = self.start_t = self._time() + self._ema_dn = EMA(self.smoothing) + self._ema_dt = EMA(self.smoothing) + self._ema_miniters = EMA(self.smoothing) + self.refresh() + + def set_description(self, desc=None, refresh=True): + """ + Set/modify description of the progress bar. + + Parameters + ---------- + desc : str, optional + refresh : bool, optional + Forces refresh [default: True]. + """ + self.desc = desc + ': ' if desc else '' + if refresh: + self.refresh() + + def set_description_str(self, desc=None, refresh=True): + """Set/modify description without ': ' appended.""" + self.desc = desc or '' + if refresh: + self.refresh() + + def set_postfix(self, ordered_dict=None, refresh=True, **kwargs): + """ + Set/modify postfix (additional stats) + with automatic formatting based on datatype. + + Parameters + ---------- + ordered_dict : dict or OrderedDict, optional + refresh : bool, optional + Forces refresh [default: True]. + kwargs : dict, optional + """ + # Sort in alphabetical order to be more deterministic + postfix = OrderedDict([] if ordered_dict is None else ordered_dict) + for key in sorted(kwargs.keys()): + postfix[key] = kwargs[key] + # Preprocess stats according to datatype + for key in postfix.keys(): + # Number: limit the length of the string + if isinstance(postfix[key], Number): + postfix[key] = self.format_num(postfix[key]) + # Else for any other type, try to get the string conversion + elif not isinstance(postfix[key], str): + postfix[key] = str(postfix[key]) + # Else if it's a string, don't need to preprocess anything + # Stitch together to get the final postfix + self.postfix = ', '.join(key + '=' + postfix[key].strip() + for key in postfix.keys()) + if refresh: + self.refresh() + + def set_postfix_str(self, s='', refresh=True): + """ + Postfix without dictionary expansion, similar to prefix handling. + """ + self.postfix = str(s) + if refresh: + self.refresh() + + def moveto(self, n): + # TODO: private method + self.fp.write('\n' * n + _term_move_up() * -n) + getattr(self.fp, 'flush', lambda: None)() + + @property + def format_dict(self): + """Public API for read-only member access.""" + if self.disable and not hasattr(self, 'unit'): + return defaultdict(lambda: None, { + 'n': self.n, 'total': self.total, 'elapsed': 0, 'unit': 'it'}) + if self.dynamic_ncols: + self.ncols, self.nrows = self.dynamic_ncols(self.fp) + return { + 'n': self.n, 'total': self.total, + 'elapsed': self._time() - self.start_t if hasattr(self, 'start_t') else 0, + 'ncols': self.ncols, 'nrows': self.nrows, 'prefix': self.desc, + 'ascii': self.ascii, 'unit': self.unit, 'unit_scale': self.unit_scale, + 'rate': self._ema_dn() / self._ema_dt() if self._ema_dt() else None, + 'bar_format': self.bar_format, 'postfix': self.postfix, + 'unit_divisor': self.unit_divisor, 'initial': self.initial, + 'colour': self.colour} + + def display(self, msg=None, pos=None): + """ + Use `self.sp` to display `msg` in the specified `pos`. + + Consider overloading this function when inheriting to use e.g.: + `self.some_frontend(**self.format_dict)` instead of `self.sp`. + + Parameters + ---------- + msg : str, optional. What to display (default: `repr(self)`). + pos : int, optional. Position to `moveto` + (default: `abs(self.pos)`). 
+ """ + if pos is None: + pos = abs(self.pos) + + nrows = self.nrows or 20 + if pos >= nrows - 1: + if pos >= nrows: + return False + if msg or msg is None: # override at `nrows - 1` + msg = " ... (more hidden) ..." + + if not hasattr(self, "sp"): + raise TqdmDeprecationWarning( + "Please use `tqdm.gui.tqdm(...)`" + " instead of `tqdm(..., gui=True)`\n", + fp_write=getattr(self.fp, 'write', sys.stderr.write)) + + if pos: + self.moveto(pos) + self.sp(self.__str__() if msg is None else msg) + if pos: + self.moveto(-pos) + return True + + @classmethod + @contextmanager + def wrapattr(cls, stream, method, total=None, bytes=True, **tqdm_kwargs): + """ + stream : file-like object. + method : str, "read" or "write". The result of `read()` and + the first argument of `write()` should have a `len()`. + + >>> with tqdm.wrapattr(file_obj, "read", total=file_obj.size) as fobj: + ... while True: + ... chunk = fobj.read(chunk_size) + ... if not chunk: + ... break + """ + with cls(total=total, **tqdm_kwargs) as t: + if bytes: + t.unit = "B" + t.unit_scale = True + t.unit_divisor = 1024 + yield CallbackIOWrapper(t.update, stream, method) + + +def trange(*args, **kwargs): + """Shortcut for tqdm(range(*args), **kwargs).""" + return tqdm(range(*args), **kwargs) diff --git a/env/lib/python3.13/site-packages/tqdm/tk.py b/env/lib/python3.13/site-packages/tqdm/tk.py new file mode 100644 index 0000000000000000000000000000000000000000..788303c8687e007338ce816bf9afeec8581f0188 --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/tk.py @@ -0,0 +1,196 @@ +""" +Tkinter GUI progressbar decorator for iterators. + +Usage: +>>> from tqdm.tk import trange, tqdm +>>> for i in trange(10): +... ... +""" +import re +import sys +import tkinter +import tkinter.ttk as ttk +from warnings import warn + +from .std import TqdmExperimentalWarning, TqdmWarning +from .std import tqdm as std_tqdm + +__author__ = {"github.com/": ["richardsheridan", "casperdcl"]} +__all__ = ['tqdm_tk', 'ttkrange', 'tqdm', 'trange'] + + +class tqdm_tk(std_tqdm): # pragma: no cover + """ + Experimental Tkinter GUI version of tqdm! + + Note: Window interactivity suffers if `tqdm_tk` is not running within + a Tkinter mainloop and values are generated infrequently. In this case, + consider calling `tqdm_tk.refresh()` frequently in the Tk thread. + """ + + # TODO: @classmethod: write()? + + def __init__(self, *args, **kwargs): + """ + This class accepts the following parameters *in addition* to + the parameters accepted by `tqdm`. + + Parameters + ---------- + grab : bool, optional + Grab the input across all windows of the process. + tk_parent : `tkinter.Wm`, optional + Parent Tk window. + cancel_callback : Callable, optional + Create a cancel button and set `cancel_callback` to be called + when the cancel or window close button is clicked. 
+ """ + kwargs = kwargs.copy() + kwargs['gui'] = True + # convert disable = None to False + kwargs['disable'] = bool(kwargs.get('disable', False)) + self._warn_leave = 'leave' in kwargs + grab = kwargs.pop('grab', False) + tk_parent = kwargs.pop('tk_parent', None) + self._cancel_callback = kwargs.pop('cancel_callback', None) + super().__init__(*args, **kwargs) + + if self.disable: + return + + if tk_parent is None: # Discover parent widget + try: + tk_parent = tkinter._default_root + except AttributeError: + raise AttributeError( + "`tk_parent` required when using `tkinter.NoDefaultRoot()`") + if tk_parent is None: # use new default root window as display + self._tk_window = tkinter.Tk() + else: # some other windows already exist + self._tk_window = tkinter.Toplevel() + else: + self._tk_window = tkinter.Toplevel(tk_parent) + + warn("GUI is experimental/alpha", TqdmExperimentalWarning, stacklevel=2) + self._tk_dispatching = self._tk_dispatching_helper() + + self._tk_window.protocol("WM_DELETE_WINDOW", self.cancel) + self._tk_window.wm_title(self.desc) + self._tk_window.wm_attributes("-topmost", 1) + self._tk_window.after(0, lambda: self._tk_window.wm_attributes("-topmost", 0)) + self._tk_n_var = tkinter.DoubleVar(self._tk_window, value=0) + self._tk_text_var = tkinter.StringVar(self._tk_window) + pbar_frame = ttk.Frame(self._tk_window, padding=5) + pbar_frame.pack() + _tk_label = ttk.Label(pbar_frame, textvariable=self._tk_text_var, + wraplength=600, anchor="center", justify="center") + _tk_label.pack() + self._tk_pbar = ttk.Progressbar( + pbar_frame, variable=self._tk_n_var, length=450) + if self.total is not None: + self._tk_pbar.configure(maximum=self.total) + else: + self._tk_pbar.configure(mode="indeterminate") + self._tk_pbar.pack() + if self._cancel_callback is not None: + _tk_button = ttk.Button(pbar_frame, text="Cancel", command=self.cancel) + _tk_button.pack() + if grab: + self._tk_window.grab_set() + + def close(self): + if self.disable: + return + + self.disable = True + + with self.get_lock(): + self._instances.remove(self) + + def _close(): + self._tk_window.after('idle', self._tk_window.destroy) + if not self._tk_dispatching: + self._tk_window.update() + + self._tk_window.protocol("WM_DELETE_WINDOW", _close) + + # if leave is set but we are self-dispatching, the left window is + # totally unresponsive unless the user manually dispatches + if not self.leave: + _close() + elif not self._tk_dispatching: + if self._warn_leave: + warn("leave flag ignored if not in tkinter mainloop", + TqdmWarning, stacklevel=2) + _close() + + def clear(self, *_, **__): + pass + + def display(self, *_, **__): + self._tk_n_var.set(self.n) + d = self.format_dict + # remove {bar} + d['bar_format'] = (d['bar_format'] or "{l_bar}{r_bar}").replace( + "{bar}", "") + msg = self.format_meter(**d) + if '' in msg: + msg = "".join(re.split(r'\|?\|?', msg, maxsplit=1)) + self._tk_text_var.set(msg) + if not self._tk_dispatching: + self._tk_window.update() + + def set_description(self, desc=None, refresh=True): + self.set_description_str(desc, refresh) + + def set_description_str(self, desc=None, refresh=True): + self.desc = desc + if not self.disable: + self._tk_window.wm_title(desc) + if refresh and not self._tk_dispatching: + self._tk_window.update() + + def cancel(self): + """ + `cancel_callback()` followed by `close()` + when close/cancel buttons clicked. 
+ """ + if self._cancel_callback is not None: + self._cancel_callback() + self.close() + + def reset(self, total=None): + """ + Resets to 0 iterations for repeated use. + + Parameters + ---------- + total : int or float, optional. Total to use for the new bar. + """ + if hasattr(self, '_tk_pbar'): + if total is None: + self._tk_pbar.configure(maximum=100, mode="indeterminate") + else: + self._tk_pbar.configure(maximum=total, mode="determinate") + super().reset(total=total) + + @staticmethod + def _tk_dispatching_helper(): + """determine if Tkinter mainloop is dispatching events""" + codes = {tkinter.mainloop.__code__, tkinter.Misc.mainloop.__code__} + for frame in sys._current_frames().values(): + while frame: + if frame.f_code in codes: + return True + frame = frame.f_back + return False + + +def ttkrange(*args, **kwargs): + """Shortcut for `tqdm.tk.tqdm(range(*args), **kwargs)`.""" + return tqdm_tk(range(*args), **kwargs) + + +# Aliases +tqdm = tqdm_tk +trange = ttkrange diff --git a/env/lib/python3.13/site-packages/tqdm/tqdm.1 b/env/lib/python3.13/site-packages/tqdm/tqdm.1 new file mode 100644 index 0000000000000000000000000000000000000000..b90ab4b9ebdd183c98ee8ae0c7f0a65ac676e3b7 --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/tqdm.1 @@ -0,0 +1,314 @@ +.\" Automatically generated by Pandoc 1.19.2 +.\" +.TH "TQDM" "1" "2015\-2021" "tqdm User Manuals" "" +.hy +.SH NAME +.PP +tqdm \- fast, extensible progress bar for Python and CLI +.SH SYNOPSIS +.PP +tqdm [\f[I]options\f[]] +.SH DESCRIPTION +.PP +See . +Can be used as a pipe: +.IP +.nf +\f[C] +$\ #\ count\ lines\ of\ code +$\ cat\ *.py\ |\ tqdm\ |\ wc\ \-l +327it\ [00:00,\ 981773.38it/s] +327 + +$\ #\ find\ all\ files +$\ find\ .\ \-name\ "*.py"\ |\ tqdm\ |\ wc\ \-l +432it\ [00:00,\ 833842.30it/s] +432 + +#\ ...\ and\ more\ info +$\ find\ .\ \-name\ \[aq]*.py\[aq]\ \-exec\ wc\ \-l\ \\{}\ \\;\ \\ +\ \ |\ tqdm\ \-\-total\ 432\ \-\-unit\ files\ \-\-desc\ counting\ \\ +\ \ |\ awk\ \[aq]{\ sum\ +=\ $1\ };\ END\ {\ print\ sum\ }\[aq] +counting:\ 100%|█████████|\ 432/432\ [00:00<00:00,\ 794361.83files/s] +131998 +\f[] +.fi +.SH OPTIONS +.TP +.B \-h, \-\-help +Print this help and exit. +.RS +.RE +.TP +.B \-v, \-\-version +Print version and exit. +.RS +.RE +.TP +.B \-\-desc=\f[I]desc\f[] +str, optional. +Prefix for the progressbar. +.RS +.RE +.TP +.B \-\-total=\f[I]total\f[] +int or float, optional. +The number of expected iterations. +If unspecified, len(iterable) is used if possible. +If float("inf") or as a last resort, only basic progress statistics are +displayed (no ETA, no progressbar). +If \f[C]gui\f[] is True and this parameter needs subsequent updating, +specify an initial arbitrary large positive number, e.g. +9e9. +.RS +.RE +.TP +.B \-\-leave +bool, optional. +If [default: True], keeps all traces of the progressbar upon termination +of iteration. +If \f[C]None\f[], will leave only if \f[C]position\f[] is \f[C]0\f[]. +.RS +.RE +.TP +.B \-\-ncols=\f[I]ncols\f[] +int, optional. +The width of the entire output message. +If specified, dynamically resizes the progressbar to stay within this +bound. +If unspecified, attempts to use environment width. +The fallback is a meter width of 10 and no limit for the counter and +statistics. +If 0, will not print any meter (only stats). +.RS +.RE +.TP +.B \-\-mininterval=\f[I]mininterval\f[] +float, optional. +Minimum progress display update interval [default: 0.1] seconds. +.RS +.RE +.TP +.B \-\-maxinterval=\f[I]maxinterval\f[] +float, optional. 
+Maximum progress display update interval [default: 10] seconds. +Automatically adjusts \f[C]miniters\f[] to correspond to +\f[C]mininterval\f[] after long display update lag. +Only works if \f[C]dynamic_miniters\f[] or monitor thread is enabled. +.RS +.RE +.TP +.B \-\-miniters=\f[I]miniters\f[] +int or float, optional. +Minimum progress display update interval, in iterations. +If 0 and \f[C]dynamic_miniters\f[], will automatically adjust to equal +\f[C]mininterval\f[] (more CPU efficient, good for tight loops). +If > 0, will skip display of specified number of iterations. +Tweak this and \f[C]mininterval\f[] to get very efficient loops. +If your progress is erratic with both fast and slow iterations (network, +skipping items, etc) you should set miniters=1. +.RS +.RE +.TP +.B \-\-ascii=\f[I]ascii\f[] +bool or str, optional. +If unspecified or False, use unicode (smooth blocks) to fill the meter. +The fallback is to use ASCII characters " 123456789#". +.RS +.RE +.TP +.B \-\-disable +bool, optional. +Whether to disable the entire progressbar wrapper [default: False]. +If set to None, disable on non\-TTY. +.RS +.RE +.TP +.B \-\-unit=\f[I]unit\f[] +str, optional. +String that will be used to define the unit of each iteration [default: +it]. +.RS +.RE +.TP +.B \-\-unit\-scale=\f[I]unit_scale\f[] +bool or int or float, optional. +If 1 or True, the number of iterations will be reduced/scaled +automatically and a metric prefix following the International System of +Units standard will be added (kilo, mega, etc.) [default: False]. +If any other non\-zero number, will scale \f[C]total\f[] and \f[C]n\f[]. +.RS +.RE +.TP +.B \-\-dynamic\-ncols +bool, optional. +If set, constantly alters \f[C]ncols\f[] and \f[C]nrows\f[] to the +environment (allowing for window resizes) [default: False]. +.RS +.RE +.TP +.B \-\-smoothing=\f[I]smoothing\f[] +float, optional. +Exponential moving average smoothing factor for speed estimates (ignored +in GUI mode). +Ranges from 0 (average speed) to 1 (current/instantaneous speed) +[default: 0.3]. +.RS +.RE +.TP +.B \-\-bar\-format=\f[I]bar_format\f[] +str, optional. +Specify a custom bar string formatting. +May impact performance. +[default: \[aq]{l_bar}{bar}{r_bar}\[aq]], where l_bar=\[aq]{desc}: +{percentage:3.0f}%|\[aq] and r_bar=\[aq]| {n_fmt}/{total_fmt} +[{elapsed}<{remaining}, \[aq] \[aq]{rate_fmt}{postfix}]\[aq] Possible +vars: l_bar, bar, r_bar, n, n_fmt, total, total_fmt, percentage, +elapsed, elapsed_s, ncols, nrows, desc, unit, rate, rate_fmt, +rate_noinv, rate_noinv_fmt, rate_inv, rate_inv_fmt, postfix, +unit_divisor, remaining, remaining_s, eta. +Note that a trailing ": " is automatically removed after {desc} if the +latter is empty. +.RS +.RE +.TP +.B \-\-initial=\f[I]initial\f[] +int or float, optional. +The initial counter value. +Useful when restarting a progress bar [default: 0]. +If using float, consider specifying \f[C]{n:.3f}\f[] or similar in +\f[C]bar_format\f[], or specifying \f[C]unit_scale\f[]. +.RS +.RE +.TP +.B \-\-position=\f[I]position\f[] +int, optional. +Specify the line offset to print this bar (starting from 0) Automatic if +unspecified. +Useful to manage multiple bars at once (eg, from threads). +.RS +.RE +.TP +.B \-\-postfix=\f[I]postfix\f[] +dict or *, optional. +Specify additional stats to display at the end of the bar. +Calls \f[C]set_postfix(**postfix)\f[] if possible (dict). +.RS +.RE +.TP +.B \-\-unit\-divisor=\f[I]unit_divisor\f[] +float, optional. +[default: 1000], ignored unless \f[C]unit_scale\f[] is True. 
+.RS +.RE +.TP +.B \-\-write\-bytes +bool, optional. +Whether to write bytes. +If (default: False) will write unicode. +.RS +.RE +.TP +.B \-\-lock\-args=\f[I]lock_args\f[] +tuple, optional. +Passed to \f[C]refresh\f[] for intermediate output (initialisation, +iterating, and updating). +.RS +.RE +.TP +.B \-\-nrows=\f[I]nrows\f[] +int, optional. +The screen height. +If specified, hides nested bars outside this bound. +If unspecified, attempts to use environment height. +The fallback is 20. +.RS +.RE +.TP +.B \-\-colour=\f[I]colour\f[] +str, optional. +Bar colour (e.g. +\[aq]green\[aq], \[aq]#00ff00\[aq]). +.RS +.RE +.TP +.B \-\-delay=\f[I]delay\f[] +float, optional. +Don\[aq]t display until [default: 0] seconds have elapsed. +.RS +.RE +.TP +.B \-\-delim=\f[I]delim\f[] +chr, optional. +Delimiting character [default: \[aq]\\n\[aq]]. +Use \[aq]\\0\[aq] for null. +N.B.: on Windows systems, Python converts \[aq]\\n\[aq] to +\[aq]\\r\\n\[aq]. +.RS +.RE +.TP +.B \-\-buf\-size=\f[I]buf_size\f[] +int, optional. +String buffer size in bytes [default: 256] used when \f[C]delim\f[] is +specified. +.RS +.RE +.TP +.B \-\-bytes +bool, optional. +If true, will count bytes, ignore \f[C]delim\f[], and default +\f[C]unit_scale\f[] to True, \f[C]unit_divisor\f[] to 1024, and +\f[C]unit\f[] to \[aq]B\[aq]. +.RS +.RE +.TP +.B \-\-tee +bool, optional. +If true, passes \f[C]stdin\f[] to both \f[C]stderr\f[] and +\f[C]stdout\f[]. +.RS +.RE +.TP +.B \-\-update +bool, optional. +If true, will treat input as newly elapsed iterations, i.e. +numbers to pass to \f[C]update()\f[]. +Note that this is slow (~2e5 it/s) since every input must be decoded as +a number. +.RS +.RE +.TP +.B \-\-update\-to +bool, optional. +If true, will treat input as total elapsed iterations, i.e. +numbers to assign to \f[C]self.n\f[]. +Note that this is slow (~2e5 it/s) since every input must be decoded as +a number. +.RS +.RE +.TP +.B \-\-null +bool, optional. +If true, will discard input (no stdout). +.RS +.RE +.TP +.B \-\-manpath=\f[I]manpath\f[] +str, optional. +Directory in which to install tqdm man pages. +.RS +.RE +.TP +.B \-\-comppath=\f[I]comppath\f[] +str, optional. +Directory in which to place tqdm completion. +.RS +.RE +.TP +.B \-\-log=\f[I]log\f[] +str, optional. +CRITICAL|FATAL|ERROR|WARN(ING)|[default: \[aq]INFO\[aq]]|DEBUG|NOTSET. +.RS +.RE +.SH AUTHORS +tqdm developers . diff --git a/env/lib/python3.13/site-packages/tqdm/utils.py b/env/lib/python3.13/site-packages/tqdm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..af3ec7ded55daa98e1f268a3ee891e9a6bd72974 --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/utils.py @@ -0,0 +1,399 @@ +""" +General helpers required for `tqdm.std`. 
+""" +import os +import re +import sys +from functools import partial, partialmethod, wraps +from inspect import signature +# TODO consider using wcswidth third-party package for 0-width characters +from unicodedata import east_asian_width +from warnings import warn +from weakref import proxy + +_range, _unich, _unicode, _basestring = range, chr, str, str +CUR_OS = sys.platform +IS_WIN = any(CUR_OS.startswith(i) for i in ['win32', 'cygwin']) +IS_NIX = any(CUR_OS.startswith(i) for i in ['aix', 'linux', 'darwin', 'freebsd']) +RE_ANSI = re.compile(r"\x1b\[[;\d]*[A-Za-z]") + +try: + if IS_WIN: + import colorama + else: + raise ImportError +except ImportError: + colorama = None +else: + try: + colorama.init(strip=False) + except TypeError: + colorama.init() + + +def envwrap(prefix, types=None, is_method=False): + """ + Override parameter defaults via `os.environ[prefix + param_name]`. + Maps UPPER_CASE env vars map to lower_case param names. + camelCase isn't supported (because Windows ignores case). + + Precedence (highest first): + + - call (`foo(a=3)`) + - environ (`FOO_A=2`) + - signature (`def foo(a=1)`) + + Parameters + ---------- + prefix : str + Env var prefix, e.g. "FOO_" + types : dict, optional + Fallback mappings `{'param_name': type, ...}` if types cannot be + inferred from function signature. + Consider using `types=collections.defaultdict(lambda: ast.literal_eval)`. + is_method : bool, optional + Whether to use `functools.partialmethod`. If (default: False) use `functools.partial`. + + Examples + -------- + ``` + $ cat foo.py + from tqdm.utils import envwrap + @envwrap("FOO_") + def test(a=1, b=2, c=3): + print(f"received: a={a}, b={b}, c={c}") + + $ FOO_A=42 FOO_C=1337 python -c 'import foo; foo.test(c=99)' + received: a=42, b=2, c=99 + ``` + """ + if types is None: + types = {} + i = len(prefix) + env_overrides = {k[i:].lower(): v for k, v in os.environ.items() if k.startswith(prefix)} + part = partialmethod if is_method else partial + + def wrap(func): + params = signature(func).parameters + # ignore unknown env vars + overrides = {k: v for k, v in env_overrides.items() if k in params} + # infer overrides' `type`s + for k in overrides: + param = params[k] + if param.annotation is not param.empty: # typehints + for typ in getattr(param.annotation, '__args__', (param.annotation,)): + try: + overrides[k] = typ(overrides[k]) + except Exception: + pass + else: + break + elif param.default is not None: # type of default value + overrides[k] = type(param.default)(overrides[k]) + else: + try: # `types` fallback + overrides[k] = types[k](overrides[k]) + except KeyError: # keep unconverted (`str`) + pass + return part(func, **overrides) + return wrap + + +class FormatReplace(object): + """ + >>> a = FormatReplace('something') + >>> f"{a:5d}" + 'something' + """ # NOQA: P102 + def __init__(self, replace=''): + self.replace = replace + self.format_called = 0 + + def __format__(self, _): + self.format_called += 1 + return self.replace + + +class Comparable(object): + """Assumes child has self._comparable attr/@property""" + def __lt__(self, other): + return self._comparable < other._comparable + + def __le__(self, other): + return (self < other) or (self == other) + + def __eq__(self, other): + return self._comparable == other._comparable + + def __ne__(self, other): + return not self == other + + def __gt__(self, other): + return not self <= other + + def __ge__(self, other): + return not self < other + + +class ObjectWrapper(object): + def __getattr__(self, name): + return 
getattr(self._wrapped, name) + + def __setattr__(self, name, value): + return setattr(self._wrapped, name, value) + + def wrapper_getattr(self, name): + """Actual `self.getattr` rather than self._wrapped.getattr""" + try: + return object.__getattr__(self, name) + except AttributeError: # py2 + return getattr(self, name) + + def wrapper_setattr(self, name, value): + """Actual `self.setattr` rather than self._wrapped.setattr""" + return object.__setattr__(self, name, value) + + def __init__(self, wrapped): + """ + Thin wrapper around a given object + """ + self.wrapper_setattr('_wrapped', wrapped) + + +class SimpleTextIOWrapper(ObjectWrapper): + """ + Change only `.write()` of the wrapped object by encoding the passed + value and passing the result to the wrapped object's `.write()` method. + """ + # pylint: disable=too-few-public-methods + def __init__(self, wrapped, encoding): + super().__init__(wrapped) + self.wrapper_setattr('encoding', encoding) + + def write(self, s): + """ + Encode `s` and pass to the wrapped object's `.write()` method. + """ + return self._wrapped.write(s.encode(self.wrapper_getattr('encoding'))) + + def __eq__(self, other): + return self._wrapped == getattr(other, '_wrapped', other) + + +class DisableOnWriteError(ObjectWrapper): + """ + Disable the given `tqdm_instance` upon `write()` or `flush()` errors. + """ + @staticmethod + def disable_on_exception(tqdm_instance, func): + """ + Quietly set `tqdm_instance.miniters=inf` if `func` raises `errno=5`. + """ + tqdm_instance = proxy(tqdm_instance) + + def inner(*args, **kwargs): + try: + return func(*args, **kwargs) + except OSError as e: + if e.errno != 5: + raise + try: + tqdm_instance.miniters = float('inf') + except ReferenceError: + pass + except ValueError as e: + if 'closed' not in str(e): + raise + try: + tqdm_instance.miniters = float('inf') + except ReferenceError: + pass + return inner + + def __init__(self, wrapped, tqdm_instance): + super().__init__(wrapped) + if hasattr(wrapped, 'write'): + self.wrapper_setattr( + 'write', self.disable_on_exception(tqdm_instance, wrapped.write)) + if hasattr(wrapped, 'flush'): + self.wrapper_setattr( + 'flush', self.disable_on_exception(tqdm_instance, wrapped.flush)) + + def __eq__(self, other): + return self._wrapped == getattr(other, '_wrapped', other) + + +class CallbackIOWrapper(ObjectWrapper): + def __init__(self, callback, stream, method="read"): + """ + Wrap a given `file`-like object's `read()` or `write()` to report + lengths to the given `callback` + """ + super().__init__(stream) + func = getattr(stream, method) + if method == "write": + @wraps(func) + def write(data, *args, **kwargs): + res = func(data, *args, **kwargs) + callback(len(data)) + return res + self.wrapper_setattr('write', write) + elif method == "read": + @wraps(func) + def read(*args, **kwargs): + data = func(*args, **kwargs) + callback(len(data)) + return data + self.wrapper_setattr('read', read) + else: + raise KeyError("Can only wrap read/write methods") + + +def _is_utf(encoding): + try: + u'\u2588\u2589'.encode(encoding) + except UnicodeEncodeError: + return False + except Exception: + try: + return encoding.lower().startswith('utf-') or ('U8' == encoding) + except Exception: + return False + else: + return True + + +def _supports_unicode(fp): + try: + return _is_utf(fp.encoding) + except AttributeError: + return False + + +def _is_ascii(s): + if isinstance(s, str): + for c in s: + if ord(c) > 255: + return False + return True + return _supports_unicode(s) + + +def _screen_shape_wrapper(): 
# pragma: no cover + """ + Return a function which returns console dimensions (width, height). + Supported: linux, osx, windows, cygwin. + """ + _screen_shape = None + if IS_WIN: + _screen_shape = _screen_shape_windows + if _screen_shape is None: + _screen_shape = _screen_shape_tput + if IS_NIX: + _screen_shape = _screen_shape_linux + return _screen_shape + + +def _screen_shape_windows(fp): # pragma: no cover + try: + import struct + from ctypes import create_string_buffer, windll + from sys import stdin, stdout + + io_handle = -12 # assume stderr + if fp == stdin: + io_handle = -10 + elif fp == stdout: + io_handle = -11 + + h = windll.kernel32.GetStdHandle(io_handle) + csbi = create_string_buffer(22) + res = windll.kernel32.GetConsoleScreenBufferInfo(h, csbi) + if res: + (_bufx, _bufy, _curx, _cury, _wattr, left, top, right, bottom, + _maxx, _maxy) = struct.unpack("hhhhHhhhhhh", csbi.raw) + return right - left, bottom - top # +1 + except Exception: # nosec + pass + return None, None + + +def _screen_shape_tput(*_): # pragma: no cover + """cygwin xterm (windows)""" + try: + import shlex + from subprocess import check_call # nosec + return [int(check_call(shlex.split('tput ' + i))) - 1 + for i in ('cols', 'lines')] + except Exception: # nosec + pass + return None, None + + +def _screen_shape_linux(fp): # pragma: no cover + + try: + from array import array + from fcntl import ioctl + from termios import TIOCGWINSZ + except ImportError: + return None, None + else: + try: + rows, cols = array('h', ioctl(fp, TIOCGWINSZ, '\0' * 8))[:2] + return cols, rows + except Exception: + try: + return [int(os.environ[i]) - 1 for i in ("COLUMNS", "LINES")] + except (KeyError, ValueError): + return None, None + + +def _environ_cols_wrapper(): # pragma: no cover + """ + Return a function which returns console width. + Supported: linux, osx, windows, cygwin. + """ + warn("Use `_screen_shape_wrapper()(file)[0]` instead of" + " `_environ_cols_wrapper()(file)`", DeprecationWarning, stacklevel=2) + shape = _screen_shape_wrapper() + if not shape: + return None + + @wraps(shape) + def inner(fp): + return shape(fp)[0] + + return inner + + +def _term_move_up(): # pragma: no cover + return '' if (os.name == 'nt') and (colorama is None) else '\x1b[A' + + +def _text_width(s): + return sum(2 if east_asian_width(ch) in 'FW' else 1 for ch in str(s)) + + +def disp_len(data): + """ + Returns the real on-screen length of a string which may contain + ANSI control codes and wide chars. + """ + return _text_width(RE_ANSI.sub('', data)) + + +def disp_trim(data, length): + """ + Trim a string which may contain ANSI control characters. + """ + if len(data) == disp_len(data): + return data[:length] + + ansi_present = bool(RE_ANSI.search(data)) + while disp_len(data) > length: # carefully delete one char at a time + data = data[:-1] + if ansi_present and bool(RE_ANSI.search(data)): + # assume ANSI reset is required + return data if data.endswith("\033[0m") else data + "\033[0m" + return data diff --git a/env/lib/python3.13/site-packages/tqdm/version.py b/env/lib/python3.13/site-packages/tqdm/version.py new file mode 100644 index 0000000000000000000000000000000000000000..11cbaea79d1f4f46f9ae4bea542d7c66ded96e34 --- /dev/null +++ b/env/lib/python3.13/site-packages/tqdm/version.py @@ -0,0 +1,9 @@ +"""`tqdm` version detector. 
Precedence: installed dist, git, 'UNKNOWN'.""" +try: + from ._dist_ver import __version__ +except ImportError: + try: + from setuptools_scm import get_version + __version__ = get_version(root='..', relative_to=__file__) + except (ImportError, LookupError): + __version__ = "UNKNOWN"
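The `CallbackIOWrapper` defined in `tqdm/utils.py` above wraps a stream's `read()` or `write()` method so the length of every chunk is reported to a callback, which is how a progress bar can be attached to raw I/O. A minimal sketch of that pattern, assuming a local file named `data.bin` exists (the file name and chunk size are illustrative only):

```
import os
from tqdm import tqdm
from tqdm.utils import CallbackIOWrapper

path = "data.bin"  # hypothetical input file
with tqdm(total=os.path.getsize(path), unit="B", unit_scale=True) as pbar:
    with open(path, "rb") as fobj:
        # every read() on `wrapped` forwards to fobj.read() and then
        # passes the chunk length to pbar.update
        wrapped = CallbackIOWrapper(pbar.update, fobj, "read")
        while True:
            chunk = wrapped.read(64 * 1024)
            if not chunk:
                break
```

Because `ObjectWrapper.__getattr__` forwards all other attributes to the underlying stream, `wrapped` can be passed anywhere a readable file object is expected (for example, as an upload body).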
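`disp_len` and `disp_trim` above compute and trim the on-screen width of strings that may mix ANSI colour codes (zero display width) with East-Asian wide characters (display width 2), which plain `len()` gets wrong. A small sketch of the difference; the sample string is made up for illustration:

```
from tqdm.utils import disp_len, disp_trim

s = "\x1b[32mdone\x1b[0m 進捗"   # green "done", a space, two wide characters
print(len(s))                    # raw length: counts every escape byte as a character
print(disp_len(s))               # on-screen width: ANSI stripped, wide chars count as 2
print(repr(disp_trim(s, 6)))     # drops one char at a time until the width fits,
                                 # re-appending an ANSI reset if colour codes remain
```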
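The `reset()` method of `tqdm_tk` earlier in this diff switches the underlying `ttk.Progressbar` between determinate and indeterminate mode depending on whether a new `total` is supplied, so a single Tk window can be reused across several passes. A rough sketch of that reuse, assuming a display/Tk-capable environment (the pass sizes and descriptions are arbitrary):

```
from tqdm.tk import tqdm as tqdm_tk

pbar = tqdm_tk(total=100, desc="pass 1")
for _ in range(100):
    pbar.update(1)

pbar.reset(total=50)             # same window, determinate mode with the new total
pbar.set_description("pass 2")
for _ in range(50):
    pbar.update(1)

pbar.close()                     # destroys the window unless leave=True
```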