diff --git a/.venv/lib/python3.11/site-packages/fsspec/__init__.py b/.venv/lib/python3.11/site-packages/fsspec/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d7af775f77cd67734aebd25ce4e051be87e64fa8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/__init__.py @@ -0,0 +1,69 @@ +from importlib.metadata import entry_points + +from . import caching +from ._version import __version__ # noqa: F401 +from .callbacks import Callback +from .compression import available_compressions +from .core import get_fs_token_paths, open, open_files, open_local, url_to_fs +from .exceptions import FSTimeoutError +from .mapping import FSMap, get_mapper +from .registry import ( + available_protocols, + filesystem, + get_filesystem_class, + register_implementation, + registry, +) +from .spec import AbstractFileSystem + +__all__ = [ + "AbstractFileSystem", + "FSTimeoutError", + "FSMap", + "filesystem", + "register_implementation", + "get_filesystem_class", + "get_fs_token_paths", + "get_mapper", + "open", + "open_files", + "open_local", + "registry", + "caching", + "Callback", + "available_protocols", + "available_compressions", + "url_to_fs", +] + + +def process_entries(): + if entry_points is not None: + try: + eps = entry_points() + except TypeError: + pass # importlib-metadata < 0.8 + else: + if hasattr(eps, "select"): # Python 3.10+ / importlib_metadata >= 3.9.0 + specs = eps.select(group="fsspec.specs") + else: + specs = eps.get("fsspec.specs", []) + registered_names = {} + for spec in specs: + err_msg = f"Unable to load filesystem from {spec}" + name = spec.name + if name in registered_names: + continue + registered_names[name] = True + register_implementation( + name, + spec.value.replace(":", "."), + errtxt=err_msg, + # We take our implementations as the ones to overload with if + # for some reason we encounter some, may be the same, already + # registered + clobber=True, + ) + + +process_entries() diff --git a/.venv/lib/python3.11/site-packages/fsspec/_version.py b/.venv/lib/python3.11/site-packages/fsspec/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..dc54ff0dfac3a15855cae1992c875fd9ddbb7a8d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/_version.py @@ -0,0 +1,16 @@ +# file generated by setuptools_scm +# don't change, don't track in version control +TYPE_CHECKING = False +if TYPE_CHECKING: + from typing import Tuple, Union + VERSION_TUPLE = Tuple[Union[int, str], ...] +else: + VERSION_TUPLE = object + +version: str +__version__: str +__version_tuple__: VERSION_TUPLE +version_tuple: VERSION_TUPLE + +__version__ = version = '2025.2.0' +__version_tuple__ = version_tuple = (2025, 2, 0) diff --git a/.venv/lib/python3.11/site-packages/fsspec/archive.py b/.venv/lib/python3.11/site-packages/fsspec/archive.py new file mode 100644 index 0000000000000000000000000000000000000000..13a4da8df7c9405297cdd7d37476be2f725b2f57 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/archive.py @@ -0,0 +1,75 @@ +import operator + +from fsspec import AbstractFileSystem +from fsspec.utils import tokenize + + +class AbstractArchiveFileSystem(AbstractFileSystem): + """ + A generic superclass for implementing Archive-based filesystems. + + Currently, it is shared amongst + :class:`~fsspec.implementations.zip.ZipFileSystem`, + :class:`~fsspec.implementations.libarchive.LibArchiveFileSystem` and + :class:`~fsspec.implementations.tar.TarFileSystem`. 
+ """ + + def __str__(self): + return f"" + + __repr__ = __str__ + + def ukey(self, path): + return tokenize(path, self.fo, self.protocol) + + def _all_dirnames(self, paths): + """Returns *all* directory names for each path in paths, including intermediate + ones. + + Parameters + ---------- + paths: Iterable of path strings + """ + if len(paths) == 0: + return set() + + dirnames = {self._parent(path) for path in paths} - {self.root_marker} + return dirnames | self._all_dirnames(dirnames) + + def info(self, path, **kwargs): + self._get_dirs() + path = self._strip_protocol(path) + if path in {"", "/"} and self.dir_cache: + return {"name": "", "type": "directory", "size": 0} + if path in self.dir_cache: + return self.dir_cache[path] + elif path + "/" in self.dir_cache: + return self.dir_cache[path + "/"] + else: + raise FileNotFoundError(path) + + def ls(self, path, detail=True, **kwargs): + self._get_dirs() + paths = {} + for p, f in self.dir_cache.items(): + p = p.rstrip("/") + if "/" in p: + root = p.rsplit("/", 1)[0] + else: + root = "" + if root == path.rstrip("/"): + paths[p] = f + elif all( + (a == b) + for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) + ): + # root directory entry + ppath = p.rstrip("/").split("/", 1)[0] + if ppath not in paths: + out = {"name": ppath, "size": 0, "type": "directory"} + paths[ppath] = out + if detail: + out = sorted(paths.values(), key=operator.itemgetter("name")) + return out + else: + return sorted(paths) diff --git a/.venv/lib/python3.11/site-packages/fsspec/asyn.py b/.venv/lib/python3.11/site-packages/fsspec/asyn.py new file mode 100644 index 0000000000000000000000000000000000000000..adeb88fb4c72932457ac31e0ff0ff5c26438e317 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/asyn.py @@ -0,0 +1,1098 @@ +import asyncio +import asyncio.events +import functools +import inspect +import io +import numbers +import os +import re +import threading +from contextlib import contextmanager +from glob import has_magic +from typing import TYPE_CHECKING, Iterable + +from .callbacks import DEFAULT_CALLBACK +from .exceptions import FSTimeoutError +from .implementations.local import LocalFileSystem, make_path_posix, trailing_sep +from .spec import AbstractBufferedFile, AbstractFileSystem +from .utils import glob_translate, is_exception, other_paths + +private = re.compile("_[^_]") +iothread = [None] # dedicated fsspec IO thread +loop = [None] # global event loop for any non-async instance +_lock = None # global lock placeholder +get_running_loop = asyncio.get_running_loop + + +def get_lock(): + """Allocate or return a threading lock. + + The lock is allocated on first use to allow setting one lock per forked process. + """ + global _lock + if not _lock: + _lock = threading.Lock() + return _lock + + +def reset_lock(): + """Reset the global lock. + + This should be called only on the init of a forked process to reset the lock to + None, enabling the new forked process to get a new lock. + """ + global _lock + + iothread[0] = None + loop[0] = None + _lock = None + + +async def _runner(event, coro, result, timeout=None): + timeout = timeout if timeout else None # convert 0 or 0.0 to None + if timeout is not None: + coro = asyncio.wait_for(coro, timeout=timeout) + try: + result[0] = await coro + except Exception as ex: + result[0] = ex + finally: + event.set() + + +def sync(loop, func, *args, timeout=None, **kwargs): + """ + Make loop run coroutine until it returns. 
Runs in other thread + + Examples + -------- + >>> fsspec.asyn.sync(fsspec.asyn.get_loop(), func, *args, + timeout=timeout, **kwargs) + """ + timeout = timeout if timeout else None # convert 0 or 0.0 to None + # NB: if the loop is not running *yet*, it is OK to submit work + # and we will wait for it + if loop is None or loop.is_closed(): + raise RuntimeError("Loop is not running") + try: + loop0 = asyncio.events.get_running_loop() + if loop0 is loop: + raise NotImplementedError("Calling sync() from within a running loop") + except NotImplementedError: + raise + except RuntimeError: + pass + coro = func(*args, **kwargs) + result = [None] + event = threading.Event() + asyncio.run_coroutine_threadsafe(_runner(event, coro, result, timeout), loop) + while True: + # this loops allows thread to get interrupted + if event.wait(1): + break + if timeout is not None: + timeout -= 1 + if timeout < 0: + raise FSTimeoutError + + return_result = result[0] + if isinstance(return_result, asyncio.TimeoutError): + # suppress asyncio.TimeoutError, raise FSTimeoutError + raise FSTimeoutError from return_result + elif isinstance(return_result, BaseException): + raise return_result + else: + return return_result + + +def sync_wrapper(func, obj=None): + """Given a function, make so can be called in blocking contexts + + Leave obj=None if defining within a class. Pass the instance if attaching + as an attribute of the instance. + """ + + @functools.wraps(func) + def wrapper(*args, **kwargs): + self = obj or args[0] + return sync(self.loop, func, *args, **kwargs) + + return wrapper + + +@contextmanager +def _selector_policy(): + original_policy = asyncio.get_event_loop_policy() + try: + if os.name == "nt" and hasattr(asyncio, "WindowsSelectorEventLoopPolicy"): + asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) + + yield + finally: + asyncio.set_event_loop_policy(original_policy) + + +def get_loop(): + """Create or return the default fsspec IO loop + + The loop will be running on a separate thread. 
+ """ + if loop[0] is None: + with get_lock(): + # repeat the check just in case the loop got filled between the + # previous two calls from another thread + if loop[0] is None: + with _selector_policy(): + loop[0] = asyncio.new_event_loop() + th = threading.Thread(target=loop[0].run_forever, name="fsspecIO") + th.daemon = True + th.start() + iothread[0] = th + return loop[0] + + +if TYPE_CHECKING: + import resource + + ResourceError = resource.error +else: + try: + import resource + except ImportError: + resource = None + ResourceError = OSError + else: + ResourceError = getattr(resource, "error", OSError) + +_DEFAULT_BATCH_SIZE = 128 +_NOFILES_DEFAULT_BATCH_SIZE = 1280 + + +def _get_batch_size(nofiles=False): + from fsspec.config import conf + + if nofiles: + if "nofiles_gather_batch_size" in conf: + return conf["nofiles_gather_batch_size"] + else: + if "gather_batch_size" in conf: + return conf["gather_batch_size"] + if nofiles: + return _NOFILES_DEFAULT_BATCH_SIZE + if resource is None: + return _DEFAULT_BATCH_SIZE + + try: + soft_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE) + except (ImportError, ValueError, ResourceError): + return _DEFAULT_BATCH_SIZE + + if soft_limit == resource.RLIM_INFINITY: + return -1 + else: + return soft_limit // 8 + + +def running_async() -> bool: + """Being executed by an event loop?""" + try: + asyncio.get_running_loop() + return True + except RuntimeError: + return False + + +async def _run_coros_in_chunks( + coros, + batch_size=None, + callback=DEFAULT_CALLBACK, + timeout=None, + return_exceptions=False, + nofiles=False, +): + """Run the given coroutines in chunks. + + Parameters + ---------- + coros: list of coroutines to run + batch_size: int or None + Number of coroutines to submit/wait on simultaneously. + If -1, then it will not be any throttling. If + None, it will be inferred from _get_batch_size() + callback: fsspec.callbacks.Callback instance + Gets a relative_update when each coroutine completes + timeout: number or None + If given, each coroutine times out after this time. Note that, since + there are multiple batches, the total run time of this function will in + general be longer + return_exceptions: bool + Same meaning as in asyncio.gather + nofiles: bool + If inferring the batch_size, does this operation involve local files? + If yes, you normally expect smaller batches. 
+ """ + + if batch_size is None: + batch_size = _get_batch_size(nofiles=nofiles) + + if batch_size == -1: + batch_size = len(coros) + + assert batch_size > 0 + + async def _run_coro(coro, i): + try: + return await asyncio.wait_for(coro, timeout=timeout), i + except Exception as e: + if not return_exceptions: + raise + return e, i + finally: + callback.relative_update(1) + + i = 0 + n = len(coros) + results = [None] * n + pending = set() + + while pending or i < n: + while len(pending) < batch_size and i < n: + pending.add(asyncio.ensure_future(_run_coro(coros[i], i))) + i += 1 + + if not pending: + break + + done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED) + while done: + result, k = await done.pop() + results[k] = result + + return results + + +# these methods should be implemented as async by any async-able backend +async_methods = [ + "_ls", + "_cat_file", + "_get_file", + "_put_file", + "_rm_file", + "_cp_file", + "_pipe_file", + "_expand_path", + "_info", + "_isfile", + "_isdir", + "_exists", + "_walk", + "_glob", + "_find", + "_du", + "_size", + "_mkdir", + "_makedirs", +] + + +class AsyncFileSystem(AbstractFileSystem): + """Async file operations, default implementations + + Passes bulk operations to asyncio.gather for concurrent operation. + + Implementations that have concurrent batch operations and/or async methods + should inherit from this class instead of AbstractFileSystem. Docstrings are + copied from the un-underscored method in AbstractFileSystem, if not given. + """ + + # note that methods do not have docstring here; they will be copied + # for _* methods and inferred for overridden methods. + + async_impl = True + mirror_sync_methods = True + disable_throttling = False + + def __init__(self, *args, asynchronous=False, loop=None, batch_size=None, **kwargs): + self.asynchronous = asynchronous + self._pid = os.getpid() + if not asynchronous: + self._loop = loop or get_loop() + else: + self._loop = None + self.batch_size = batch_size + super().__init__(*args, **kwargs) + + @property + def loop(self): + if self._pid != os.getpid(): + raise RuntimeError("This class is not fork-safe") + return self._loop + + async def _rm_file(self, path, **kwargs): + raise NotImplementedError + + async def _rm(self, path, recursive=False, batch_size=None, **kwargs): + # TODO: implement on_error + batch_size = batch_size or self.batch_size + path = await self._expand_path(path, recursive=recursive) + return await _run_coros_in_chunks( + [self._rm_file(p, **kwargs) for p in reversed(path)], + batch_size=batch_size, + nofiles=True, + ) + + async def _cp_file(self, path1, path2, **kwargs): + raise NotImplementedError + + async def _mv_file(self, path1, path2): + await self._cp_file(path1, path2) + await self._rm_file(path1) + + async def _copy( + self, + path1, + path2, + recursive=False, + on_error=None, + maxdepth=None, + batch_size=None, + **kwargs, + ): + if on_error is None and recursive: + on_error = "ignore" + elif on_error is None: + on_error = "raise" + + if isinstance(path1, list) and isinstance(path2, list): + # No need to expand paths when both source and destination + # are provided as lists + paths1 = path1 + paths2 = path2 + else: + source_is_str = isinstance(path1, str) + paths1 = await self._expand_path( + path1, maxdepth=maxdepth, recursive=recursive + ) + if source_is_str and (not recursive or maxdepth is not None): + # Non-recursive glob does not copy directories + paths1 = [ + p for p in paths1 if not (trailing_sep(p) or await self._isdir(p)) + ] 
+ if not paths1: + return + + source_is_file = len(paths1) == 1 + dest_is_dir = isinstance(path2, str) and ( + trailing_sep(path2) or await self._isdir(path2) + ) + + exists = source_is_str and ( + (has_magic(path1) and source_is_file) + or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1)) + ) + paths2 = other_paths( + paths1, + path2, + exists=exists, + flatten=not source_is_str, + ) + + batch_size = batch_size or self.batch_size + coros = [self._cp_file(p1, p2, **kwargs) for p1, p2 in zip(paths1, paths2)] + result = await _run_coros_in_chunks( + coros, batch_size=batch_size, return_exceptions=True, nofiles=True + ) + + for ex in filter(is_exception, result): + if on_error == "ignore" and isinstance(ex, FileNotFoundError): + continue + raise ex + + async def _pipe_file(self, path, value, mode="overwrite", **kwargs): + raise NotImplementedError + + async def _pipe(self, path, value=None, batch_size=None, **kwargs): + if isinstance(path, str): + path = {path: value} + batch_size = batch_size or self.batch_size + return await _run_coros_in_chunks( + [self._pipe_file(k, v, **kwargs) for k, v in path.items()], + batch_size=batch_size, + nofiles=True, + ) + + async def _process_limits(self, url, start, end): + """Helper for "Range"-based _cat_file""" + size = None + suff = False + if start is not None and start < 0: + # if start is negative and end None, end is the "suffix length" + if end is None: + end = -start + start = "" + suff = True + else: + size = size or (await self._info(url))["size"] + start = size + start + elif start is None: + start = 0 + if not suff: + if end is not None and end < 0: + if start is not None: + size = size or (await self._info(url))["size"] + end = size + end + elif end is None: + end = "" + if isinstance(end, numbers.Integral): + end -= 1 # bytes range is inclusive + return f"bytes={start}-{end}" + + async def _cat_file(self, path, start=None, end=None, **kwargs): + raise NotImplementedError + + async def _cat( + self, path, recursive=False, on_error="raise", batch_size=None, **kwargs + ): + paths = await self._expand_path(path, recursive=recursive) + coros = [self._cat_file(path, **kwargs) for path in paths] + batch_size = batch_size or self.batch_size + out = await _run_coros_in_chunks( + coros, batch_size=batch_size, nofiles=True, return_exceptions=True + ) + if on_error == "raise": + ex = next(filter(is_exception, out), False) + if ex: + raise ex + if ( + len(paths) > 1 + or isinstance(path, list) + or paths[0] != self._strip_protocol(path) + ): + return { + k: v + for k, v in zip(paths, out) + if on_error != "omit" or not is_exception(v) + } + else: + return out[0] + + async def _cat_ranges( + self, + paths, + starts, + ends, + max_gap=None, + batch_size=None, + on_error="return", + **kwargs, + ): + """Get the contents of byte ranges from one or more files + + Parameters + ---------- + paths: list + A list of of filepaths on this filesystems + starts, ends: int or list + Bytes limits of the read. If using a single int, the same value will be + used to read all the specified files. 
+ """ + # TODO: on_error + if max_gap is not None: + # use utils.merge_offset_ranges + raise NotImplementedError + if not isinstance(paths, list): + raise TypeError + if not isinstance(starts, Iterable): + starts = [starts] * len(paths) + if not isinstance(ends, Iterable): + ends = [ends] * len(paths) + if len(starts) != len(paths) or len(ends) != len(paths): + raise ValueError + coros = [ + self._cat_file(p, start=s, end=e, **kwargs) + for p, s, e in zip(paths, starts, ends) + ] + batch_size = batch_size or self.batch_size + return await _run_coros_in_chunks( + coros, batch_size=batch_size, nofiles=True, return_exceptions=True + ) + + async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs): + raise NotImplementedError + + async def _put( + self, + lpath, + rpath, + recursive=False, + callback=DEFAULT_CALLBACK, + batch_size=None, + maxdepth=None, + **kwargs, + ): + """Copy file(s) from local. + + Copies a specific file or tree of files (if recursive=True). If rpath + ends with a "/", it will be assumed to be a directory, and target files + will go within. + + The put_file method will be called concurrently on a batch of files. The + batch_size option can configure the amount of futures that can be executed + at the same time. If it is -1, then all the files will be uploaded concurrently. + The default can be set for this instance by passing "batch_size" in the + constructor, or for all instances by setting the "gather_batch_size" key + in ``fsspec.config.conf``, falling back to 1/8th of the system limit . + """ + if isinstance(lpath, list) and isinstance(rpath, list): + # No need to expand paths when both source and destination + # are provided as lists + rpaths = rpath + lpaths = lpath + else: + source_is_str = isinstance(lpath, str) + if source_is_str: + lpath = make_path_posix(lpath) + fs = LocalFileSystem() + lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth) + if source_is_str and (not recursive or maxdepth is not None): + # Non-recursive glob does not copy directories + lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))] + if not lpaths: + return + + source_is_file = len(lpaths) == 1 + dest_is_dir = isinstance(rpath, str) and ( + trailing_sep(rpath) or await self._isdir(rpath) + ) + + rpath = self._strip_protocol(rpath) + exists = source_is_str and ( + (has_magic(lpath) and source_is_file) + or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath)) + ) + rpaths = other_paths( + lpaths, + rpath, + exists=exists, + flatten=not source_is_str, + ) + + is_dir = {l: os.path.isdir(l) for l in lpaths} + rdirs = [r for l, r in zip(lpaths, rpaths) if is_dir[l]] + file_pairs = [(l, r) for l, r in zip(lpaths, rpaths) if not is_dir[l]] + + await asyncio.gather(*[self._makedirs(d, exist_ok=True) for d in rdirs]) + batch_size = batch_size or self.batch_size + + coros = [] + callback.set_size(len(file_pairs)) + for lfile, rfile in file_pairs: + put_file = callback.branch_coro(self._put_file) + coros.append(put_file(lfile, rfile, **kwargs)) + + return await _run_coros_in_chunks( + coros, batch_size=batch_size, callback=callback + ) + + async def _get_file(self, rpath, lpath, **kwargs): + raise NotImplementedError + + async def _get( + self, + rpath, + lpath, + recursive=False, + callback=DEFAULT_CALLBACK, + maxdepth=None, + **kwargs, + ): + """Copy file(s) to local. + + Copies a specific file or tree of files (if recursive=True). If lpath + ends with a "/", it will be assumed to be a directory, and target files + will go within. 
Can submit a list of paths, which may be glob-patterns + and will be expanded. + + The get_file method will be called concurrently on a batch of files. The + batch_size option can configure the amount of futures that can be executed + at the same time. If it is -1, then all the files will be uploaded concurrently. + The default can be set for this instance by passing "batch_size" in the + constructor, or for all instances by setting the "gather_batch_size" key + in ``fsspec.config.conf``, falling back to 1/8th of the system limit . + """ + if isinstance(lpath, list) and isinstance(rpath, list): + # No need to expand paths when both source and destination + # are provided as lists + rpaths = rpath + lpaths = lpath + else: + source_is_str = isinstance(rpath, str) + # First check for rpath trailing slash as _strip_protocol removes it. + source_not_trailing_sep = source_is_str and not trailing_sep(rpath) + rpath = self._strip_protocol(rpath) + rpaths = await self._expand_path( + rpath, recursive=recursive, maxdepth=maxdepth + ) + if source_is_str and (not recursive or maxdepth is not None): + # Non-recursive glob does not copy directories + rpaths = [ + p for p in rpaths if not (trailing_sep(p) or await self._isdir(p)) + ] + if not rpaths: + return + + lpath = make_path_posix(lpath) + source_is_file = len(rpaths) == 1 + dest_is_dir = isinstance(lpath, str) and ( + trailing_sep(lpath) or LocalFileSystem().isdir(lpath) + ) + + exists = source_is_str and ( + (has_magic(rpath) and source_is_file) + or (not has_magic(rpath) and dest_is_dir and source_not_trailing_sep) + ) + lpaths = other_paths( + rpaths, + lpath, + exists=exists, + flatten=not source_is_str, + ) + + [os.makedirs(os.path.dirname(lp), exist_ok=True) for lp in lpaths] + batch_size = kwargs.pop("batch_size", self.batch_size) + + coros = [] + callback.set_size(len(lpaths)) + for lpath, rpath in zip(lpaths, rpaths): + get_file = callback.branch_coro(self._get_file) + coros.append(get_file(rpath, lpath, **kwargs)) + return await _run_coros_in_chunks( + coros, batch_size=batch_size, callback=callback + ) + + async def _isfile(self, path): + try: + return (await self._info(path))["type"] == "file" + except: # noqa: E722 + return False + + async def _isdir(self, path): + try: + return (await self._info(path))["type"] == "directory" + except OSError: + return False + + async def _size(self, path): + return (await self._info(path)).get("size", None) + + async def _sizes(self, paths, batch_size=None): + batch_size = batch_size or self.batch_size + return await _run_coros_in_chunks( + [self._size(p) for p in paths], batch_size=batch_size + ) + + async def _exists(self, path, **kwargs): + try: + await self._info(path, **kwargs) + return True + except FileNotFoundError: + return False + + async def _info(self, path, **kwargs): + raise NotImplementedError + + async def _ls(self, path, detail=True, **kwargs): + raise NotImplementedError + + async def _walk(self, path, maxdepth=None, on_error="omit", **kwargs): + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + path = self._strip_protocol(path) + full_dirs = {} + dirs = {} + files = {} + + detail = kwargs.pop("detail", False) + try: + listing = await self._ls(path, detail=True, **kwargs) + except (FileNotFoundError, OSError) as e: + if on_error == "raise": + raise + elif callable(on_error): + on_error(e) + if detail: + yield path, {}, {} + else: + yield path, [], [] + return + + for info in listing: + # each info name must be at least [path]/part , but 
here + # we check also for names like [path]/part/ + pathname = info["name"].rstrip("/") + name = pathname.rsplit("/", 1)[-1] + if info["type"] == "directory" and pathname != path: + # do not include "self" path + full_dirs[name] = pathname + dirs[name] = info + elif pathname == path: + # file-like with same name as give path + files[""] = info + else: + files[name] = info + + if detail: + yield path, dirs, files + else: + yield path, list(dirs), list(files) + + if maxdepth is not None: + maxdepth -= 1 + if maxdepth < 1: + return + + for d in dirs: + async for _ in self._walk( + full_dirs[d], maxdepth=maxdepth, detail=detail, **kwargs + ): + yield _ + + async def _glob(self, path, maxdepth=None, **kwargs): + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + import re + + seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,) + ends_with_sep = path.endswith(seps) # _strip_protocol strips trailing slash + path = self._strip_protocol(path) + append_slash_to_dirname = ends_with_sep or path.endswith( + tuple(sep + "**" for sep in seps) + ) + idx_star = path.find("*") if path.find("*") >= 0 else len(path) + idx_qmark = path.find("?") if path.find("?") >= 0 else len(path) + idx_brace = path.find("[") if path.find("[") >= 0 else len(path) + + min_idx = min(idx_star, idx_qmark, idx_brace) + + detail = kwargs.pop("detail", False) + + if not has_magic(path): + if await self._exists(path, **kwargs): + if not detail: + return [path] + else: + return {path: await self._info(path, **kwargs)} + else: + if not detail: + return [] # glob of non-existent returns empty + else: + return {} + elif "/" in path[:min_idx]: + min_idx = path[:min_idx].rindex("/") + root = path[: min_idx + 1] + depth = path[min_idx + 1 :].count("/") + 1 + else: + root = "" + depth = path[min_idx + 1 :].count("/") + 1 + + if "**" in path: + if maxdepth is not None: + idx_double_stars = path.find("**") + depth_double_stars = path[idx_double_stars:].count("/") + 1 + depth = depth - depth_double_stars + maxdepth + else: + depth = None + + allpaths = await self._find( + root, maxdepth=depth, withdirs=True, detail=True, **kwargs + ) + + pattern = glob_translate(path + ("/" if ends_with_sep else "")) + pattern = re.compile(pattern) + + out = { + p: info + for p, info in sorted(allpaths.items()) + if pattern.match( + p + "/" + if append_slash_to_dirname and info["type"] == "directory" + else p + ) + } + + if detail: + return out + else: + return list(out) + + async def _du(self, path, total=True, maxdepth=None, **kwargs): + sizes = {} + # async for? + for f in await self._find(path, maxdepth=maxdepth, **kwargs): + info = await self._info(f) + sizes[info["name"]] = info["size"] + if total: + return sum(sizes.values()) + else: + return sizes + + async def _find(self, path, maxdepth=None, withdirs=False, **kwargs): + path = self._strip_protocol(path) + out = {} + detail = kwargs.pop("detail", False) + + # Add the root directory if withdirs is requested + # This is needed for posix glob compliance + if withdirs and path != "" and await self._isdir(path): + out[path] = await self._info(path) + + # async for? 
+ async for _, dirs, files in self._walk(path, maxdepth, detail=True, **kwargs): + if withdirs: + files.update(dirs) + out.update({info["name"]: info for name, info in files.items()}) + if not out and (await self._isfile(path)): + # walk works on directories, but find should also return [path] + # when path happens to be a file + out[path] = {} + names = sorted(out) + if not detail: + return names + else: + return {name: out[name] for name in names} + + async def _expand_path(self, path, recursive=False, maxdepth=None): + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + if isinstance(path, str): + out = await self._expand_path([path], recursive, maxdepth) + else: + out = set() + path = [self._strip_protocol(p) for p in path] + for p in path: # can gather here + if has_magic(p): + bit = set(await self._glob(p, maxdepth=maxdepth)) + out |= bit + if recursive: + # glob call above expanded one depth so if maxdepth is defined + # then decrement it in expand_path call below. If it is zero + # after decrementing then avoid expand_path call. + if maxdepth is not None and maxdepth <= 1: + continue + out |= set( + await self._expand_path( + list(bit), + recursive=recursive, + maxdepth=maxdepth - 1 if maxdepth is not None else None, + ) + ) + continue + elif recursive: + rec = set(await self._find(p, maxdepth=maxdepth, withdirs=True)) + out |= rec + if p not in out and (recursive is False or (await self._exists(p))): + # should only check once, for the root + out.add(p) + if not out: + raise FileNotFoundError(path) + return sorted(out) + + async def _mkdir(self, path, create_parents=True, **kwargs): + pass # not necessary to implement, may not have directories + + async def _makedirs(self, path, exist_ok=False): + pass # not necessary to implement, may not have directories + + async def open_async(self, path, mode="rb", **kwargs): + if "b" not in mode or kwargs.get("compression"): + raise ValueError + raise NotImplementedError + + +def mirror_sync_methods(obj): + """Populate sync and async methods for obj + + For each method will create a sync version if the name refers to an async method + (coroutine) and there is no override in the child class; will create an async + method for the corresponding sync method if there is no implementation. 
+ + Uses the methods specified in + - async_methods: the set that an implementation is expected to provide + - default_async_methods: that can be derived from their sync version in + AbstractFileSystem + - AsyncFileSystem: async-specific default coroutines + """ + from fsspec import AbstractFileSystem + + for method in async_methods + dir(AsyncFileSystem): + if not method.startswith("_"): + continue + smethod = method[1:] + if private.match(method): + isco = inspect.iscoroutinefunction(getattr(obj, method, None)) + unsync = getattr(getattr(obj, smethod, False), "__func__", None) + is_default = unsync is getattr(AbstractFileSystem, smethod, "") + if isco and is_default: + mth = sync_wrapper(getattr(obj, method), obj=obj) + setattr(obj, smethod, mth) + if not mth.__doc__: + mth.__doc__ = getattr( + getattr(AbstractFileSystem, smethod, None), "__doc__", "" + ) + + +class FSSpecCoroutineCancel(Exception): + pass + + +def _dump_running_tasks( + printout=True, cancel=True, exc=FSSpecCoroutineCancel, with_task=False +): + import traceback + + tasks = [t for t in asyncio.tasks.all_tasks(loop[0]) if not t.done()] + if printout: + [task.print_stack() for task in tasks] + out = [ + { + "locals": task._coro.cr_frame.f_locals, + "file": task._coro.cr_frame.f_code.co_filename, + "firstline": task._coro.cr_frame.f_code.co_firstlineno, + "linelo": task._coro.cr_frame.f_lineno, + "stack": traceback.format_stack(task._coro.cr_frame), + "task": task if with_task else None, + } + for task in tasks + ] + if cancel: + for t in tasks: + cbs = t._callbacks + t.cancel() + asyncio.futures.Future.set_exception(t, exc) + asyncio.futures.Future.cancel(t) + [cb[0](t) for cb in cbs] # cancels any dependent concurrent.futures + try: + t._coro.throw(exc) # exits coro, unless explicitly handled + except exc: + pass + return out + + +class AbstractAsyncStreamedFile(AbstractBufferedFile): + # no read buffering, and always auto-commit + # TODO: readahead might still be useful here, but needs async version + + async def read(self, length=-1): + """ + Return data from cache, or fetch pieces as necessary + + Parameters + ---------- + length: int (-1) + Number of bytes to read; if <0, all remaining bytes. + """ + length = -1 if length is None else int(length) + if self.mode != "rb": + raise ValueError("File not in read mode") + if length < 0: + length = self.size - self.loc + if self.closed: + raise ValueError("I/O operation on closed file.") + if length == 0: + # don't even bother calling fetch + return b"" + out = await self._fetch_range(self.loc, self.loc + length) + self.loc += len(out) + return out + + async def write(self, data): + """ + Write data to buffer. + + Buffer only sent on flush() or if buffer is greater than + or equal to blocksize. + + Parameters + ---------- + data: bytes + Set of bytes to be written. 
+ """ + if self.mode not in {"wb", "ab"}: + raise ValueError("File not in write mode") + if self.closed: + raise ValueError("I/O operation on closed file.") + if self.forced: + raise ValueError("This file has been force-flushed, can only close") + out = self.buffer.write(data) + self.loc += out + if self.buffer.tell() >= self.blocksize: + await self.flush() + return out + + async def close(self): + """Close file + + Finalizes writes, discards cache + """ + if getattr(self, "_unclosable", False): + return + if self.closed: + return + if self.mode == "rb": + self.cache = None + else: + if not self.forced: + await self.flush(force=True) + + if self.fs is not None: + self.fs.invalidate_cache(self.path) + self.fs.invalidate_cache(self.fs._parent(self.path)) + + self.closed = True + + async def flush(self, force=False): + if self.closed: + raise ValueError("Flush on closed file") + if force and self.forced: + raise ValueError("Force flush cannot be called more than once") + if force: + self.forced = True + + if self.mode not in {"wb", "ab"}: + # no-op to flush on read-mode + return + + if not force and self.buffer.tell() < self.blocksize: + # Defer write on small block + return + + if self.offset is None: + # Initialize a multipart upload + self.offset = 0 + try: + await self._initiate_upload() + except: + self.closed = True + raise + + if await self._upload_chunk(final=force) is not False: + self.offset += self.buffer.seek(0, 2) + self.buffer = io.BytesIO() + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close() + + async def _fetch_range(self, start, end): + raise NotImplementedError + + async def _initiate_upload(self): + pass + + async def _upload_chunk(self, final=False): + raise NotImplementedError diff --git a/.venv/lib/python3.11/site-packages/fsspec/caching.py b/.venv/lib/python3.11/site-packages/fsspec/caching.py new file mode 100644 index 0000000000000000000000000000000000000000..34231c881e2b09bdc1c04e3969f1d68405a7af02 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/caching.py @@ -0,0 +1,966 @@ +from __future__ import annotations + +import collections +import functools +import logging +import math +import os +import threading +import warnings +from concurrent.futures import Future, ThreadPoolExecutor +from itertools import groupby +from operator import itemgetter +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ClassVar, + Generic, + NamedTuple, + Optional, + OrderedDict, + TypeVar, +) + +if TYPE_CHECKING: + import mmap + + from typing_extensions import ParamSpec + + P = ParamSpec("P") +else: + P = TypeVar("P") + +T = TypeVar("T") + + +logger = logging.getLogger("fsspec") + +Fetcher = Callable[[int, int], bytes] # Maps (start, end) to bytes + + +class BaseCache: + """Pass-though cache: doesn't keep anything, calls every time + + Acts as base class for other cachers + + Parameters + ---------- + blocksize: int + How far to read ahead in numbers of bytes + fetcher: func + Function of the form f(start, end) which gets bytes from remote as + specified + size: int + How big this file is + """ + + name: ClassVar[str] = "none" + + def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None: + self.blocksize = blocksize + self.nblocks = 0 + self.fetcher = fetcher + self.size = size + self.hit_count = 0 + self.miss_count = 0 + # the bytes that we actually requested + self.total_requested_bytes = 0 + + def _fetch(self, start: int | None, stop: int | None) -> bytes: + if start is None: + 
start = 0 + if stop is None: + stop = self.size + if start >= self.size or start >= stop: + return b"" + return self.fetcher(start, stop) + + def _reset_stats(self) -> None: + """Reset hit and miss counts for a more ganular report e.g. by file.""" + self.hit_count = 0 + self.miss_count = 0 + self.total_requested_bytes = 0 + + def _log_stats(self) -> str: + """Return a formatted string of the cache statistics.""" + if self.hit_count == 0 and self.miss_count == 0: + # a cache that does nothing, this is for logs only + return "" + return f" , {self.name}: {self.hit_count} hits, {self.miss_count} misses, {self.total_requested_bytes} total requested bytes" + + def __repr__(self) -> str: + # TODO: use rich for better formatting + return f""" + <{self.__class__.__name__}: + block size : {self.blocksize} + block count : {self.nblocks} + file size : {self.size} + cache hits : {self.hit_count} + cache misses: {self.miss_count} + total requested bytes: {self.total_requested_bytes}> + """ + + +class MMapCache(BaseCache): + """memory-mapped sparse file cache + + Opens temporary file, which is filled blocks-wise when data is requested. + Ensure there is enough disc space in the temporary location. + + This cache method might only work on posix + """ + + name = "mmap" + + def __init__( + self, + blocksize: int, + fetcher: Fetcher, + size: int, + location: str | None = None, + blocks: set[int] | None = None, + ) -> None: + super().__init__(blocksize, fetcher, size) + self.blocks = set() if blocks is None else blocks + self.location = location + self.cache = self._makefile() + + def _makefile(self) -> mmap.mmap | bytearray: + import mmap + import tempfile + + if self.size == 0: + return bytearray() + + # posix version + if self.location is None or not os.path.exists(self.location): + if self.location is None: + fd = tempfile.TemporaryFile() + self.blocks = set() + else: + fd = open(self.location, "wb+") + fd.seek(self.size - 1) + fd.write(b"1") + fd.flush() + else: + fd = open(self.location, "r+b") + + return mmap.mmap(fd.fileno(), self.size) + + def _fetch(self, start: int | None, end: int | None) -> bytes: + logger.debug(f"MMap cache fetching {start}-{end}") + if start is None: + start = 0 + if end is None: + end = self.size + if start >= self.size or start >= end: + return b"" + start_block = start // self.blocksize + end_block = end // self.blocksize + block_range = range(start_block, end_block + 1) + # Determine which blocks need to be fetched. This sequence is sorted by construction. + need = (i for i in block_range if i not in self.blocks) + # Count the number of blocks already cached + self.hit_count += sum(1 for i in block_range if i in self.blocks) + + # Consolidate needed blocks. + # Algorithm adapted from Python 2.x itertools documentation. + # We are grouping an enumerated sequence of blocks. By comparing when the difference + # between an ascending range (provided by enumerate) and the needed block numbers + # we can detect when the block number skips values. The key computes this difference. + # Whenever the difference changes, we know that we have previously cached block(s), + # and a new group is started. In other words, this algorithm neatly groups + # runs of consecutive block numbers so they can be fetched together. 
+ for _, _blocks in groupby(enumerate(need), key=lambda x: x[0] - x[1]): + # Extract the blocks from the enumerated sequence + _blocks = tuple(map(itemgetter(1), _blocks)) + # Compute start of first block + sstart = _blocks[0] * self.blocksize + # Compute the end of the last block. Last block may not be full size. + send = min(_blocks[-1] * self.blocksize + self.blocksize, self.size) + + # Fetch bytes (could be multiple consecutive blocks) + self.total_requested_bytes += send - sstart + logger.debug( + f"MMap get blocks {_blocks[0]}-{_blocks[-1]} ({sstart}-{send})" + ) + self.cache[sstart:send] = self.fetcher(sstart, send) + + # Update set of cached blocks + self.blocks.update(_blocks) + # Update cache statistics with number of blocks we had to cache + self.miss_count += len(_blocks) + + return self.cache[start:end] + + def __getstate__(self) -> dict[str, Any]: + state = self.__dict__.copy() + # Remove the unpicklable entries. + del state["cache"] + return state + + def __setstate__(self, state: dict[str, Any]) -> None: + # Restore instance attributes + self.__dict__.update(state) + self.cache = self._makefile() + + +class ReadAheadCache(BaseCache): + """Cache which reads only when we get beyond a block of data + + This is a much simpler version of BytesCache, and does not attempt to + fill holes in the cache or keep fragments alive. It is best suited to + many small reads in a sequential order (e.g., reading lines from a file). + """ + + name = "readahead" + + def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None: + super().__init__(blocksize, fetcher, size) + self.cache = b"" + self.start = 0 + self.end = 0 + + def _fetch(self, start: int | None, end: int | None) -> bytes: + if start is None: + start = 0 + if end is None or end > self.size: + end = self.size + if start >= self.size or start >= end: + return b"" + l = end - start + if start >= self.start and end <= self.end: + # cache hit + self.hit_count += 1 + return self.cache[start - self.start : end - self.start] + elif self.start <= start < self.end: + # partial hit + self.miss_count += 1 + part = self.cache[start - self.start :] + l -= len(part) + start = self.end + else: + # miss + self.miss_count += 1 + part = b"" + end = min(self.size, end + self.blocksize) + self.total_requested_bytes += end - start + self.cache = self.fetcher(start, end) # new block replaces old + self.start = start + self.end = self.start + len(self.cache) + return part + self.cache[:l] + + +class FirstChunkCache(BaseCache): + """Caches the first block of a file only + + This may be useful for file types where the metadata is stored in the header, + but is randomly accessed. 
+ """ + + name = "first" + + def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None: + if blocksize > size: + # this will buffer the whole thing + blocksize = size + super().__init__(blocksize, fetcher, size) + self.cache: bytes | None = None + + def _fetch(self, start: int | None, end: int | None) -> bytes: + start = start or 0 + if start > self.size: + logger.debug("FirstChunkCache: requested start > file size") + return b"" + + end = min(end, self.size) + + if start < self.blocksize: + if self.cache is None: + self.miss_count += 1 + if end > self.blocksize: + self.total_requested_bytes += end + data = self.fetcher(0, end) + self.cache = data[: self.blocksize] + return data[start:] + self.cache = self.fetcher(0, self.blocksize) + self.total_requested_bytes += self.blocksize + part = self.cache[start:end] + if end > self.blocksize: + self.total_requested_bytes += end - self.blocksize + part += self.fetcher(self.blocksize, end) + self.hit_count += 1 + return part + else: + self.miss_count += 1 + self.total_requested_bytes += end - start + return self.fetcher(start, end) + + +class BlockCache(BaseCache): + """ + Cache holding memory as a set of blocks. + + Requests are only ever made ``blocksize`` at a time, and are + stored in an LRU cache. The least recently accessed block is + discarded when more than ``maxblocks`` are stored. + + Parameters + ---------- + blocksize : int + The number of bytes to store in each block. + Requests are only ever made for ``blocksize``, so this + should balance the overhead of making a request against + the granularity of the blocks. + fetcher : Callable + size : int + The total size of the file being cached. + maxblocks : int + The maximum number of blocks to cache for. The maximum memory + use for this cache is then ``blocksize * maxblocks``. + """ + + name = "blockcache" + + def __init__( + self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32 + ) -> None: + super().__init__(blocksize, fetcher, size) + self.nblocks = math.ceil(size / blocksize) + self.maxblocks = maxblocks + self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block) + + def cache_info(self): + """ + The statistics on the block cache. + + Returns + ------- + NamedTuple + Returned directly from the LRU Cache used internally. + """ + return self._fetch_block_cached.cache_info() + + def __getstate__(self) -> dict[str, Any]: + state = self.__dict__ + del state["_fetch_block_cached"] + return state + + def __setstate__(self, state: dict[str, Any]) -> None: + self.__dict__.update(state) + self._fetch_block_cached = functools.lru_cache(state["maxblocks"])( + self._fetch_block + ) + + def _fetch(self, start: int | None, end: int | None) -> bytes: + if start is None: + start = 0 + if end is None: + end = self.size + if start >= self.size or start >= end: + return b"" + + # byte position -> block numbers + start_block_number = start // self.blocksize + end_block_number = end // self.blocksize + + # these are cached, so safe to do multiple calls for the same start and end. + for block_number in range(start_block_number, end_block_number + 1): + self._fetch_block_cached(block_number) + + return self._read_cache( + start, + end, + start_block_number=start_block_number, + end_block_number=end_block_number, + ) + + def _fetch_block(self, block_number: int) -> bytes: + """ + Fetch the block of data for `block_number`. 
+ """ + if block_number > self.nblocks: + raise ValueError( + f"'block_number={block_number}' is greater than " + f"the number of blocks ({self.nblocks})" + ) + + start = block_number * self.blocksize + end = start + self.blocksize + self.total_requested_bytes += end - start + self.miss_count += 1 + logger.info("BlockCache fetching block %d", block_number) + block_contents = super()._fetch(start, end) + return block_contents + + def _read_cache( + self, start: int, end: int, start_block_number: int, end_block_number: int + ) -> bytes: + """ + Read from our block cache. + + Parameters + ---------- + start, end : int + The start and end byte positions. + start_block_number, end_block_number : int + The start and end block numbers. + """ + start_pos = start % self.blocksize + end_pos = end % self.blocksize + + self.hit_count += 1 + if start_block_number == end_block_number: + block: bytes = self._fetch_block_cached(start_block_number) + return block[start_pos:end_pos] + + else: + # read from the initial + out = [self._fetch_block_cached(start_block_number)[start_pos:]] + + # intermediate blocks + # Note: it'd be nice to combine these into one big request. However + # that doesn't play nicely with our LRU cache. + out.extend( + map( + self._fetch_block_cached, + range(start_block_number + 1, end_block_number), + ) + ) + + # final block + out.append(self._fetch_block_cached(end_block_number)[:end_pos]) + + return b"".join(out) + + +class BytesCache(BaseCache): + """Cache which holds data in a in-memory bytes object + + Implements read-ahead by the block size, for semi-random reads progressing + through the file. + + Parameters + ---------- + trim: bool + As we read more data, whether to discard the start of the buffer when + we are more than a blocksize ahead of it. + """ + + name: ClassVar[str] = "bytes" + + def __init__( + self, blocksize: int, fetcher: Fetcher, size: int, trim: bool = True + ) -> None: + super().__init__(blocksize, fetcher, size) + self.cache = b"" + self.start: int | None = None + self.end: int | None = None + self.trim = trim + + def _fetch(self, start: int | None, end: int | None) -> bytes: + # TODO: only set start/end after fetch, in case it fails? + # is this where retry logic might go? 
+ if start is None: + start = 0 + if end is None: + end = self.size + if start >= self.size or start >= end: + return b"" + if ( + self.start is not None + and start >= self.start + and self.end is not None + and end < self.end + ): + # cache hit: we have all the required data + offset = start - self.start + self.hit_count += 1 + return self.cache[offset : offset + end - start] + + if self.blocksize: + bend = min(self.size, end + self.blocksize) + else: + bend = end + + if bend == start or start > self.size: + return b"" + + if (self.start is None or start < self.start) and ( + self.end is None or end > self.end + ): + # First read, or extending both before and after + self.total_requested_bytes += bend - start + self.miss_count += 1 + self.cache = self.fetcher(start, bend) + self.start = start + else: + assert self.start is not None + assert self.end is not None + self.miss_count += 1 + + if start < self.start: + if self.end is None or self.end - end > self.blocksize: + self.total_requested_bytes += bend - start + self.cache = self.fetcher(start, bend) + self.start = start + else: + self.total_requested_bytes += self.start - start + new = self.fetcher(start, self.start) + self.start = start + self.cache = new + self.cache + elif self.end is not None and bend > self.end: + if self.end > self.size: + pass + elif end - self.end > self.blocksize: + self.total_requested_bytes += bend - start + self.cache = self.fetcher(start, bend) + self.start = start + else: + self.total_requested_bytes += bend - self.end + new = self.fetcher(self.end, bend) + self.cache = self.cache + new + + self.end = self.start + len(self.cache) + offset = start - self.start + out = self.cache[offset : offset + end - start] + if self.trim: + num = (self.end - self.start) // (self.blocksize + 1) + if num > 1: + self.start += self.blocksize * num + self.cache = self.cache[self.blocksize * num :] + return out + + def __len__(self) -> int: + return len(self.cache) + + +class AllBytes(BaseCache): + """Cache entire contents of the file""" + + name: ClassVar[str] = "all" + + def __init__( + self, + blocksize: int | None = None, + fetcher: Fetcher | None = None, + size: int | None = None, + data: bytes | None = None, + ) -> None: + super().__init__(blocksize, fetcher, size) # type: ignore[arg-type] + if data is None: + self.miss_count += 1 + self.total_requested_bytes += self.size + data = self.fetcher(0, self.size) + self.data = data + + def _fetch(self, start: int | None, stop: int | None) -> bytes: + self.hit_count += 1 + return self.data[start:stop] + + +class KnownPartsOfAFile(BaseCache): + """ + Cache holding known file parts. + + Parameters + ---------- + blocksize: int + How far to read ahead in numbers of bytes + fetcher: func + Function of the form f(start, end) which gets bytes from remote as + specified + size: int + How big this file is + data: dict + A dictionary mapping explicit `(start, stop)` file-offset tuples + with known bytes. + strict: bool, default True + Whether to fetch reads that go beyond a known byte-range boundary. + If `False`, any read that ends outside a known part will be zero + padded. Note that zero padding will not be used for reads that + begin outside a known byte-range. 
+ """ + + name: ClassVar[str] = "parts" + + def __init__( + self, + blocksize: int, + fetcher: Fetcher, + size: int, + data: Optional[dict[tuple[int, int], bytes]] = None, + strict: bool = True, + **_: Any, + ): + super().__init__(blocksize, fetcher, size) + self.strict = strict + + # simple consolidation of contiguous blocks + if data: + old_offsets = sorted(data.keys()) + offsets = [old_offsets[0]] + blocks = [data.pop(old_offsets[0])] + for start, stop in old_offsets[1:]: + start0, stop0 = offsets[-1] + if start == stop0: + offsets[-1] = (start0, stop) + blocks[-1] += data.pop((start, stop)) + else: + offsets.append((start, stop)) + blocks.append(data.pop((start, stop))) + + self.data = dict(zip(offsets, blocks)) + else: + self.data = {} + + def _fetch(self, start: int | None, stop: int | None) -> bytes: + if start is None: + start = 0 + if stop is None: + stop = self.size + + out = b"" + for (loc0, loc1), data in self.data.items(): + # If self.strict=False, use zero-padded data + # for reads beyond the end of a "known" buffer + if loc0 <= start < loc1: + off = start - loc0 + out = data[off : off + stop - start] + if not self.strict or loc0 <= stop <= loc1: + # The request is within a known range, or + # it begins within a known range, and we + # are allowed to pad reads beyond the + # buffer with zero + out += b"\x00" * (stop - start - len(out)) + self.hit_count += 1 + return out + else: + # The request ends outside a known range, + # and we are being "strict" about reads + # beyond the buffer + start = loc1 + break + + # We only get here if there is a request outside the + # known parts of the file. In an ideal world, this + # should never happen + if self.fetcher is None: + # We cannot fetch the data, so raise an error + raise ValueError(f"Read is outside the known file parts: {(start, stop)}. ") + # We can fetch the data, but should warn the user + # that this may be slow + warnings.warn( + f"Read is outside the known file parts: {(start, stop)}. " + f"IO/caching performance may be poor!" 
+ ) + logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}") + self.total_requested_bytes += stop - start + self.miss_count += 1 + return out + super()._fetch(start, stop) + + +class UpdatableLRU(Generic[P, T]): + """ + Custom implementation of LRU cache that allows updating keys + + Used by BackgroudBlockCache + """ + + class CacheInfo(NamedTuple): + hits: int + misses: int + maxsize: int + currsize: int + + def __init__(self, func: Callable[P, T], max_size: int = 128) -> None: + self._cache: OrderedDict[Any, T] = collections.OrderedDict() + self._func = func + self._max_size = max_size + self._hits = 0 + self._misses = 0 + self._lock = threading.Lock() + + def __call__(self, *args: P.args, **kwargs: P.kwargs) -> T: + if kwargs: + raise TypeError(f"Got unexpected keyword argument {kwargs.keys()}") + with self._lock: + if args in self._cache: + self._cache.move_to_end(args) + self._hits += 1 + return self._cache[args] + + result = self._func(*args, **kwargs) + + with self._lock: + self._cache[args] = result + self._misses += 1 + if len(self._cache) > self._max_size: + self._cache.popitem(last=False) + + return result + + def is_key_cached(self, *args: Any) -> bool: + with self._lock: + return args in self._cache + + def add_key(self, result: T, *args: Any) -> None: + with self._lock: + self._cache[args] = result + if len(self._cache) > self._max_size: + self._cache.popitem(last=False) + + def cache_info(self) -> UpdatableLRU.CacheInfo: + with self._lock: + return self.CacheInfo( + maxsize=self._max_size, + currsize=len(self._cache), + hits=self._hits, + misses=self._misses, + ) + + +class BackgroundBlockCache(BaseCache): + """ + Cache holding memory as a set of blocks with pre-loading of + the next block in the background. + + Requests are only ever made ``blocksize`` at a time, and are + stored in an LRU cache. The least recently accessed block is + discarded when more than ``maxblocks`` are stored. If the + next block is not in cache, it is loaded in a separate thread + in non-blocking way. + + Parameters + ---------- + blocksize : int + The number of bytes to store in each block. + Requests are only ever made for ``blocksize``, so this + should balance the overhead of making a request against + the granularity of the blocks. + fetcher : Callable + size : int + The total size of the file being cached. + maxblocks : int + The maximum number of blocks to cache for. The maximum memory + use for this cache is then ``blocksize * maxblocks``. + """ + + name: ClassVar[str] = "background" + + def __init__( + self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32 + ) -> None: + super().__init__(blocksize, fetcher, size) + self.nblocks = math.ceil(size / blocksize) + self.maxblocks = maxblocks + self._fetch_block_cached = UpdatableLRU(self._fetch_block, maxblocks) + + self._thread_executor = ThreadPoolExecutor(max_workers=1) + self._fetch_future_block_number: int | None = None + self._fetch_future: Future[bytes] | None = None + self._fetch_future_lock = threading.Lock() + + def cache_info(self) -> UpdatableLRU.CacheInfo: + """ + The statistics on the block cache. + + Returns + ------- + NamedTuple + Returned directly from the LRU Cache used internally. 
+ """ + return self._fetch_block_cached.cache_info() + + def __getstate__(self) -> dict[str, Any]: + state = self.__dict__ + del state["_fetch_block_cached"] + del state["_thread_executor"] + del state["_fetch_future_block_number"] + del state["_fetch_future"] + del state["_fetch_future_lock"] + return state + + def __setstate__(self, state) -> None: + self.__dict__.update(state) + self._fetch_block_cached = UpdatableLRU(self._fetch_block, state["maxblocks"]) + self._thread_executor = ThreadPoolExecutor(max_workers=1) + self._fetch_future_block_number = None + self._fetch_future = None + self._fetch_future_lock = threading.Lock() + + def _fetch(self, start: int | None, end: int | None) -> bytes: + if start is None: + start = 0 + if end is None: + end = self.size + if start >= self.size or start >= end: + return b"" + + # byte position -> block numbers + start_block_number = start // self.blocksize + end_block_number = end // self.blocksize + + fetch_future_block_number = None + fetch_future = None + with self._fetch_future_lock: + # Background thread is running. Check we we can or must join it. + if self._fetch_future is not None: + assert self._fetch_future_block_number is not None + if self._fetch_future.done(): + logger.info("BlockCache joined background fetch without waiting.") + self._fetch_block_cached.add_key( + self._fetch_future.result(), self._fetch_future_block_number + ) + # Cleanup the fetch variables. Done with fetching the block. + self._fetch_future_block_number = None + self._fetch_future = None + else: + # Must join if we need the block for the current fetch + must_join = bool( + start_block_number + <= self._fetch_future_block_number + <= end_block_number + ) + if must_join: + # Copy to the local variables to release lock + # before waiting for result + fetch_future_block_number = self._fetch_future_block_number + fetch_future = self._fetch_future + + # Cleanup the fetch variables. Have a local copy. + self._fetch_future_block_number = None + self._fetch_future = None + + # Need to wait for the future for the current read + if fetch_future is not None: + logger.info("BlockCache waiting for background fetch.") + # Wait until result and put it in cache + self._fetch_block_cached.add_key( + fetch_future.result(), fetch_future_block_number + ) + + # these are cached, so safe to do multiple calls for the same start and end. + for block_number in range(start_block_number, end_block_number + 1): + self._fetch_block_cached(block_number) + + # fetch next block in the background if nothing is running in the background, + # the block is within file and it is not already cached + end_block_plus_1 = end_block_number + 1 + with self._fetch_future_lock: + if ( + self._fetch_future is None + and end_block_plus_1 <= self.nblocks + and not self._fetch_block_cached.is_key_cached(end_block_plus_1) + ): + self._fetch_future_block_number = end_block_plus_1 + self._fetch_future = self._thread_executor.submit( + self._fetch_block, end_block_plus_1, "async" + ) + + return self._read_cache( + start, + end, + start_block_number=start_block_number, + end_block_number=end_block_number, + ) + + def _fetch_block(self, block_number: int, log_info: str = "sync") -> bytes: + """ + Fetch the block of data for `block_number`. 
+ """ + if block_number > self.nblocks: + raise ValueError( + f"'block_number={block_number}' is greater than " + f"the number of blocks ({self.nblocks})" + ) + + start = block_number * self.blocksize + end = start + self.blocksize + logger.info("BlockCache fetching block (%s) %d", log_info, block_number) + self.total_requested_bytes += end - start + self.miss_count += 1 + block_contents = super()._fetch(start, end) + return block_contents + + def _read_cache( + self, start: int, end: int, start_block_number: int, end_block_number: int + ) -> bytes: + """ + Read from our block cache. + + Parameters + ---------- + start, end : int + The start and end byte positions. + start_block_number, end_block_number : int + The start and end block numbers. + """ + start_pos = start % self.blocksize + end_pos = end % self.blocksize + + # kind of pointless to count this as a hit, but it is + self.hit_count += 1 + + if start_block_number == end_block_number: + block = self._fetch_block_cached(start_block_number) + return block[start_pos:end_pos] + + else: + # read from the initial + out = [self._fetch_block_cached(start_block_number)[start_pos:]] + + # intermediate blocks + # Note: it'd be nice to combine these into one big request. However + # that doesn't play nicely with our LRU cache. + out.extend( + map( + self._fetch_block_cached, + range(start_block_number + 1, end_block_number), + ) + ) + + # final block + out.append(self._fetch_block_cached(end_block_number)[:end_pos]) + + return b"".join(out) + + +caches: dict[str | None, type[BaseCache]] = { + # one custom case + None: BaseCache, +} + + +def register_cache(cls: type[BaseCache], clobber: bool = False) -> None: + """'Register' cache implementation. + + Parameters + ---------- + clobber: bool, optional + If set to True (default is False) - allow to overwrite existing + entry. + + Raises + ------ + ValueError + """ + name = cls.name + if not clobber and name in caches: + raise ValueError(f"Cache with name {name!r} is already known: {caches[name]}") + caches[name] = cls + + +for c in ( + BaseCache, + MMapCache, + BytesCache, + ReadAheadCache, + BlockCache, + FirstChunkCache, + AllBytes, + KnownPartsOfAFile, + BackgroundBlockCache, +): + register_cache(c) diff --git a/.venv/lib/python3.11/site-packages/fsspec/callbacks.py b/.venv/lib/python3.11/site-packages/fsspec/callbacks.py new file mode 100644 index 0000000000000000000000000000000000000000..7ca99ca6ac3cd69b28bcd1550f6550e8e648c5fe --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/callbacks.py @@ -0,0 +1,324 @@ +from functools import wraps + + +class Callback: + """ + Base class and interface for callback mechanism + + This class can be used directly for monitoring file transfers by + providing ``callback=Callback(hooks=...)`` (see the ``hooks`` argument, + below), or subclassed for more specialised behaviour. + + Parameters + ---------- + size: int (optional) + Nominal quantity for the value that corresponds to a complete + transfer, e.g., total number of tiles or total number of + bytes + value: int (0) + Starting internal counter value + hooks: dict or None + A dict of named functions to be called on each update. 
The signature + of these must be ``f(size, value, **kwargs)`` + """ + + def __init__(self, size=None, value=0, hooks=None, **kwargs): + self.size = size + self.value = value + self.hooks = hooks or {} + self.kw = kwargs + + def __enter__(self): + return self + + def __exit__(self, *exc_args): + self.close() + + def close(self): + """Close callback.""" + + def branched(self, path_1, path_2, **kwargs): + """ + Return callback for child transfers + + If this callback is operating at a higher level, e.g., put, which may + trigger transfers that can also be monitored. The function returns a callback + that has to be passed to the child method, e.g., put_file, + as `callback=` argument. + + The implementation uses `callback.branch` for compatibility. + When implementing callbacks, it is recommended to override this function instead + of `branch` and avoid calling `super().branched(...)`. + + Prefer using this function over `branch`. + + Parameters + ---------- + path_1: str + Child's source path + path_2: str + Child's destination path + **kwargs: + Arbitrary keyword arguments + + Returns + ------- + callback: Callback + A callback instance to be passed to the child method + """ + self.branch(path_1, path_2, kwargs) + # mutate kwargs so that we can force the caller to pass "callback=" explicitly + return kwargs.pop("callback", DEFAULT_CALLBACK) + + def branch_coro(self, fn): + """ + Wraps a coroutine, and pass a new child callback to it. + """ + + @wraps(fn) + async def func(path1, path2: str, **kwargs): + with self.branched(path1, path2, **kwargs) as child: + return await fn(path1, path2, callback=child, **kwargs) + + return func + + def set_size(self, size): + """ + Set the internal maximum size attribute + + Usually called if not initially set at instantiation. Note that this + triggers a ``call()``. + + Parameters + ---------- + size: int + """ + self.size = size + self.call() + + def absolute_update(self, value): + """ + Set the internal value state + + Triggers ``call()`` + + Parameters + ---------- + value: int + """ + self.value = value + self.call() + + def relative_update(self, inc=1): + """ + Delta increment the internal counter + + Triggers ``call()`` + + Parameters + ---------- + inc: int + """ + self.value += inc + self.call() + + def call(self, hook_name=None, **kwargs): + """ + Execute hook(s) with current state + + Each function is passed the internal size and current value + + Parameters + ---------- + hook_name: str or None + If given, execute on this hook + kwargs: passed on to (all) hook(s) + """ + if not self.hooks: + return + kw = self.kw.copy() + kw.update(kwargs) + if hook_name: + if hook_name not in self.hooks: + return + return self.hooks[hook_name](self.size, self.value, **kw) + for hook in self.hooks.values() or []: + hook(self.size, self.value, **kw) + + def wrap(self, iterable): + """ + Wrap an iterable to call ``relative_update`` on each iterations + + Parameters + ---------- + iterable: Iterable + The iterable that is being wrapped + """ + for item in iterable: + self.relative_update() + yield item + + def branch(self, path_1, path_2, kwargs): + """ + Set callbacks for child transfers + + If this callback is operating at a higher level, e.g., put, which may + trigger transfers that can also be monitored. The passed kwargs are + to be *mutated* to add ``callback=``, if this class supports branching + to children. 
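+
+        As an illustrative sketch (mirroring ``DotPrinterCallback`` further
+        below), a subclass that wants child transfers monitored could do::
+
+            def branch(self, path_1, path_2, kwargs):
+                kwargs["callback"] = Callback()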
+ + Parameters + ---------- + path_1: str + Child's source path + path_2: str + Child's destination path + kwargs: dict + arguments passed to child method, e.g., put_file. + + Returns + ------- + + """ + return None + + def no_op(self, *_, **__): + pass + + def __getattr__(self, item): + """ + If undefined methods are called on this class, nothing happens + """ + return self.no_op + + @classmethod + def as_callback(cls, maybe_callback=None): + """Transform callback=... into Callback instance + + For the special value of ``None``, return the global instance of + ``NoOpCallback``. This is an alternative to including + ``callback=DEFAULT_CALLBACK`` directly in a method signature. + """ + if maybe_callback is None: + return DEFAULT_CALLBACK + return maybe_callback + + +class NoOpCallback(Callback): + """ + This implementation of Callback does exactly nothing + """ + + def call(self, *args, **kwargs): + return None + + +class DotPrinterCallback(Callback): + """ + Simple example Callback implementation + + Almost identical to Callback with a hook that prints a char; here we + demonstrate how the outer layer may print "#" and the inner layer "." + """ + + def __init__(self, chr_to_print="#", **kwargs): + self.chr = chr_to_print + super().__init__(**kwargs) + + def branch(self, path_1, path_2, kwargs): + """Mutate kwargs to add new instance with different print char""" + kwargs["callback"] = DotPrinterCallback(".") + + def call(self, **kwargs): + """Just outputs a character""" + print(self.chr, end="") + + +class TqdmCallback(Callback): + """ + A callback to display a progress bar using tqdm + + Parameters + ---------- + tqdm_kwargs : dict, (optional) + Any argument accepted by the tqdm constructor. + See the `tqdm doc `_. + Will be forwarded to `tqdm_cls`. + tqdm_cls: (optional) + subclass of `tqdm.tqdm`. If not passed, it will default to `tqdm.tqdm`. + + Examples + -------- + >>> import fsspec + >>> from fsspec.callbacks import TqdmCallback + >>> fs = fsspec.filesystem("memory") + >>> path2distant_data = "/your-path" + >>> fs.upload( + ".", + path2distant_data, + recursive=True, + callback=TqdmCallback(), + ) + + You can forward args to tqdm using the ``tqdm_kwargs`` parameter. + + >>> fs.upload( + ".", + path2distant_data, + recursive=True, + callback=TqdmCallback(tqdm_kwargs={"desc": "Your tqdm description"}), + ) + + You can also customize the progress bar by passing a subclass of `tqdm`. + + .. 
code-block:: python + + class TqdmFormat(tqdm): + '''Provides a `total_time` format parameter''' + @property + def format_dict(self): + d = super().format_dict + total_time = d["elapsed"] * (d["total"] or 0) / max(d["n"], 1) + d.update(total_time=self.format_interval(total_time) + " in total") + return d + + >>> with TqdmCallback( + tqdm_kwargs={ + "desc": "desc", + "bar_format": "{total_time}: {percentage:.0f}%|{bar}{r_bar}", + }, + tqdm_cls=TqdmFormat, + ) as callback: + fs.upload(".", path2distant_data, recursive=True, callback=callback) + """ + + def __init__(self, tqdm_kwargs=None, *args, **kwargs): + try: + from tqdm import tqdm + + except ImportError as exce: + raise ImportError( + "Using TqdmCallback requires tqdm to be installed" + ) from exce + + self._tqdm_cls = kwargs.pop("tqdm_cls", tqdm) + self._tqdm_kwargs = tqdm_kwargs or {} + self.tqdm = None + super().__init__(*args, **kwargs) + + def call(self, *args, **kwargs): + if self.tqdm is None: + self.tqdm = self._tqdm_cls(total=self.size, **self._tqdm_kwargs) + self.tqdm.total = self.size + self.tqdm.update(self.value - self.tqdm.n) + + def close(self): + if self.tqdm is not None: + self.tqdm.close() + self.tqdm = None + + def __del__(self): + return self.close() + + +DEFAULT_CALLBACK = _DEFAULT_CALLBACK = NoOpCallback() diff --git a/.venv/lib/python3.11/site-packages/fsspec/compression.py b/.venv/lib/python3.11/site-packages/fsspec/compression.py new file mode 100644 index 0000000000000000000000000000000000000000..fc519c24532fdcd3f6f1b3fc645d38e1e1dde1d7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/compression.py @@ -0,0 +1,175 @@ +"""Helper functions for a standard streaming compression API""" + +from zipfile import ZipFile + +import fsspec.utils +from fsspec.spec import AbstractBufferedFile + + +def noop_file(file, mode, **kwargs): + return file + + +# TODO: files should also be available as contexts +# should be functions of the form func(infile, mode=, **kwargs) -> file-like +compr = {None: noop_file} + + +def register_compression(name, callback, extensions, force=False): + """Register an "inferable" file compression type. + + Registers transparent file compression type for use with fsspec.open. + Compression can be specified by name in open, or "infer"-ed for any files + ending with the given extensions. + + Args: + name: (str) The compression type name. Eg. "gzip". + callback: A callable of form (infile, mode, **kwargs) -> file-like. + Accepts an input file-like object, the target mode and kwargs. + Returns a wrapped file-like object. + extensions: (str, Iterable[str]) A file extension, or list of file + extensions for which to infer this compression scheme. Eg. "gz". + force: (bool) Force re-registration of compression type or extensions. + + Raises: + ValueError: If name or extensions already registered, and not force. 
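+
+    Example:
+        A minimal sketch; ``mycodec_file`` and the ``.mcd`` extension are
+        hypothetical, purely for illustration::
+
+            def mycodec_file(infile, mode="rb", **kwargs):
+                return infile  # wrap or unwrap the stream here
+
+            register_compression("mycodec", mycodec_file, "mcd")
+            # fsspec.open("data.txt.mcd", compression="infer") now routes
+            # through mycodec_file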
+ + """ + if isinstance(extensions, str): + extensions = [extensions] + + # Validate registration + if name in compr and not force: + raise ValueError(f"Duplicate compression registration: {name}") + + for ext in extensions: + if ext in fsspec.utils.compressions and not force: + raise ValueError(f"Duplicate compression file extension: {ext} ({name})") + + compr[name] = callback + + for ext in extensions: + fsspec.utils.compressions[ext] = name + + +def unzip(infile, mode="rb", filename=None, **kwargs): + if "r" not in mode: + filename = filename or "file" + z = ZipFile(infile, mode="w", **kwargs) + fo = z.open(filename, mode="w") + fo.close = lambda closer=fo.close: closer() or z.close() + return fo + z = ZipFile(infile) + if filename is None: + filename = z.namelist()[0] + return z.open(filename, mode="r", **kwargs) + + +register_compression("zip", unzip, "zip") + +try: + from bz2 import BZ2File +except ImportError: + pass +else: + register_compression("bz2", BZ2File, "bz2") + +try: # pragma: no cover + from isal import igzip + + def isal(infile, mode="rb", **kwargs): + return igzip.IGzipFile(fileobj=infile, mode=mode, **kwargs) + + register_compression("gzip", isal, "gz") +except ImportError: + from gzip import GzipFile + + register_compression( + "gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz" + ) + +try: + from lzma import LZMAFile + + register_compression("lzma", LZMAFile, "lzma") + register_compression("xz", LZMAFile, "xz") +except ImportError: + pass + +try: + import lzmaffi + + register_compression("lzma", lzmaffi.LZMAFile, "lzma", force=True) + register_compression("xz", lzmaffi.LZMAFile, "xz", force=True) +except ImportError: + pass + + +class SnappyFile(AbstractBufferedFile): + def __init__(self, infile, mode, **kwargs): + import snappy + + super().__init__( + fs=None, path="snappy", mode=mode.strip("b") + "b", size=999999999, **kwargs + ) + self.infile = infile + if "r" in mode: + self.codec = snappy.StreamDecompressor() + else: + self.codec = snappy.StreamCompressor() + + def _upload_chunk(self, final=False): + self.buffer.seek(0) + out = self.codec.add_chunk(self.buffer.read()) + self.infile.write(out) + return True + + def seek(self, loc, whence=0): + raise NotImplementedError("SnappyFile is not seekable") + + def seekable(self): + return False + + def _fetch_range(self, start, end): + """Get the specified set of bytes from remote""" + data = self.infile.read(end - start) + return self.codec.decompress(data) + + +try: + import snappy + + snappy.compress(b"") + # Snappy may use the .sz file extension, but this is not part of the + # standard implementation. 
+ register_compression("snappy", SnappyFile, []) + +except (ImportError, NameError, AttributeError): + pass + +try: + import lz4.frame + + register_compression("lz4", lz4.frame.open, "lz4") +except ImportError: + pass + +try: + import zstandard as zstd + + def zstandard_file(infile, mode="rb"): + if "r" in mode: + cctx = zstd.ZstdDecompressor() + return cctx.stream_reader(infile) + else: + cctx = zstd.ZstdCompressor(level=10) + return cctx.stream_writer(infile) + + register_compression("zstd", zstandard_file, "zst") +except ImportError: + pass + + +def available_compressions(): + """Return a list of the implemented compressions.""" + return list(compr) diff --git a/.venv/lib/python3.11/site-packages/fsspec/config.py b/.venv/lib/python3.11/site-packages/fsspec/config.py new file mode 100644 index 0000000000000000000000000000000000000000..76d9af14aaf7df47c4551c169f27b05abf9c269e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/config.py @@ -0,0 +1,131 @@ +from __future__ import annotations + +import configparser +import json +import os +import warnings +from typing import Any + +conf: dict[str, dict[str, Any]] = {} +default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/fsspec") +conf_dir = os.environ.get("FSSPEC_CONFIG_DIR", default_conf_dir) + + +def set_conf_env(conf_dict, envdict=os.environ): + """Set config values from environment variables + + Looks for variables of the form ``FSSPEC_`` and + ``FSSPEC__``. For ``FSSPEC_`` the value is parsed + as a json dictionary and used to ``update`` the config of the + corresponding protocol. For ``FSSPEC__`` there is no + attempt to convert the string value, but the kwarg keys will be lower-cased. + + The ``FSSPEC__`` variables are applied after the + ``FSSPEC_`` ones. + + Parameters + ---------- + conf_dict : dict(str, dict) + This dict will be mutated + envdict : dict-like(str, str) + Source for the values - usually the real environment + """ + kwarg_keys = [] + for key in envdict: + if key.startswith("FSSPEC_") and len(key) > 7 and key[7] != "_": + if key.count("_") > 1: + kwarg_keys.append(key) + continue + try: + value = json.loads(envdict[key]) + except json.decoder.JSONDecodeError as ex: + warnings.warn( + f"Ignoring environment variable {key} due to a parse failure: {ex}" + ) + else: + if isinstance(value, dict): + _, proto = key.split("_", 1) + conf_dict.setdefault(proto.lower(), {}).update(value) + else: + warnings.warn( + f"Ignoring environment variable {key} due to not being a dict:" + f" {type(value)}" + ) + elif key.startswith("FSSPEC"): + warnings.warn( + f"Ignoring environment variable {key} due to having an unexpected name" + ) + + for key in kwarg_keys: + _, proto, kwarg = key.split("_", 2) + conf_dict.setdefault(proto.lower(), {})[kwarg.lower()] = envdict[key] + + +def set_conf_files(cdir, conf_dict): + """Set config values from files + + Scans for INI and JSON files in the given dictionary, and uses their + contents to set the config. In case of repeated values, later values + win. + + In the case of INI files, all values are strings, and these will not + be converted. 
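+
+    As an illustrative sketch (the file name and contents are hypothetical),
+    a file ``~/.config/fsspec/conf.json`` containing::
+
+        {"ftp": {"port": 2121}}
+
+    would update ``conf_dict["ftp"]`` with ``{"port": 2121}``.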
+ + Parameters + ---------- + cdir : str + Directory to search + conf_dict : dict(str, dict) + This dict will be mutated + """ + if not os.path.isdir(cdir): + return + allfiles = sorted(os.listdir(cdir)) + for fn in allfiles: + if fn.endswith(".ini"): + ini = configparser.ConfigParser() + ini.read(os.path.join(cdir, fn)) + for key in ini: + if key == "DEFAULT": + continue + conf_dict.setdefault(key, {}).update(dict(ini[key])) + if fn.endswith(".json"): + with open(os.path.join(cdir, fn)) as f: + js = json.load(f) + for key in js: + conf_dict.setdefault(key, {}).update(dict(js[key])) + + +def apply_config(cls, kwargs, conf_dict=None): + """Supply default values for kwargs when instantiating class + + Augments the passed kwargs, by finding entries in the config dict + which match the classes ``.protocol`` attribute (one or more str) + + Parameters + ---------- + cls : file system implementation + kwargs : dict + conf_dict : dict of dict + Typically this is the global configuration + + Returns + ------- + dict : the modified set of kwargs + """ + if conf_dict is None: + conf_dict = conf + protos = cls.protocol if isinstance(cls.protocol, (tuple, list)) else [cls.protocol] + kw = {} + for proto in protos: + # default kwargs from the current state of the config + if proto in conf_dict: + kw.update(conf_dict[proto]) + # explicit kwargs always win + kw.update(**kwargs) + kwargs = kw + return kwargs + + +set_conf_files(conf_dir, conf) +set_conf_env(conf) diff --git a/.venv/lib/python3.11/site-packages/fsspec/conftest.py b/.venv/lib/python3.11/site-packages/fsspec/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..6874a42c4895c3c7b973dc5d63fd4488a4e60b44 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/conftest.py @@ -0,0 +1,55 @@ +import os +import shutil +import subprocess +import sys +import time + +import pytest + +import fsspec +from fsspec.implementations.cached import CachingFileSystem + + +@pytest.fixture() +def m(): + """ + Fixture providing a memory filesystem. + """ + m = fsspec.filesystem("memory") + m.store.clear() + m.pseudo_dirs.clear() + m.pseudo_dirs.append("") + try: + yield m + finally: + m.store.clear() + m.pseudo_dirs.clear() + m.pseudo_dirs.append("") + + +@pytest.fixture +def ftp_writable(tmpdir): + """ + Fixture providing a writable FTP filesystem. 
+ """ + pytest.importorskip("pyftpdlib") + from fsspec.implementations.ftp import FTPFileSystem + + FTPFileSystem.clear_instance_cache() # remove lingering connections + CachingFileSystem.clear_instance_cache() + d = str(tmpdir) + with open(os.path.join(d, "out"), "wb") as f: + f.write(b"hello" * 10000) + P = subprocess.Popen( + [sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"] + ) + try: + time.sleep(1) + yield "localhost", 2121, "user", "pass" + finally: + P.terminate() + P.wait() + try: + shutil.rmtree(tmpdir) + except Exception: + pass diff --git a/.venv/lib/python3.11/site-packages/fsspec/core.py b/.venv/lib/python3.11/site-packages/fsspec/core.py new file mode 100644 index 0000000000000000000000000000000000000000..1d3c04c6fd39251dabcb038543682cdfacf02feb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/core.py @@ -0,0 +1,743 @@ +from __future__ import annotations + +import io +import logging +import os +import re +from glob import has_magic +from pathlib import Path + +# for backwards compat, we export cache things from here too +from fsspec.caching import ( # noqa: F401 + BaseCache, + BlockCache, + BytesCache, + MMapCache, + ReadAheadCache, + caches, +) +from fsspec.compression import compr +from fsspec.config import conf +from fsspec.registry import filesystem, get_filesystem_class +from fsspec.utils import ( + _unstrip_protocol, + build_name_function, + infer_compression, + stringify_path, +) + +logger = logging.getLogger("fsspec") + + +class OpenFile: + """ + File-like object to be used in a context + + Can layer (buffered) text-mode and compression over any file-system, which + are typically binary-only. + + These instances are safe to serialize, as the low-level file object + is not created until invoked using ``with``. + + Parameters + ---------- + fs: FileSystem + The file system to use for opening the file. Should be a subclass or duck-type + with ``fsspec.spec.AbstractFileSystem`` + path: str + Location to open + mode: str like 'rb', optional + Mode of the opened file + compression: str or None, optional + Compression to apply + encoding: str or None, optional + The encoding to use if opened in text mode. + errors: str or None, optional + How to handle encoding errors if opened in text mode. + newline: None or str + Passed to TextIOWrapper in text mode, how to handle line endings. + autoopen: bool + If True, calls open() immediately. Mostly used by pickle + pos: int + If given and autoopen is True, seek to this location immediately + """ + + def __init__( + self, + fs, + path, + mode="rb", + compression=None, + encoding=None, + errors=None, + newline=None, + ): + self.fs = fs + self.path = path + self.mode = mode + self.compression = get_compression(path, compression) + self.encoding = encoding + self.errors = errors + self.newline = newline + self.fobjects = [] + + def __reduce__(self): + return ( + OpenFile, + ( + self.fs, + self.path, + self.mode, + self.compression, + self.encoding, + self.errors, + self.newline, + ), + ) + + def __repr__(self): + return f"" + + def __enter__(self): + mode = self.mode.replace("t", "").replace("b", "") + "b" + + try: + f = self.fs.open(self.path, mode=mode) + except FileNotFoundError as e: + if has_magic(self.path): + raise FileNotFoundError( + "%s not found. The URL contains glob characters: you maybe needed\n" + "to pass expand=True in fsspec.open() or the storage_options of \n" + "your library. 
You can also set the config value 'open_expand'\n" + "before import, or fsspec.core.DEFAULT_EXPAND at runtime, to True.", + self.path, + ) from e + raise + + self.fobjects = [f] + + if self.compression is not None: + compress = compr[self.compression] + f = compress(f, mode=mode[0]) + self.fobjects.append(f) + + if "b" not in self.mode: + # assume, for example, that 'r' is equivalent to 'rt' as in builtin + f = PickleableTextIOWrapper( + f, encoding=self.encoding, errors=self.errors, newline=self.newline + ) + self.fobjects.append(f) + + return self.fobjects[-1] + + def __exit__(self, *args): + self.close() + + @property + def full_name(self): + return _unstrip_protocol(self.path, self.fs) + + def open(self): + """Materialise this as a real open file without context + + The OpenFile object should be explicitly closed to avoid enclosed file + instances persisting. You must, therefore, keep a reference to the OpenFile + during the life of the file-like it generates. + """ + return self.__enter__() + + def close(self): + """Close all encapsulated file objects""" + for f in reversed(self.fobjects): + if "r" not in self.mode and not f.closed: + f.flush() + f.close() + self.fobjects.clear() + + +class OpenFiles(list): + """List of OpenFile instances + + Can be used in a single context, which opens and closes all of the + contained files. Normal list access to get the elements works as + normal. + + A special case is made for caching filesystems - the files will + be down/uploaded together at the start or end of the context, and + this may happen concurrently, if the target filesystem supports it. + """ + + def __init__(self, *args, mode="rb", fs=None): + self.mode = mode + self.fs = fs + self.files = [] + super().__init__(*args) + + def __enter__(self): + if self.fs is None: + raise ValueError("Context has already been used") + + fs = self.fs + while True: + if hasattr(fs, "open_many"): + # check for concurrent cache download; or set up for upload + self.files = fs.open_many(self) + return self.files + if hasattr(fs, "fs") and fs.fs is not None: + fs = fs.fs + else: + break + return [s.__enter__() for s in self] + + def __exit__(self, *args): + fs = self.fs + [s.__exit__(*args) for s in self] + if "r" not in self.mode: + while True: + if hasattr(fs, "open_many"): + # check for concurrent cache upload + fs.commit_many(self.files) + return + if hasattr(fs, "fs") and fs.fs is not None: + fs = fs.fs + else: + break + + def __getitem__(self, item): + out = super().__getitem__(item) + if isinstance(item, slice): + return OpenFiles(out, mode=self.mode, fs=self.fs) + return out + + def __repr__(self): + return f"" + + +def open_files( + urlpath, + mode="rb", + compression=None, + encoding="utf8", + errors=None, + name_function=None, + num=1, + protocol=None, + newline=None, + auto_mkdir=True, + expand=True, + **kwargs, +): + """Given a path or paths, return a list of ``OpenFile`` objects. + + For writing, a str path must contain the "*" character, which will be filled + in by increasing numbers, e.g., "part*" -> "part1", "part2" if num=2. + + For either reading or writing, can instead provide explicit list of paths. + + Parameters + ---------- + urlpath: string or list + Absolute or relative filepath(s). Prefix with a protocol like ``s3://`` + to read from alternative filesystems. To read from multiple files you + can pass a globstring or a list of paths, with the caveat that they + must all have the same protocol. + mode: 'rb', 'wt', etc. 
+ compression: string or None + If given, open file using compression codec. Can either be a compression + name (a key in ``fsspec.compression.compr``) or "infer" to guess the + compression from the filename suffix. + encoding: str + For text mode only + errors: None or str + Passed to TextIOWrapper in text mode + name_function: function or None + if opening a set of files for writing, those files do not yet exist, + so we need to generate their names by formatting the urlpath for + each sequence number + num: int [1] + if writing mode, number of files we expect to create (passed to + name+function) + protocol: str or None + If given, overrides the protocol found in the URL. + newline: bytes or None + Used for line terminator in text mode. If None, uses system default; + if blank, uses no translation. + auto_mkdir: bool (True) + If in write mode, this will ensure the target directory exists before + writing, by calling ``fs.mkdirs(exist_ok=True)``. + expand: bool + **kwargs: dict + Extra options that make sense to a particular storage connection, e.g. + host, port, username, password, etc. + + Examples + -------- + >>> files = open_files('2015-*-*.csv') # doctest: +SKIP + >>> files = open_files( + ... 's3://bucket/2015-*-*.csv.gz', compression='gzip' + ... ) # doctest: +SKIP + + Returns + ------- + An ``OpenFiles`` instance, which is a list of ``OpenFile`` objects that can + be used as a single context + + Notes + ----- + For a full list of the available protocols and the implementations that + they map across to see the latest online documentation: + + - For implementations built into ``fsspec`` see + https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations + - For implementations in separate packages see + https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations + """ + fs, fs_token, paths = get_fs_token_paths( + urlpath, + mode, + num=num, + name_function=name_function, + storage_options=kwargs, + protocol=protocol, + expand=expand, + ) + if fs.protocol == "file": + fs.auto_mkdir = auto_mkdir + elif "r" not in mode and auto_mkdir: + parents = {fs._parent(path) for path in paths} + for parent in parents: + try: + fs.makedirs(parent, exist_ok=True) + except PermissionError: + pass + return OpenFiles( + [ + OpenFile( + fs, + path, + mode=mode, + compression=compression, + encoding=encoding, + errors=errors, + newline=newline, + ) + for path in paths + ], + mode=mode, + fs=fs, + ) + + +def _un_chain(path, kwargs): + # Avoid a circular import + from fsspec.implementations.cached import CachingFileSystem + + if "::" in path: + x = re.compile(".*[^a-z]+.*") # test for non protocol-like single word + bits = [] + for p in path.split("::"): + if "://" in p or x.match(p): + bits.append(p) + else: + bits.append(p + "://") + else: + bits = [path] + # [[url, protocol, kwargs], ...] 
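+    # e.g. for a hypothetical chained URL "filecache::s3://bucket/key" the
+    # result is roughly
+    #     [("bucket/key", "filecache", {...}), ("bucket/key", "s3", {...})]
+    # i.e. the outermost filesystem first, each with its own kwargs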
+ out = [] + previous_bit = None + kwargs = kwargs.copy() + for bit in reversed(bits): + protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file" + cls = get_filesystem_class(protocol) + extra_kwargs = cls._get_kwargs_from_urls(bit) + kws = kwargs.pop(protocol, {}) + if bit is bits[0]: + kws.update(kwargs) + kw = dict( + **{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]}, + **kws, + ) + bit = cls._strip_protocol(bit) + if "target_protocol" not in kw and issubclass(cls, CachingFileSystem): + bit = previous_bit + out.append((bit, protocol, kw)) + previous_bit = bit + out.reverse() + return out + + +def url_to_fs(url, **kwargs): + """ + Turn fully-qualified and potentially chained URL into filesystem instance + + Parameters + ---------- + url : str + The fsspec-compatible URL + **kwargs: dict + Extra options that make sense to a particular storage connection, e.g. + host, port, username, password, etc. + + Returns + ------- + filesystem : FileSystem + The new filesystem discovered from ``url`` and created with + ``**kwargs``. + urlpath : str + The file-systems-specific URL for ``url``. + """ + url = stringify_path(url) + # non-FS arguments that appear in fsspec.open() + # inspect could keep this in sync with open()'s signature + known_kwargs = { + "compression", + "encoding", + "errors", + "expand", + "mode", + "name_function", + "newline", + "num", + } + kwargs = {k: v for k, v in kwargs.items() if k not in known_kwargs} + chain = _un_chain(url, kwargs) + inkwargs = {} + # Reverse iterate the chain, creating a nested target_* structure + for i, ch in enumerate(reversed(chain)): + urls, protocol, kw = ch + if i == len(chain) - 1: + inkwargs = dict(**kw, **inkwargs) + continue + inkwargs["target_options"] = dict(**kw, **inkwargs) + inkwargs["target_protocol"] = protocol + inkwargs["fo"] = urls + urlpath, protocol, _ = chain[0] + fs = filesystem(protocol, **inkwargs) + return fs, urlpath + + +DEFAULT_EXPAND = conf.get("open_expand", False) + + +def open( + urlpath, + mode="rb", + compression=None, + encoding="utf8", + errors=None, + protocol=None, + newline=None, + expand=None, + **kwargs, +): + """Given a path or paths, return one ``OpenFile`` object. + + Parameters + ---------- + urlpath: string or list + Absolute or relative filepath. Prefix with a protocol like ``s3://`` + to read from alternative filesystems. Should not include glob + character(s). + mode: 'rb', 'wt', etc. + compression: string or None + If given, open file using compression codec. Can either be a compression + name (a key in ``fsspec.compression.compr``) or "infer" to guess the + compression from the filename suffix. + encoding: str + For text mode only + errors: None or str + Passed to TextIOWrapper in text mode + protocol: str or None + If given, overrides the protocol found in the URL. + newline: bytes or None + Used for line terminator in text mode. If None, uses system default; + if blank, uses no translation. + expand: bool or Nonw + Whether to regard file paths containing special glob characters as needing + expansion (finding the first match) or absolute. Setting False allows using + paths which do embed such characters. If None (default), this argument + takes its value from the DEFAULT_EXPAND module variable, which takes + its initial value from the "open_expand" config value at startup, which will + be False if not set. + **kwargs: dict + Extra options that make sense to a particular storage connection, e.g. + host, port, username, password, etc. 
+ + Examples + -------- + >>> openfile = open('2015-01-01.csv') # doctest: +SKIP + >>> openfile = open( + ... 's3://bucket/2015-01-01.csv.gz', compression='gzip' + ... ) # doctest: +SKIP + >>> with openfile as f: + ... df = pd.read_csv(f) # doctest: +SKIP + ... + + Returns + ------- + ``OpenFile`` object. + + Notes + ----- + For a full list of the available protocols and the implementations that + they map across to see the latest online documentation: + + - For implementations built into ``fsspec`` see + https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations + - For implementations in separate packages see + https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations + """ + expand = DEFAULT_EXPAND if expand is None else expand + out = open_files( + urlpath=[urlpath], + mode=mode, + compression=compression, + encoding=encoding, + errors=errors, + protocol=protocol, + newline=newline, + expand=expand, + **kwargs, + ) + if not out: + raise FileNotFoundError(urlpath) + return out[0] + + +def open_local( + url: str | list[str] | Path | list[Path], + mode: str = "rb", + **storage_options: dict, +) -> str | list[str]: + """Open file(s) which can be resolved to local + + For files which either are local, or get downloaded upon open + (e.g., by file caching) + + Parameters + ---------- + url: str or list(str) + mode: str + Must be read mode + storage_options: + passed on to FS for or used by open_files (e.g., compression) + """ + if "r" not in mode: + raise ValueError("Can only ensure local files when reading") + of = open_files(url, mode=mode, **storage_options) + if not getattr(of[0].fs, "local_file", False): + raise ValueError( + "open_local can only be used on a filesystem which" + " has attribute local_file=True" + ) + with of as files: + paths = [f.name for f in files] + if (isinstance(url, str) and not has_magic(url)) or isinstance(url, Path): + return paths[0] + return paths + + +def get_compression(urlpath, compression): + if compression == "infer": + compression = infer_compression(urlpath) + if compression is not None and compression not in compr: + raise ValueError(f"Compression type {compression} not supported") + return compression + + +def split_protocol(urlpath): + """Return protocol, path pair""" + urlpath = stringify_path(urlpath) + if "://" in urlpath: + protocol, path = urlpath.split("://", 1) + if len(protocol) > 1: + # excludes Windows paths + return protocol, path + if urlpath.startswith("data:"): + return urlpath.split(":", 1) + return None, urlpath + + +def strip_protocol(urlpath): + """Return only path part of full URL, according to appropriate backend""" + protocol, _ = split_protocol(urlpath) + cls = get_filesystem_class(protocol) + return cls._strip_protocol(urlpath) + + +def expand_paths_if_needed(paths, mode, num, fs, name_function): + """Expand paths if they have a ``*`` in them (write mode) or any of ``*?[]`` + in them (read mode). + + :param paths: list of paths + mode: str + Mode in which to open files. + num: int + If opening in writing mode, number of files we expect to create. + fs: filesystem object + name_function: callable + If opening in writing mode, this callable is used to generate path + names. Names are generated for each partition by + ``urlpath.replace('*', name_function(partition_index))``. 
+ :return: list of paths + """ + expanded_paths = [] + paths = list(paths) + + if "w" in mode: # read mode + if sum(1 for p in paths if "*" in p) > 1: + raise ValueError( + "When writing data, only one filename mask can be specified." + ) + num = max(num, len(paths)) + + for curr_path in paths: + if "*" in curr_path: + # expand using name_function + expanded_paths.extend(_expand_paths(curr_path, name_function, num)) + else: + expanded_paths.append(curr_path) + # if we generated more paths that asked for, trim the list + if len(expanded_paths) > num: + expanded_paths = expanded_paths[:num] + + else: # read mode + for curr_path in paths: + if has_magic(curr_path): + # expand using glob + expanded_paths.extend(fs.glob(curr_path)) + else: + expanded_paths.append(curr_path) + + return expanded_paths + + +def get_fs_token_paths( + urlpath, + mode="rb", + num=1, + name_function=None, + storage_options=None, + protocol=None, + expand=True, +): + """Filesystem, deterministic token, and paths from a urlpath and options. + + Parameters + ---------- + urlpath: string or iterable + Absolute or relative filepath, URL (may include protocols like + ``s3://``), or globstring pointing to data. + mode: str, optional + Mode in which to open files. + num: int, optional + If opening in writing mode, number of files we expect to create. + name_function: callable, optional + If opening in writing mode, this callable is used to generate path + names. Names are generated for each partition by + ``urlpath.replace('*', name_function(partition_index))``. + storage_options: dict, optional + Additional keywords to pass to the filesystem class. + protocol: str or None + To override the protocol specifier in the URL + expand: bool + Expand string paths for writing, assuming the path is a directory + """ + if isinstance(urlpath, (list, tuple, set)): + if not urlpath: + raise ValueError("empty urlpath sequence") + urlpath0 = stringify_path(next(iter(urlpath))) + else: + urlpath0 = stringify_path(urlpath) + storage_options = storage_options or {} + if protocol: + storage_options["protocol"] = protocol + chain = _un_chain(urlpath0, storage_options or {}) + inkwargs = {} + # Reverse iterate the chain, creating a nested target_* structure + for i, ch in enumerate(reversed(chain)): + urls, nested_protocol, kw = ch + if i == len(chain) - 1: + inkwargs = dict(**kw, **inkwargs) + continue + inkwargs["target_options"] = dict(**kw, **inkwargs) + inkwargs["target_protocol"] = nested_protocol + inkwargs["fo"] = urls + paths, protocol, _ = chain[0] + fs = filesystem(protocol, **inkwargs) + if isinstance(urlpath, (list, tuple, set)): + pchains = [ + _un_chain(stringify_path(u), storage_options or {})[0] for u in urlpath + ] + if len({pc[1] for pc in pchains}) > 1: + raise ValueError("Protocol mismatch getting fs from %s", urlpath) + paths = [pc[0] for pc in pchains] + else: + paths = fs._strip_protocol(paths) + if isinstance(paths, (list, tuple, set)): + if expand: + paths = expand_paths_if_needed(paths, mode, num, fs, name_function) + elif not isinstance(paths, list): + paths = list(paths) + else: + if ("w" in mode or "x" in mode) and expand: + paths = _expand_paths(paths, name_function, num) + elif "*" in paths: + paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)] + else: + paths = [paths] + + return fs, fs._fs_token, paths + + +def _expand_paths(path, name_function, num): + if isinstance(path, str): + if path.count("*") > 1: + raise ValueError("Output path spec must contain exactly one '*'.") + elif "*" not in path: + path 
= os.path.join(path, "*.part") + + if name_function is None: + name_function = build_name_function(num - 1) + + paths = [path.replace("*", name_function(i)) for i in range(num)] + if paths != sorted(paths): + logger.warning( + "In order to preserve order between partitions" + " paths created with ``name_function`` should " + "sort to partition order" + ) + elif isinstance(path, (tuple, list)): + assert len(path) == num + paths = list(path) + else: + raise ValueError( + "Path should be either\n" + "1. A list of paths: ['foo.json', 'bar.json', ...]\n" + "2. A directory: 'foo/\n" + "3. A path with a '*' in it: 'foo.*.json'" + ) + return paths + + +class PickleableTextIOWrapper(io.TextIOWrapper): + """TextIOWrapper cannot be pickled. This solves it. + + Requires that ``buffer`` be pickleable, which all instances of + AbstractBufferedFile are. + """ + + def __init__( + self, + buffer, + encoding=None, + errors=None, + newline=None, + line_buffering=False, + write_through=False, + ): + self.args = buffer, encoding, errors, newline, line_buffering, write_through + super().__init__(*self.args) + + def __reduce__(self): + return PickleableTextIOWrapper, self.args diff --git a/.venv/lib/python3.11/site-packages/fsspec/dircache.py b/.venv/lib/python3.11/site-packages/fsspec/dircache.py new file mode 100644 index 0000000000000000000000000000000000000000..eca19566b135e5a7a4f6e7407d56411ec58bfe44 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/dircache.py @@ -0,0 +1,98 @@ +import time +from collections.abc import MutableMapping +from functools import lru_cache + + +class DirCache(MutableMapping): + """ + Caching of directory listings, in a structure like:: + + {"path0": [ + {"name": "path0/file0", + "size": 123, + "type": "file", + ... + }, + {"name": "path0/file1", + }, + ... + ], + "path1": [...] + } + + Parameters to this class control listing expiry or indeed turn + caching off + """ + + def __init__( + self, + use_listings_cache=True, + listings_expiry_time=None, + max_paths=None, + **kwargs, + ): + """ + + Parameters + ---------- + use_listings_cache: bool + If False, this cache never returns items, but always reports KeyError, + and setting items has no effect + listings_expiry_time: int or float (optional) + Time in seconds that a listing is considered valid. If None, + listings do not expire. + max_paths: int (optional) + The number of most recent listings that are considered valid; 'recent' + refers to when the entry was set. 
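+
+        A minimal usage sketch (paths and listing entries are hypothetical)::
+
+            cache = DirCache(listings_expiry_time=10)
+            cache["/data"] = [{"name": "/data/file0", "size": 3, "type": "file"}]
+            "/data" in cache  # True until the entry expires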
+ """ + self._cache = {} + self._times = {} + if max_paths: + self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(key, None)) + self.use_listings_cache = use_listings_cache + self.listings_expiry_time = listings_expiry_time + self.max_paths = max_paths + + def __getitem__(self, item): + if self.listings_expiry_time is not None: + if self._times.get(item, 0) - time.time() < -self.listings_expiry_time: + del self._cache[item] + if self.max_paths: + self._q(item) + return self._cache[item] # maybe raises KeyError + + def clear(self): + self._cache.clear() + + def __len__(self): + return len(self._cache) + + def __contains__(self, item): + try: + self[item] + return True + except KeyError: + return False + + def __setitem__(self, key, value): + if not self.use_listings_cache: + return + if self.max_paths: + self._q(key) + self._cache[key] = value + if self.listings_expiry_time is not None: + self._times[key] = time.time() + + def __delitem__(self, key): + del self._cache[key] + + def __iter__(self): + entries = list(self._cache) + + return (k for k in entries if k in self) + + def __reduce__(self): + return ( + DirCache, + (self.use_listings_cache, self.listings_expiry_time, self.max_paths), + ) diff --git a/.venv/lib/python3.11/site-packages/fsspec/exceptions.py b/.venv/lib/python3.11/site-packages/fsspec/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..ae8905475f02655f4fc5863931d99ca9da55db78 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/exceptions.py @@ -0,0 +1,18 @@ +""" +fsspec user-defined exception classes +""" + +import asyncio + + +class BlocksizeMismatchError(ValueError): + """ + Raised when a cached file is opened with a different blocksize than it was + written with + """ + + +class FSTimeoutError(asyncio.TimeoutError): + """ + Raised when a fsspec function timed out occurs + """ diff --git a/.venv/lib/python3.11/site-packages/fsspec/fuse.py b/.venv/lib/python3.11/site-packages/fsspec/fuse.py new file mode 100644 index 0000000000000000000000000000000000000000..566d520fce3e94e3bbaee48c3c6acc9f1db315a8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/fuse.py @@ -0,0 +1,324 @@ +import argparse +import logging +import os +import stat +import threading +import time +from errno import EIO, ENOENT + +from fuse import FUSE, FuseOSError, LoggingMixIn, Operations + +from fsspec import __version__ +from fsspec.core import url_to_fs + +logger = logging.getLogger("fsspec.fuse") + + +class FUSEr(Operations): + def __init__(self, fs, path, ready_file=False): + self.fs = fs + self.cache = {} + self.root = path.rstrip("/") + "/" + self.counter = 0 + logger.info("Starting FUSE at %s", path) + self._ready_file = ready_file + + def getattr(self, path, fh=None): + logger.debug("getattr %s", path) + if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]: + return {"type": "file", "st_size": 5} + + path = "".join([self.root, path.lstrip("/")]).rstrip("/") + try: + info = self.fs.info(path) + except FileNotFoundError as exc: + raise FuseOSError(ENOENT) from exc + + data = {"st_uid": info.get("uid", 1000), "st_gid": info.get("gid", 1000)} + perm = info.get("mode", 0o777) + + if info["type"] != "file": + data["st_mode"] = stat.S_IFDIR | perm + data["st_size"] = 0 + data["st_blksize"] = 0 + else: + data["st_mode"] = stat.S_IFREG | perm + data["st_size"] = info["size"] + data["st_blksize"] = 5 * 2**20 + data["st_nlink"] = 1 + data["st_atime"] = info["atime"] if "atime" in info else time.time() + data["st_ctime"] = info["ctime"] 
if "ctime" in info else time.time() + data["st_mtime"] = info["mtime"] if "mtime" in info else time.time() + return data + + def readdir(self, path, fh): + logger.debug("readdir %s", path) + path = "".join([self.root, path.lstrip("/")]) + files = self.fs.ls(path, False) + files = [os.path.basename(f.rstrip("/")) for f in files] + return [".", ".."] + files + + def mkdir(self, path, mode): + path = "".join([self.root, path.lstrip("/")]) + self.fs.mkdir(path) + return 0 + + def rmdir(self, path): + path = "".join([self.root, path.lstrip("/")]) + self.fs.rmdir(path) + return 0 + + def read(self, path, size, offset, fh): + logger.debug("read %s", (path, size, offset)) + if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]: + # status indicator + return b"ready" + + f = self.cache[fh] + f.seek(offset) + out = f.read(size) + return out + + def write(self, path, data, offset, fh): + logger.debug("write %s", (path, offset)) + f = self.cache[fh] + f.seek(offset) + f.write(data) + return len(data) + + def create(self, path, flags, fi=None): + logger.debug("create %s", (path, flags)) + fn = "".join([self.root, path.lstrip("/")]) + self.fs.touch(fn) # OS will want to get attributes immediately + f = self.fs.open(fn, "wb") + self.cache[self.counter] = f + self.counter += 1 + return self.counter - 1 + + def open(self, path, flags): + logger.debug("open %s", (path, flags)) + fn = "".join([self.root, path.lstrip("/")]) + if flags % 2 == 0: + # read + mode = "rb" + else: + # write/create + mode = "wb" + self.cache[self.counter] = self.fs.open(fn, mode) + self.counter += 1 + return self.counter - 1 + + def truncate(self, path, length, fh=None): + fn = "".join([self.root, path.lstrip("/")]) + if length != 0: + raise NotImplementedError + # maybe should be no-op since open with write sets size to zero anyway + self.fs.touch(fn) + + def unlink(self, path): + fn = "".join([self.root, path.lstrip("/")]) + try: + self.fs.rm(fn, False) + except (OSError, FileNotFoundError) as exc: + raise FuseOSError(EIO) from exc + + def release(self, path, fh): + try: + if fh in self.cache: + f = self.cache[fh] + f.close() + self.cache.pop(fh) + except Exception as e: + print(e) + return 0 + + def chmod(self, path, mode): + if hasattr(self.fs, "chmod"): + path = "".join([self.root, path.lstrip("/")]) + return self.fs.chmod(path, mode) + raise NotImplementedError + + +def run( + fs, + path, + mount_point, + foreground=True, + threads=False, + ready_file=False, + ops_class=FUSEr, +): + """Mount stuff in a local directory + + This uses fusepy to make it appear as if a given path on an fsspec + instance is in fact resident within the local file-system. + + This requires that fusepy by installed, and that FUSE be available on + the system (typically requiring a package to be installed with + apt, yum, brew, etc.). + + Parameters + ---------- + fs: file-system instance + From one of the compatible implementations + path: str + Location on that file-system to regard as the root directory to + mount. Note that you typically should include the terminating "/" + character. + mount_point: str + An empty directory on the local file-system where the contents of + the remote path will appear. + foreground: bool + Whether or not calling this function will block. Operation will + typically be more stable if True. + threads: bool + Whether or not to create threads when responding to file operations + within the mounter directory. Operation will typically be more + stable if False. 
+ ready_file: bool + Whether the FUSE process is ready. The ``.fuse_ready`` file will + exist in the ``mount_point`` directory if True. Debugging purpose. + ops_class: FUSEr or Subclass of FUSEr + To override the default behavior of FUSEr. For Example, logging + to file. + + """ + func = lambda: FUSE( + ops_class(fs, path, ready_file=ready_file), + mount_point, + nothreads=not threads, + foreground=foreground, + ) + if not foreground: + th = threading.Thread(target=func) + th.daemon = True + th.start() + return th + else: # pragma: no cover + try: + func() + except KeyboardInterrupt: + pass + + +def main(args): + """Mount filesystem from chained URL to MOUNT_POINT. + + Examples: + + python3 -m fsspec.fuse memory /usr/share /tmp/mem + + python3 -m fsspec.fuse local /tmp/source /tmp/local \\ + -l /tmp/fsspecfuse.log + + You can also mount chained-URLs and use special settings: + + python3 -m fsspec.fuse 'filecache::zip::file://data.zip' \\ + / /tmp/zip \\ + -o 'filecache-cache_storage=/tmp/simplecache' + + You can specify the type of the setting by using `[int]` or `[bool]`, + (`true`, `yes`, `1` represents the Boolean value `True`): + + python3 -m fsspec.fuse 'simplecache::ftp://ftp1.at.proftpd.org' \\ + /historic/packages/RPMS /tmp/ftp \\ + -o 'simplecache-cache_storage=/tmp/simplecache' \\ + -o 'simplecache-check_files=false[bool]' \\ + -o 'ftp-listings_expiry_time=60[int]' \\ + -o 'ftp-username=anonymous' \\ + -o 'ftp-password=xieyanbo' + """ + + class RawDescriptionArgumentParser(argparse.ArgumentParser): + def format_help(self): + usage = super().format_help() + parts = usage.split("\n\n") + parts[1] = self.description.rstrip() + return "\n\n".join(parts) + + parser = RawDescriptionArgumentParser(prog="fsspec.fuse", description=main.__doc__) + parser.add_argument("--version", action="version", version=__version__) + parser.add_argument("url", type=str, help="fs url") + parser.add_argument("source_path", type=str, help="source directory in fs") + parser.add_argument("mount_point", type=str, help="local directory") + parser.add_argument( + "-o", + "--option", + action="append", + help="Any options of protocol included in the chained URL", + ) + parser.add_argument( + "-l", "--log-file", type=str, help="Logging FUSE debug info (Default: '')" + ) + parser.add_argument( + "-f", + "--foreground", + action="store_false", + help="Running in foreground or not (Default: False)", + ) + parser.add_argument( + "-t", + "--threads", + action="store_false", + help="Running with threads support (Default: False)", + ) + parser.add_argument( + "-r", + "--ready-file", + action="store_false", + help="The `.fuse_ready` file will exist after FUSE is ready. 
" + "(Debugging purpose, Default: False)", + ) + args = parser.parse_args(args) + + kwargs = {} + for item in args.option or []: + key, sep, value = item.partition("=") + if not sep: + parser.error(message=f"Wrong option: {item!r}") + val = value.lower() + if val.endswith("[int]"): + value = int(value[: -len("[int]")]) + elif val.endswith("[bool]"): + value = val[: -len("[bool]")] in ["1", "yes", "true"] + + if "-" in key: + fs_name, setting_name = key.split("-", 1) + if fs_name in kwargs: + kwargs[fs_name][setting_name] = value + else: + kwargs[fs_name] = {setting_name: value} + else: + kwargs[key] = value + + if args.log_file: + logging.basicConfig( + level=logging.DEBUG, + filename=args.log_file, + format="%(asctime)s %(message)s", + ) + + class LoggingFUSEr(FUSEr, LoggingMixIn): + pass + + fuser = LoggingFUSEr + else: + fuser = FUSEr + + fs, url_path = url_to_fs(args.url, **kwargs) + logger.debug("Mounting %s to %s", url_path, str(args.mount_point)) + run( + fs, + args.source_path, + args.mount_point, + foreground=args.foreground, + threads=args.threads, + ready_file=args.ready_file, + ops_class=fuser, + ) + + +if __name__ == "__main__": + import sys + + main(sys.argv[1:]) diff --git a/.venv/lib/python3.11/site-packages/fsspec/generic.py b/.venv/lib/python3.11/site-packages/fsspec/generic.py new file mode 100644 index 0000000000000000000000000000000000000000..9bad0f048f737400b4cd919f7699d15fbfbf7c62 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/generic.py @@ -0,0 +1,411 @@ +from __future__ import annotations + +import inspect +import logging +import os +import shutil +import uuid +from typing import Optional + +from .asyn import AsyncFileSystem, _run_coros_in_chunks, sync_wrapper +from .callbacks import DEFAULT_CALLBACK +from .core import filesystem, get_filesystem_class, split_protocol, url_to_fs + +_generic_fs = {} +logger = logging.getLogger("fsspec.generic") + + +def set_generic_fs(protocol, **storage_options): + _generic_fs[protocol] = filesystem(protocol, **storage_options) + + +default_method = "default" + + +def _resolve_fs(url, method=None, protocol=None, storage_options=None): + """Pick instance of backend FS""" + method = method or default_method + protocol = protocol or split_protocol(url)[0] + storage_options = storage_options or {} + if method == "default": + return filesystem(protocol) + if method == "generic": + return _generic_fs[protocol] + if method == "current": + cls = get_filesystem_class(protocol) + return cls.current() + if method == "options": + fs, _ = url_to_fs(url, **storage_options.get(protocol, {})) + return fs + raise ValueError(f"Unknown FS resolution method: {method}") + + +def rsync( + source, + destination, + delete_missing=False, + source_field="size", + dest_field="size", + update_cond="different", + inst_kwargs=None, + fs=None, + **kwargs, +): + """Sync files between two directory trees + + (experimental) + + Parameters + ---------- + source: str + Root of the directory tree to take files from. This must be a directory, but + do not include any terminating "/" character + destination: str + Root path to copy into. The contents of this location should be + identical to the contents of ``source`` when done. This will be made a + directory, and the terminal "/" should not be included. + delete_missing: bool + If there are paths in the destination that don't exist in the + source and this is True, delete them. Otherwise, leave them alone. 
+ source_field: str | callable + If ``update_field`` is "different", this is the key in the info + of source files to consider for difference. Maybe a function of the + info dict. + dest_field: str | callable + If ``update_field`` is "different", this is the key in the info + of destination files to consider for difference. May be a function of + the info dict. + update_cond: "different"|"always"|"never" + If "always", every file is copied, regardless of whether it exists in + the destination. If "never", files that exist in the destination are + not copied again. If "different" (default), only copy if the info + fields given by ``source_field`` and ``dest_field`` (usually "size") + are different. Other comparisons may be added in the future. + inst_kwargs: dict|None + If ``fs`` is None, use this set of keyword arguments to make a + GenericFileSystem instance + fs: GenericFileSystem|None + Instance to use if explicitly given. The instance defines how to + to make downstream file system instances from paths. + + Returns + ------- + dict of the copy operations that were performed, {source: destination} + """ + fs = fs or GenericFileSystem(**(inst_kwargs or {})) + source = fs._strip_protocol(source) + destination = fs._strip_protocol(destination) + allfiles = fs.find(source, withdirs=True, detail=True) + if not fs.isdir(source): + raise ValueError("Can only rsync on a directory") + otherfiles = fs.find(destination, withdirs=True, detail=True) + dirs = [ + a + for a, v in allfiles.items() + if v["type"] == "directory" and a.replace(source, destination) not in otherfiles + ] + logger.debug(f"{len(dirs)} directories to create") + if dirs: + fs.make_many_dirs( + [dirn.replace(source, destination) for dirn in dirs], exist_ok=True + ) + allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"} + logger.debug(f"{len(allfiles)} files to consider for copy") + to_delete = [ + o + for o, v in otherfiles.items() + if o.replace(destination, source) not in allfiles and v["type"] == "file" + ] + for k, v in allfiles.copy().items(): + otherfile = k.replace(source, destination) + if otherfile in otherfiles: + if update_cond == "always": + allfiles[k] = otherfile + elif update_cond == "different": + inf1 = source_field(v) if callable(source_field) else v[source_field] + v2 = otherfiles[otherfile] + inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field] + if inf1 != inf2: + # details mismatch, make copy + allfiles[k] = otherfile + else: + # details match, don't copy + allfiles.pop(k) + else: + # file not in target yet + allfiles[k] = otherfile + logger.debug(f"{len(allfiles)} files to copy") + if allfiles: + source_files, target_files = zip(*allfiles.items()) + fs.cp(source_files, target_files, **kwargs) + logger.debug(f"{len(to_delete)} files to delete") + if delete_missing and to_delete: + fs.rm(to_delete) + return allfiles + + +class GenericFileSystem(AsyncFileSystem): + """Wrapper over all other FS types + + + + This implementation is a single unified interface to be able to run FS operations + over generic URLs, and dispatch to the specific implementations using the URL + protocol prefix. + + Note: instances of this FS are always async, even if you never use it with any async + backend. + """ + + protocol = "generic" # there is no real reason to ever use a protocol with this FS + + def __init__(self, default_method="default", **kwargs): + """ + + Parameters + ---------- + default_method: str (optional) + Defines how to configure backend FS instances. 
Options are: + - "default": instantiate like FSClass(), with no + extra arguments; this is the default instance of that FS, and can be + configured via the config system + - "generic": takes instances from the `_generic_fs` dict in this module, + which you must populate before use. Keys are by protocol + - "current": takes the most recently instantiated version of each FS + """ + self.method = default_method + super().__init__(**kwargs) + + def _parent(self, path): + fs = _resolve_fs(path, self.method) + return fs.unstrip_protocol(fs._parent(path)) + + def _strip_protocol(self, path): + # normalization only + fs = _resolve_fs(path, self.method) + return fs.unstrip_protocol(fs._strip_protocol(path)) + + async def _find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs): + fs = _resolve_fs(path, self.method) + if fs.async_impl: + out = await fs._find( + path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs + ) + else: + out = fs.find( + path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs + ) + result = {} + for k, v in out.items(): + v = v.copy() # don't corrupt target FS dircache + name = fs.unstrip_protocol(k) + v["name"] = name + result[name] = v + if detail: + return result + return list(result) + + async def _info(self, url, **kwargs): + fs = _resolve_fs(url, self.method) + if fs.async_impl: + out = await fs._info(url, **kwargs) + else: + out = fs.info(url, **kwargs) + out = out.copy() # don't edit originals + out["name"] = fs.unstrip_protocol(out["name"]) + return out + + async def _ls( + self, + url, + detail=True, + **kwargs, + ): + fs = _resolve_fs(url, self.method) + if fs.async_impl: + out = await fs._ls(url, detail=True, **kwargs) + else: + out = fs.ls(url, detail=True, **kwargs) + out = [o.copy() for o in out] # don't edit originals + for o in out: + o["name"] = fs.unstrip_protocol(o["name"]) + if detail: + return out + else: + return [o["name"] for o in out] + + async def _cat_file( + self, + url, + **kwargs, + ): + fs = _resolve_fs(url, self.method) + if fs.async_impl: + return await fs._cat_file(url, **kwargs) + else: + return fs.cat_file(url, **kwargs) + + async def _pipe_file( + self, + path, + value, + **kwargs, + ): + fs = _resolve_fs(path, self.method) + if fs.async_impl: + return await fs._pipe_file(path, value, **kwargs) + else: + return fs.pipe_file(path, value, **kwargs) + + async def _rm(self, url, **kwargs): + urls = url + if isinstance(urls, str): + urls = [urls] + fs = _resolve_fs(urls[0], self.method) + if fs.async_impl: + await fs._rm(urls, **kwargs) + else: + fs.rm(url, **kwargs) + + async def _makedirs(self, path, exist_ok=False): + logger.debug("Make dir %s", path) + fs = _resolve_fs(path, self.method) + if fs.async_impl: + await fs._makedirs(path, exist_ok=exist_ok) + else: + fs.makedirs(path, exist_ok=exist_ok) + + def rsync(self, source, destination, **kwargs): + """Sync files between two directory trees + + See `func:rsync` for more details. 
+ """ + rsync(source, destination, fs=self, **kwargs) + + async def _cp_file( + self, + url, + url2, + blocksize=2**20, + callback=DEFAULT_CALLBACK, + **kwargs, + ): + fs = _resolve_fs(url, self.method) + fs2 = _resolve_fs(url2, self.method) + if fs is fs2: + # pure remote + if fs.async_impl: + return await fs._cp_file(url, url2, **kwargs) + else: + return fs.cp_file(url, url2, **kwargs) + kw = {"blocksize": 0, "cache_type": "none"} + try: + f1 = ( + await fs.open_async(url, "rb") + if hasattr(fs, "open_async") + else fs.open(url, "rb", **kw) + ) + callback.set_size(await maybe_await(f1.size)) + f2 = ( + await fs2.open_async(url2, "wb") + if hasattr(fs2, "open_async") + else fs2.open(url2, "wb", **kw) + ) + while f1.size is None or f2.tell() < f1.size: + data = await maybe_await(f1.read(blocksize)) + if f1.size is None and not data: + break + await maybe_await(f2.write(data)) + callback.absolute_update(f2.tell()) + finally: + try: + await maybe_await(f2.close()) + await maybe_await(f1.close()) + except NameError: + # fail while opening f1 or f2 + pass + + async def _make_many_dirs(self, urls, exist_ok=True): + fs = _resolve_fs(urls[0], self.method) + if fs.async_impl: + coros = [fs._makedirs(u, exist_ok=exist_ok) for u in urls] + await _run_coros_in_chunks(coros) + else: + for u in urls: + fs.makedirs(u, exist_ok=exist_ok) + + make_many_dirs = sync_wrapper(_make_many_dirs) + + async def _copy( + self, + path1: list[str], + path2: list[str], + recursive: bool = False, + on_error: str = "ignore", + maxdepth: Optional[int] = None, + batch_size: Optional[int] = None, + tempdir: Optional[str] = None, + **kwargs, + ): + if recursive: + raise NotImplementedError + fs = _resolve_fs(path1[0], self.method) + fs2 = _resolve_fs(path2[0], self.method) + # not expanding paths atm., assume call is from rsync() + if fs is fs2: + # pure remote + if fs.async_impl: + return await fs._copy(path1, path2, **kwargs) + else: + return fs.copy(path1, path2, **kwargs) + await copy_file_op( + fs, path1, fs2, path2, tempdir, batch_size, on_error=on_error + ) + + +async def copy_file_op( + fs1, url1, fs2, url2, tempdir=None, batch_size=20, on_error="ignore" +): + import tempfile + + tempdir = tempdir or tempfile.mkdtemp() + try: + coros = [ + _copy_file_op( + fs1, + u1, + fs2, + u2, + os.path.join(tempdir, uuid.uuid4().hex), + on_error=on_error, + ) + for u1, u2 in zip(url1, url2) + ] + await _run_coros_in_chunks(coros, batch_size=batch_size) + finally: + shutil.rmtree(tempdir) + + +async def _copy_file_op(fs1, url1, fs2, url2, local, on_error="ignore"): + ex = () if on_error == "raise" else Exception + logger.debug("Copy %s -> %s", url1, url2) + try: + if fs1.async_impl: + await fs1._get_file(url1, local) + else: + fs1.get_file(url1, local) + if fs2.async_impl: + await fs2._put_file(local, url2) + else: + fs2.put_file(local, url2) + os.unlink(local) + logger.debug("Copy %s -> %s; done", url1, url2) + except ex as e: + logger.debug("ignoring cp exception for %s: %s", url1, e) + + +async def maybe_await(cor): + if inspect.iscoroutine(cor): + return await cor + else: + return cor diff --git a/.venv/lib/python3.11/site-packages/fsspec/gui.py b/.venv/lib/python3.11/site-packages/fsspec/gui.py new file mode 100644 index 0000000000000000000000000000000000000000..321379eb8ef924302cc2f56bccb3bdb873ff86e3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/gui.py @@ -0,0 +1,416 @@ +import ast +import contextlib +import logging +import os +import re +from typing import ClassVar, Sequence + +import panel as pn + +from 
.core import OpenFile, get_filesystem_class, split_protocol +from .registry import known_implementations + +pn.extension() +logger = logging.getLogger("fsspec.gui") + + +class SigSlot: + """Signal-slot mixin, for Panel event passing + + Include this class in a widget manager's superclasses to be able to + register events and callbacks on Panel widgets managed by that class. + + The method ``_register`` should be called as widgets are added, and external + code should call ``connect`` to associate callbacks. + + By default, all signals emit a DEBUG logging statement. + """ + + # names of signals that this class may emit each of which must be + # set by _register for any new instance + signals: ClassVar[Sequence[str]] = [] + # names of actions that this class may respond to + slots: ClassVar[Sequence[str]] = [] + + # each of which must be a method name + + def __init__(self): + self._ignoring_events = False + self._sigs = {} + self._map = {} + self._setup() + + def _setup(self): + """Create GUI elements and register signals""" + self.panel = pn.pane.PaneBase() + # no signals to set up in the base class + + def _register( + self, widget, name, thing="value", log_level=logging.DEBUG, auto=False + ): + """Watch the given attribute of a widget and assign it a named event + + This is normally called at the time a widget is instantiated, in the + class which owns it. + + Parameters + ---------- + widget : pn.layout.Panel or None + Widget to watch. If None, an anonymous signal not associated with + any widget. + name : str + Name of this event + thing : str + Attribute of the given widget to watch + log_level : int + When the signal is triggered, a logging event of the given level + will be fired in the dfviz logger. + auto : bool + If True, automatically connects with a method in this class of the + same name. + """ + if name not in self.signals: + raise ValueError(f"Attempt to assign an undeclared signal: {name}") + self._sigs[name] = { + "widget": widget, + "callbacks": [], + "thing": thing, + "log": log_level, + } + wn = "-".join( + [ + getattr(widget, "name", str(widget)) if widget is not None else "none", + thing, + ] + ) + self._map[wn] = name + if widget is not None: + widget.param.watch(self._signal, thing, onlychanged=True) + if auto and hasattr(self, name): + self.connect(name, getattr(self, name)) + + def _repr_mimebundle_(self, *args, **kwargs): + """Display in a notebook or a server""" + try: + return self.panel._repr_mimebundle_(*args, **kwargs) + except (ValueError, AttributeError) as exc: + raise NotImplementedError( + "Panel does not seem to be set up properly" + ) from exc + + def connect(self, signal, slot): + """Associate call back with given event + + The callback must be a function which takes the "new" value of the + watched attribute as the only parameter. If the callback return False, + this cancels any further processing of the given event. + + Alternatively, the callback can be a string, in which case it means + emitting the correspondingly-named event (i.e., connect to self) + """ + self._sigs[signal]["callbacks"].append(slot) + + def _signal(self, event): + """This is called by a an action on a widget + + Within an self.ignore_events context, nothing happens. + + Tests can execute this method by directly changing the values of + widget components. 
+ """ + if not self._ignoring_events: + wn = "-".join([event.obj.name, event.name]) + if wn in self._map and self._map[wn] in self._sigs: + self._emit(self._map[wn], event.new) + + @contextlib.contextmanager + def ignore_events(self): + """Temporarily turn off events processing in this instance + + (does not propagate to children) + """ + self._ignoring_events = True + try: + yield + finally: + self._ignoring_events = False + + def _emit(self, sig, value=None): + """An event happened, call its callbacks + + This method can be used in tests to simulate message passing without + directly changing visual elements. + + Calling of callbacks will halt whenever one returns False. + """ + logger.log(self._sigs[sig]["log"], f"{sig}: {value}") + for callback in self._sigs[sig]["callbacks"]: + if isinstance(callback, str): + self._emit(callback) + else: + try: + # running callbacks should not break the interface + ret = callback(value) + if ret is False: + break + except Exception as e: + logger.exception( + "Exception (%s) while executing callback for signal: %s", + e, + sig, + ) + + def show(self, threads=False): + """Open a new browser tab and display this instance's interface""" + self.panel.show(threads=threads, verbose=False) + return self + + +class SingleSelect(SigSlot): + """A multiselect which only allows you to select one item for an event""" + + signals = ["_selected", "selected"] # the first is internal + slots = ["set_options", "set_selection", "add", "clear", "select"] + + def __init__(self, **kwargs): + self.kwargs = kwargs + super().__init__() + + def _setup(self): + self.panel = pn.widgets.MultiSelect(**self.kwargs) + self._register(self.panel, "_selected", "value") + self._register(None, "selected") + self.connect("_selected", self.select_one) + + def _signal(self, *args, **kwargs): + super()._signal(*args, **kwargs) + + def select_one(self, *_): + with self.ignore_events(): + val = [self.panel.value[-1]] if self.panel.value else [] + self.panel.value = val + self._emit("selected", self.panel.value) + + def set_options(self, options): + self.panel.options = options + + def clear(self): + self.panel.options = [] + + @property + def value(self): + return self.panel.value + + def set_selection(self, selection): + self.panel.value = [selection] + + +class FileSelector(SigSlot): + """Panel-based graphical file selector widget + + Instances of this widget are interactive and can be displayed in jupyter by having + them as the output of a cell, or in a separate browser tab using ``.show()``. + """ + + signals = [ + "protocol_changed", + "selection_changed", + "directory_entered", + "home_clicked", + "up_clicked", + "go_clicked", + "filters_changed", + ] + slots = ["set_filters", "go_home"] + + def __init__(self, url=None, filters=None, ignore=None, kwargs=None): + """ + + Parameters + ---------- + url : str (optional) + Initial value of the URL to populate the dialog; should include protocol + filters : list(str) (optional) + File endings to include in the listings. If not included, all files are + allowed. Does not affect directories. + If given, the endings will appear as checkboxes in the interface + ignore : list(str) (optional) + Regex(s) of file basename patterns to ignore, e.g., "\\." 
for typical + hidden files on posix + kwargs : dict (optional) + To pass to file system instance + """ + if url: + self.init_protocol, url = split_protocol(url) + else: + self.init_protocol, url = "file", os.getcwd() + self.init_url = url + self.init_kwargs = (kwargs if isinstance(kwargs, str) else str(kwargs)) or "{}" + self.filters = filters + self.ignore = [re.compile(i) for i in ignore or []] + self._fs = None + super().__init__() + + def _setup(self): + self.url = pn.widgets.TextInput( + name="url", + value=self.init_url, + align="end", + sizing_mode="stretch_width", + width_policy="max", + ) + self.protocol = pn.widgets.Select( + options=sorted(known_implementations), + value=self.init_protocol, + name="protocol", + align="center", + ) + self.kwargs = pn.widgets.TextInput( + name="kwargs", value=self.init_kwargs, align="center" + ) + self.go = pn.widgets.Button(name="⇨", align="end", width=45) + self.main = SingleSelect(size=10) + self.home = pn.widgets.Button(name="🏠", width=40, height=30, align="end") + self.up = pn.widgets.Button(name="‹", width=30, height=30, align="end") + + self._register(self.protocol, "protocol_changed", auto=True) + self._register(self.go, "go_clicked", "clicks", auto=True) + self._register(self.up, "up_clicked", "clicks", auto=True) + self._register(self.home, "home_clicked", "clicks", auto=True) + self._register(None, "selection_changed") + self.main.connect("selected", self.selection_changed) + self._register(None, "directory_entered") + self.prev_protocol = self.protocol.value + self.prev_kwargs = self.storage_options + + self.filter_sel = pn.widgets.CheckBoxGroup( + value=[], options=[], inline=False, align="end", width_policy="min" + ) + self._register(self.filter_sel, "filters_changed", auto=True) + + self.panel = pn.Column( + pn.Row(self.protocol, self.kwargs), + pn.Row(self.home, self.up, self.url, self.go, self.filter_sel), + self.main.panel, + ) + self.set_filters(self.filters) + self.go_clicked() + + def set_filters(self, filters=None): + self.filters = filters + if filters: + self.filter_sel.options = filters + self.filter_sel.value = filters + else: + self.filter_sel.options = [] + self.filter_sel.value = [] + + @property + def storage_options(self): + """Value of the kwargs box as a dictionary""" + return ast.literal_eval(self.kwargs.value) or {} + + @property + def fs(self): + """Current filesystem instance""" + if self._fs is None: + cls = get_filesystem_class(self.protocol.value) + self._fs = cls(**self.storage_options) + return self._fs + + @property + def urlpath(self): + """URL of currently selected item""" + return ( + (f"{self.protocol.value}://{self.main.value[0]}") + if self.main.value + else None + ) + + def open_file(self, mode="rb", compression=None, encoding=None): + """Create OpenFile instance for the currently selected item + + For example, in a notebook you might do something like + + .. code-block:: + + [ ]: sel = FileSelector(); sel + + # user selects their file + + [ ]: with sel.open_file('rb') as f: + ... out = f.read() + + Parameters + ---------- + mode: str (optional) + Open mode for the file. + compression: str (optional) + The interact with the file as compressed. Set to 'infer' to guess + compression from the file ending + encoding: str (optional) + If using text mode, use this encoding; defaults to UTF8. 
+ """ + if self.urlpath is None: + raise ValueError("No file selected") + return OpenFile(self.fs, self.urlpath, mode, compression, encoding) + + def filters_changed(self, values): + self.filters = values + self.go_clicked() + + def selection_changed(self, *_): + if self.urlpath is None: + return + if self.fs.isdir(self.urlpath): + self.url.value = self.fs._strip_protocol(self.urlpath) + self.go_clicked() + + def go_clicked(self, *_): + if ( + self.prev_protocol != self.protocol.value + or self.prev_kwargs != self.storage_options + ): + self._fs = None # causes fs to be recreated + self.prev_protocol = self.protocol.value + self.prev_kwargs = self.storage_options + listing = sorted( + self.fs.ls(self.url.value, detail=True), key=lambda x: x["name"] + ) + listing = [ + l + for l in listing + if not any(i.match(l["name"].rsplit("/", 1)[-1]) for i in self.ignore) + ] + folders = { + "📁 " + o["name"].rsplit("/", 1)[-1]: o["name"] + for o in listing + if o["type"] == "directory" + } + files = { + "📄 " + o["name"].rsplit("/", 1)[-1]: o["name"] + for o in listing + if o["type"] == "file" + } + if self.filters: + files = { + k: v + for k, v in files.items() + if any(v.endswith(ext) for ext in self.filters) + } + self.main.set_options(dict(**folders, **files)) + + def protocol_changed(self, *_): + self._fs = None + self.main.options = [] + self.url.value = "" + + def home_clicked(self, *_): + self.protocol.value = self.init_protocol + self.kwargs.value = self.init_kwargs + self.url.value = self.init_url + self.go_clicked() + + def up_clicked(self, *_): + self.url.value = self.fs._parent(self.url.value) + self.go_clicked() diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/arrow.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/arrow.py new file mode 100644 index 0000000000000000000000000000000000000000..530df901a7225bab4afb9f08d06540bc18e91aef --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/arrow.py @@ -0,0 +1,304 @@ +import errno +import io +import os +import secrets +import shutil +from contextlib import suppress +from functools import cached_property, wraps +from urllib.parse import parse_qs + +from fsspec.spec import AbstractFileSystem +from fsspec.utils import ( + get_package_version_without_import, + infer_storage_options, + mirror_from, + tokenize, +) + + +def wrap_exceptions(func): + @wraps(func) + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except OSError as exception: + if not exception.args: + raise + + message, *args = exception.args + if isinstance(message, str) and "does not exist" in message: + raise FileNotFoundError(errno.ENOENT, message) from exception + else: + raise + + return wrapper + + +PYARROW_VERSION = None + + +class ArrowFSWrapper(AbstractFileSystem): + """FSSpec-compatible wrapper of pyarrow.fs.FileSystem. 
+ + Parameters + ---------- + fs : pyarrow.fs.FileSystem + + """ + + root_marker = "/" + + def __init__(self, fs, **kwargs): + global PYARROW_VERSION + PYARROW_VERSION = get_package_version_without_import("pyarrow") + self.fs = fs + super().__init__(**kwargs) + + @property + def protocol(self): + return self.fs.type_name + + @cached_property + def fsid(self): + return "hdfs_" + tokenize(self.fs.host, self.fs.port) + + @classmethod + def _strip_protocol(cls, path): + ops = infer_storage_options(path) + path = ops["path"] + if path.startswith("//"): + # special case for "hdfs://path" (without the triple slash) + path = path[1:] + return path + + def ls(self, path, detail=False, **kwargs): + path = self._strip_protocol(path) + from pyarrow.fs import FileSelector + + entries = [ + self._make_entry(entry) + for entry in self.fs.get_file_info(FileSelector(path)) + ] + if detail: + return entries + else: + return [entry["name"] for entry in entries] + + def info(self, path, **kwargs): + path = self._strip_protocol(path) + [info] = self.fs.get_file_info([path]) + return self._make_entry(info) + + def exists(self, path): + path = self._strip_protocol(path) + try: + self.info(path) + except FileNotFoundError: + return False + else: + return True + + def _make_entry(self, info): + from pyarrow.fs import FileType + + if info.type is FileType.Directory: + kind = "directory" + elif info.type is FileType.File: + kind = "file" + elif info.type is FileType.NotFound: + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path) + else: + kind = "other" + + return { + "name": info.path, + "size": info.size, + "type": kind, + "mtime": info.mtime, + } + + @wrap_exceptions + def cp_file(self, path1, path2, **kwargs): + path1 = self._strip_protocol(path1).rstrip("/") + path2 = self._strip_protocol(path2).rstrip("/") + + with self._open(path1, "rb") as lstream: + tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}" + try: + with self.open(tmp_fname, "wb") as rstream: + shutil.copyfileobj(lstream, rstream) + self.fs.move(tmp_fname, path2) + except BaseException: + with suppress(FileNotFoundError): + self.fs.delete_file(tmp_fname) + raise + + @wrap_exceptions + def mv(self, path1, path2, **kwargs): + path1 = self._strip_protocol(path1).rstrip("/") + path2 = self._strip_protocol(path2).rstrip("/") + self.fs.move(path1, path2) + + @wrap_exceptions + def rm_file(self, path): + path = self._strip_protocol(path) + self.fs.delete_file(path) + + @wrap_exceptions + def rm(self, path, recursive=False, maxdepth=None): + path = self._strip_protocol(path).rstrip("/") + if self.isdir(path): + if recursive: + self.fs.delete_dir(path) + else: + raise ValueError("Can't delete directories without recursive=False") + else: + self.fs.delete_file(path) + + @wrap_exceptions + def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs): + if mode == "rb": + if seekable: + method = self.fs.open_input_file + else: + method = self.fs.open_input_stream + elif mode == "wb": + method = self.fs.open_output_stream + elif mode == "ab": + method = self.fs.open_append_stream + else: + raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}") + + _kwargs = {} + if mode != "rb" or not seekable: + if int(PYARROW_VERSION.split(".")[0]) >= 4: + # disable compression auto-detection + _kwargs["compression"] = None + stream = method(path, **_kwargs) + + return ArrowFile(self, stream, path, mode, block_size, **kwargs) + + @wrap_exceptions + def mkdir(self, path, create_parents=True, **kwargs): + path = 
self._strip_protocol(path) + if create_parents: + self.makedirs(path, exist_ok=True) + else: + self.fs.create_dir(path, recursive=False) + + @wrap_exceptions + def makedirs(self, path, exist_ok=False): + path = self._strip_protocol(path) + self.fs.create_dir(path, recursive=True) + + @wrap_exceptions + def rmdir(self, path): + path = self._strip_protocol(path) + self.fs.delete_dir(path) + + @wrap_exceptions + def modified(self, path): + path = self._strip_protocol(path) + return self.fs.get_file_info(path).mtime + + def cat_file(self, path, start=None, end=None, **kwargs): + kwargs["seekable"] = start not in [None, 0] + return super().cat_file(path, start=None, end=None, **kwargs) + + def get_file(self, rpath, lpath, **kwargs): + kwargs["seekable"] = False + super().get_file(rpath, lpath, **kwargs) + + +@mirror_from( + "stream", + [ + "read", + "seek", + "tell", + "write", + "readable", + "writable", + "close", + "size", + "seekable", + ], +) +class ArrowFile(io.IOBase): + def __init__(self, fs, stream, path, mode, block_size=None, **kwargs): + self.path = path + self.mode = mode + + self.fs = fs + self.stream = stream + + self.blocksize = self.block_size = block_size + self.kwargs = kwargs + + def __enter__(self): + return self + + def __exit__(self, *args): + return self.close() + + +class HadoopFileSystem(ArrowFSWrapper): + """A wrapper on top of the pyarrow.fs.HadoopFileSystem + to connect it's interface with fsspec""" + + protocol = "hdfs" + + def __init__( + self, + host="default", + port=0, + user=None, + kerb_ticket=None, + replication=3, + extra_conf=None, + **kwargs, + ): + """ + + Parameters + ---------- + host: str + Hostname, IP or "default" to try to read from Hadoop config + port: int + Port to connect on, or default from Hadoop config if 0 + user: str or None + If given, connect as this username + kerb_ticket: str or None + If given, use this ticket for authentication + replication: int + set replication factor of file for write operations. default value is 3. 
+ extra_conf: None or dict + Passed on to HadoopFileSystem + """ + from pyarrow.fs import HadoopFileSystem + + fs = HadoopFileSystem( + host=host, + port=port, + user=user, + kerb_ticket=kerb_ticket, + replication=replication, + extra_conf=extra_conf, + ) + super().__init__(fs=fs, **kwargs) + + @staticmethod + def _get_kwargs_from_urls(path): + ops = infer_storage_options(path) + out = {} + if ops.get("host", None): + out["host"] = ops["host"] + if ops.get("username", None): + out["user"] = ops["username"] + if ops.get("port", None): + out["port"] = ops["port"] + if ops.get("url_query", None): + queries = parse_qs(ops["url_query"]) + if queries.get("replication", None): + out["replication"] = int(queries["replication"][0]) + return out diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/dask.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/dask.py new file mode 100644 index 0000000000000000000000000000000000000000..3e1276463db6866665e6a0fe114efc247971b57e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/dask.py @@ -0,0 +1,152 @@ +import dask +from distributed.client import Client, _get_global_client +from distributed.worker import Worker + +from fsspec import filesystem +from fsspec.spec import AbstractBufferedFile, AbstractFileSystem +from fsspec.utils import infer_storage_options + + +def _get_client(client): + if client is None: + return _get_global_client() + elif isinstance(client, Client): + return client + else: + # e.g., connection string + return Client(client) + + +def _in_worker(): + return bool(Worker._instances) + + +class DaskWorkerFileSystem(AbstractFileSystem): + """View files accessible to a worker as any other remote file-system + + When instances are run on the worker, uses the real filesystem. When + run on the client, they call the worker to provide information or data. + + **Warning** this implementation is experimental, and read-only for now. 
+ """ + + def __init__( + self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs + ): + super().__init__(**kwargs) + if not (fs is None) ^ (target_protocol is None): + raise ValueError( + "Please provide one of filesystem instance (fs) or" + " target_protocol, not both" + ) + self.target_protocol = target_protocol + self.target_options = target_options + self.worker = None + self.client = client + self.fs = fs + self._determine_worker() + + @staticmethod + def _get_kwargs_from_urls(path): + so = infer_storage_options(path) + if "host" in so and "port" in so: + return {"client": f"{so['host']}:{so['port']}"} + else: + return {} + + def _determine_worker(self): + if _in_worker(): + self.worker = True + if self.fs is None: + self.fs = filesystem( + self.target_protocol, **(self.target_options or {}) + ) + else: + self.worker = False + self.client = _get_client(self.client) + self.rfs = dask.delayed(self) + + def mkdir(self, *args, **kwargs): + if self.worker: + self.fs.mkdir(*args, **kwargs) + else: + self.rfs.mkdir(*args, **kwargs).compute() + + def rm(self, *args, **kwargs): + if self.worker: + self.fs.rm(*args, **kwargs) + else: + self.rfs.rm(*args, **kwargs).compute() + + def copy(self, *args, **kwargs): + if self.worker: + self.fs.copy(*args, **kwargs) + else: + self.rfs.copy(*args, **kwargs).compute() + + def mv(self, *args, **kwargs): + if self.worker: + self.fs.mv(*args, **kwargs) + else: + self.rfs.mv(*args, **kwargs).compute() + + def ls(self, *args, **kwargs): + if self.worker: + return self.fs.ls(*args, **kwargs) + else: + return self.rfs.ls(*args, **kwargs).compute() + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + if self.worker: + return self.fs._open( + path, + mode=mode, + block_size=block_size, + autocommit=autocommit, + cache_options=cache_options, + **kwargs, + ) + else: + return DaskFile( + fs=self, + path=path, + mode=mode, + block_size=block_size, + autocommit=autocommit, + cache_options=cache_options, + **kwargs, + ) + + def fetch_range(self, path, mode, start, end): + if self.worker: + with self._open(path, mode) as f: + f.seek(start) + return f.read(end - start) + else: + return self.rfs.fetch_range(path, mode, start, end).compute() + + +class DaskFile(AbstractBufferedFile): + def __init__(self, mode="rb", **kwargs): + if mode != "rb": + raise ValueError('Remote dask files can only be opened in "rb" mode') + super().__init__(**kwargs) + + def _upload_chunk(self, final=False): + pass + + def _initiate_upload(self): + """Create remote file/upload""" + pass + + def _fetch_range(self, start, end): + """Get the specified set of bytes from remote""" + return self.fs.fetch_range(self.path, self.mode, start, end) diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/dbfs.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/dbfs.py new file mode 100644 index 0000000000000000000000000000000000000000..fa71e1a22c18ef77b4bd7c0fd651d92966db14af --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/dbfs.py @@ -0,0 +1,467 @@ +import base64 +import urllib + +import requests +import requests.exceptions +from requests.adapters import HTTPAdapter, Retry + +from fsspec import AbstractFileSystem +from fsspec.spec import AbstractBufferedFile + + +class DatabricksException(Exception): + """ + Helper class for exceptions raised in this module. 
+ """ + + def __init__(self, error_code, message): + """Create a new DatabricksException""" + super().__init__(message) + + self.error_code = error_code + self.message = message + + +class DatabricksFileSystem(AbstractFileSystem): + """ + Get access to the Databricks filesystem implementation over HTTP. + Can be used inside and outside of a databricks cluster. + """ + + def __init__(self, instance, token, **kwargs): + """ + Create a new DatabricksFileSystem. + + Parameters + ---------- + instance: str + The instance URL of the databricks cluster. + For example for an Azure databricks cluster, this + has the form adb-..azuredatabricks.net. + token: str + Your personal token. Find out more + here: https://docs.databricks.com/dev-tools/api/latest/authentication.html + """ + self.instance = instance + self.token = token + self.session = requests.Session() + self.retries = Retry( + total=10, + backoff_factor=0.05, + status_forcelist=[408, 429, 500, 502, 503, 504], + ) + + self.session.mount("https://", HTTPAdapter(max_retries=self.retries)) + self.session.headers.update({"Authorization": f"Bearer {self.token}"}) + + super().__init__(**kwargs) + + def ls(self, path, detail=True, **kwargs): + """ + List the contents of the given path. + + Parameters + ---------- + path: str + Absolute path + detail: bool + Return not only the list of filenames, + but also additional information on file sizes + and types. + """ + out = self._ls_from_cache(path) + if not out: + try: + r = self._send_to_api( + method="get", endpoint="list", json={"path": path} + ) + except DatabricksException as e: + if e.error_code == "RESOURCE_DOES_NOT_EXIST": + raise FileNotFoundError(e.message) from e + + raise + files = r["files"] + out = [ + { + "name": o["path"], + "type": "directory" if o["is_dir"] else "file", + "size": o["file_size"], + } + for o in files + ] + self.dircache[path] = out + + if detail: + return out + return [o["name"] for o in out] + + def makedirs(self, path, exist_ok=True): + """ + Create a given absolute path and all of its parents. + + Parameters + ---------- + path: str + Absolute path to create + exist_ok: bool + If false, checks if the folder + exists before creating it (and raises an + Exception if this is the case) + """ + if not exist_ok: + try: + # If the following succeeds, the path is already present + self._send_to_api( + method="get", endpoint="get-status", json={"path": path} + ) + raise FileExistsError(f"Path {path} already exists") + except DatabricksException as e: + if e.error_code == "RESOURCE_DOES_NOT_EXIST": + pass + + try: + self._send_to_api(method="post", endpoint="mkdirs", json={"path": path}) + except DatabricksException as e: + if e.error_code == "RESOURCE_ALREADY_EXISTS": + raise FileExistsError(e.message) from e + + raise + self.invalidate_cache(self._parent(path)) + + def mkdir(self, path, create_parents=True, **kwargs): + """ + Create a given absolute path and all of its parents. + + Parameters + ---------- + path: str + Absolute path to create + create_parents: bool + Whether to create all parents or not. + "False" is not implemented so far. + """ + if not create_parents: + raise NotImplementedError + + self.mkdirs(path, **kwargs) + + def rm(self, path, recursive=False, **kwargs): + """ + Remove the file or folder at the given absolute path. + + Parameters + ---------- + path: str + Absolute path what to remove + recursive: bool + Recursively delete all files in a folder. 
+ """ + try: + self._send_to_api( + method="post", + endpoint="delete", + json={"path": path, "recursive": recursive}, + ) + except DatabricksException as e: + # This is not really an exception, it just means + # not everything was deleted so far + if e.error_code == "PARTIAL_DELETE": + self.rm(path=path, recursive=recursive) + elif e.error_code == "IO_ERROR": + # Using the same exception as the os module would use here + raise OSError(e.message) from e + + raise + self.invalidate_cache(self._parent(path)) + + def mv( + self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs + ): + """ + Move a source to a destination path. + + A note from the original [databricks API manual] + (https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move). + + When moving a large number of files the API call will time out after + approximately 60s, potentially resulting in partially moved data. + Therefore, for operations that move more than 10k files, we strongly + discourage using the DBFS REST API. + + Parameters + ---------- + source_path: str + From where to move (absolute path) + destination_path: str + To where to move (absolute path) + recursive: bool + Not implemented to far. + maxdepth: + Not implemented to far. + """ + if recursive: + raise NotImplementedError + if maxdepth: + raise NotImplementedError + + try: + self._send_to_api( + method="post", + endpoint="move", + json={"source_path": source_path, "destination_path": destination_path}, + ) + except DatabricksException as e: + if e.error_code == "RESOURCE_DOES_NOT_EXIST": + raise FileNotFoundError(e.message) from e + elif e.error_code == "RESOURCE_ALREADY_EXISTS": + raise FileExistsError(e.message) from e + + raise + self.invalidate_cache(self._parent(source_path)) + self.invalidate_cache(self._parent(destination_path)) + + def _open(self, path, mode="rb", block_size="default", **kwargs): + """ + Overwrite the base class method to make sure to create a DBFile. + All arguments are copied from the base method. + + Only the default blocksize is allowed. + """ + return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs) + + def _send_to_api(self, method, endpoint, json): + """ + Send the given json to the DBFS API + using a get or post request (specified by the argument `method`). + + Parameters + ---------- + method: str + Which http method to use for communication; "get" or "post". + endpoint: str + Where to send the request to (last part of the API URL) + json: dict + Dictionary of information to send + """ + if method == "post": + session_call = self.session.post + elif method == "get": + session_call = self.session.get + else: + raise ValueError(f"Do not understand method {method}") + + url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint) + + r = session_call(url, json=json) + + # The DBFS API will return a json, also in case of an exception. + # We want to preserve this information as good as possible. + try: + r.raise_for_status() + except requests.HTTPError as e: + # try to extract json error message + # if that fails, fall back to the original exception + try: + exception_json = e.response.json() + except Exception: + raise e from None + + raise DatabricksException(**exception_json) from e + + return r.json() + + def _create_handle(self, path, overwrite=True): + """ + Internal function to create a handle, which can be used to + write blocks of a file to DBFS. + A handle has a unique identifier which needs to be passed + whenever written during this transaction. 
+ The handle is active for 10 minutes - after that a new + write transaction needs to be created. + Make sure to close the handle after you are finished. + + Parameters + ---------- + path: str + Absolute path for this file. + overwrite: bool + If a file already exist at this location, either overwrite + it or raise an exception. + """ + try: + r = self._send_to_api( + method="post", + endpoint="create", + json={"path": path, "overwrite": overwrite}, + ) + return r["handle"] + except DatabricksException as e: + if e.error_code == "RESOURCE_ALREADY_EXISTS": + raise FileExistsError(e.message) from e + + raise + + def _close_handle(self, handle): + """ + Close a handle, which was opened by :func:`_create_handle`. + + Parameters + ---------- + handle: str + Which handle to close. + """ + try: + self._send_to_api(method="post", endpoint="close", json={"handle": handle}) + except DatabricksException as e: + if e.error_code == "RESOURCE_DOES_NOT_EXIST": + raise FileNotFoundError(e.message) from e + + raise + + def _add_data(self, handle, data): + """ + Upload data to an already opened file handle + (opened by :func:`_create_handle`). + The maximal allowed data size is 1MB after + conversion to base64. + Remember to close the handle when you are finished. + + Parameters + ---------- + handle: str + Which handle to upload data to. + data: bytes + Block of data to add to the handle. + """ + data = base64.b64encode(data).decode() + try: + self._send_to_api( + method="post", + endpoint="add-block", + json={"handle": handle, "data": data}, + ) + except DatabricksException as e: + if e.error_code == "RESOURCE_DOES_NOT_EXIST": + raise FileNotFoundError(e.message) from e + elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED": + raise ValueError(e.message) from e + + raise + + def _get_data(self, path, start, end): + """ + Download data in bytes from a given absolute path in a block + from [start, start+length]. + The maximum number of allowed bytes to read is 1MB. + + Parameters + ---------- + path: str + Absolute path to download data from + start: int + Start position of the block + end: int + End position of the block + """ + try: + r = self._send_to_api( + method="get", + endpoint="read", + json={"path": path, "offset": start, "length": end - start}, + ) + return base64.b64decode(r["data"]) + except DatabricksException as e: + if e.error_code == "RESOURCE_DOES_NOT_EXIST": + raise FileNotFoundError(e.message) from e + elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]: + raise ValueError(e.message) from e + + raise + + def invalidate_cache(self, path=None): + if path is None: + self.dircache.clear() + else: + self.dircache.pop(path, None) + super().invalidate_cache(path) + + +class DatabricksFile(AbstractBufferedFile): + """ + Helper class for files referenced in the DatabricksFileSystem. + """ + + DEFAULT_BLOCK_SIZE = 1 * 2**20 # only allowed block size + + def __init__( + self, + fs, + path, + mode="rb", + block_size="default", + autocommit=True, + cache_type="readahead", + cache_options=None, + **kwargs, + ): + """ + Create a new instance of the DatabricksFile. + + The blocksize needs to be the default one. 
+ """ + if block_size is None or block_size == "default": + block_size = self.DEFAULT_BLOCK_SIZE + + assert block_size == self.DEFAULT_BLOCK_SIZE, ( + f"Only the default block size is allowed, not {block_size}" + ) + + super().__init__( + fs, + path, + mode=mode, + block_size=block_size, + autocommit=autocommit, + cache_type=cache_type, + cache_options=cache_options or {}, + **kwargs, + ) + + def _initiate_upload(self): + """Internal function to start a file upload""" + self.handle = self.fs._create_handle(self.path) + + def _upload_chunk(self, final=False): + """Internal function to add a chunk of data to a started upload""" + self.buffer.seek(0) + data = self.buffer.getvalue() + + data_chunks = [ + data[start:end] for start, end in self._to_sized_blocks(len(data)) + ] + + for data_chunk in data_chunks: + self.fs._add_data(handle=self.handle, data=data_chunk) + + if final: + self.fs._close_handle(handle=self.handle) + return True + + def _fetch_range(self, start, end): + """Internal function to download a block of data""" + return_buffer = b"" + length = end - start + for chunk_start, chunk_end in self._to_sized_blocks(length, start): + return_buffer += self.fs._get_data( + path=self.path, start=chunk_start, end=chunk_end + ) + + return return_buffer + + def _to_sized_blocks(self, length, start=0): + """Helper function to split a range from 0 to total_length into bloksizes""" + end = start + length + for data_chunk in range(start, end, self.blocksize): + data_start = data_chunk + data_end = min(end, data_chunk + self.blocksize) + yield data_start, data_end diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/dirfs.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/dirfs.py new file mode 100644 index 0000000000000000000000000000000000000000..0bee864698ca14459349c766c2ef2fb0fa5454e9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/dirfs.py @@ -0,0 +1,384 @@ +from .. import filesystem +from ..asyn import AsyncFileSystem + + +class DirFileSystem(AsyncFileSystem): + """Directory prefix filesystem + + The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with + is relative to the `path`. After performing the necessary paths operation it + delegates everything to the wrapped filesystem. + """ + + protocol = "dir" + + def __init__( + self, + path=None, + fs=None, + fo=None, + target_protocol=None, + target_options=None, + **storage_options, + ): + """ + Parameters + ---------- + path: str + Path to the directory. + fs: AbstractFileSystem + An instantiated filesystem to wrap. 
+ target_protocol, target_options: + if fs is none, construct it from these + fo: str + Alternate for path; do not provide both + """ + super().__init__(**storage_options) + if fs is None: + fs = filesystem(protocol=target_protocol, **(target_options or {})) + if (path is not None) ^ (fo is not None) is False: + raise ValueError("Provide path or fo, not both") + path = path or fo + + if self.asynchronous and not fs.async_impl: + raise ValueError("can't use asynchronous with non-async fs") + + if fs.async_impl and self.asynchronous != fs.asynchronous: + raise ValueError("both dirfs and fs should be in the same sync/async mode") + + self.path = fs._strip_protocol(path) + self.fs = fs + + def _join(self, path): + if isinstance(path, str): + if not self.path: + return path + if not path: + return self.path + return self.fs.sep.join((self.path, self._strip_protocol(path))) + if isinstance(path, dict): + return {self._join(_path): value for _path, value in path.items()} + return [self._join(_path) for _path in path] + + def _relpath(self, path): + if isinstance(path, str): + if not self.path: + return path + # We need to account for S3FileSystem returning paths that do not + # start with a '/' + if path == self.path or ( + self.path.startswith(self.fs.sep) and path == self.path[1:] + ): + return "" + prefix = self.path + self.fs.sep + if self.path.startswith(self.fs.sep) and not path.startswith(self.fs.sep): + prefix = prefix[1:] + assert path.startswith(prefix) + return path[len(prefix) :] + return [self._relpath(_path) for _path in path] + + # Wrappers below + + @property + def sep(self): + return self.fs.sep + + async def set_session(self, *args, **kwargs): + return await self.fs.set_session(*args, **kwargs) + + async def _rm_file(self, path, **kwargs): + return await self.fs._rm_file(self._join(path), **kwargs) + + def rm_file(self, path, **kwargs): + return self.fs.rm_file(self._join(path), **kwargs) + + async def _rm(self, path, *args, **kwargs): + return await self.fs._rm(self._join(path), *args, **kwargs) + + def rm(self, path, *args, **kwargs): + return self.fs.rm(self._join(path), *args, **kwargs) + + async def _cp_file(self, path1, path2, **kwargs): + return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs) + + def cp_file(self, path1, path2, **kwargs): + return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs) + + async def _copy( + self, + path1, + path2, + *args, + **kwargs, + ): + return await self.fs._copy( + self._join(path1), + self._join(path2), + *args, + **kwargs, + ) + + def copy(self, path1, path2, *args, **kwargs): + return self.fs.copy( + self._join(path1), + self._join(path2), + *args, + **kwargs, + ) + + async def _pipe(self, path, *args, **kwargs): + return await self.fs._pipe(self._join(path), *args, **kwargs) + + def pipe(self, path, *args, **kwargs): + return self.fs.pipe(self._join(path), *args, **kwargs) + + async def _pipe_file(self, path, *args, **kwargs): + return await self.fs._pipe_file(self._join(path), *args, **kwargs) + + def pipe_file(self, path, *args, **kwargs): + return self.fs.pipe_file(self._join(path), *args, **kwargs) + + async def _cat_file(self, path, *args, **kwargs): + return await self.fs._cat_file(self._join(path), *args, **kwargs) + + def cat_file(self, path, *args, **kwargs): + return self.fs.cat_file(self._join(path), *args, **kwargs) + + async def _cat(self, path, *args, **kwargs): + ret = await self.fs._cat( + self._join(path), + *args, + **kwargs, + ) + + if isinstance(ret, dict): + return 
{self._relpath(key): value for key, value in ret.items()} + + return ret + + def cat(self, path, *args, **kwargs): + ret = self.fs.cat( + self._join(path), + *args, + **kwargs, + ) + + if isinstance(ret, dict): + return {self._relpath(key): value for key, value in ret.items()} + + return ret + + async def _put_file(self, lpath, rpath, **kwargs): + return await self.fs._put_file(lpath, self._join(rpath), **kwargs) + + def put_file(self, lpath, rpath, **kwargs): + return self.fs.put_file(lpath, self._join(rpath), **kwargs) + + async def _put( + self, + lpath, + rpath, + *args, + **kwargs, + ): + return await self.fs._put( + lpath, + self._join(rpath), + *args, + **kwargs, + ) + + def put(self, lpath, rpath, *args, **kwargs): + return self.fs.put( + lpath, + self._join(rpath), + *args, + **kwargs, + ) + + async def _get_file(self, rpath, lpath, **kwargs): + return await self.fs._get_file(self._join(rpath), lpath, **kwargs) + + def get_file(self, rpath, lpath, **kwargs): + return self.fs.get_file(self._join(rpath), lpath, **kwargs) + + async def _get(self, rpath, *args, **kwargs): + return await self.fs._get(self._join(rpath), *args, **kwargs) + + def get(self, rpath, *args, **kwargs): + return self.fs.get(self._join(rpath), *args, **kwargs) + + async def _isfile(self, path): + return await self.fs._isfile(self._join(path)) + + def isfile(self, path): + return self.fs.isfile(self._join(path)) + + async def _isdir(self, path): + return await self.fs._isdir(self._join(path)) + + def isdir(self, path): + return self.fs.isdir(self._join(path)) + + async def _size(self, path): + return await self.fs._size(self._join(path)) + + def size(self, path): + return self.fs.size(self._join(path)) + + async def _exists(self, path): + return await self.fs._exists(self._join(path)) + + def exists(self, path): + return self.fs.exists(self._join(path)) + + async def _info(self, path, **kwargs): + return await self.fs._info(self._join(path), **kwargs) + + def info(self, path, **kwargs): + return self.fs.info(self._join(path), **kwargs) + + async def _ls(self, path, detail=True, **kwargs): + ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy() + if detail: + out = [] + for entry in ret: + entry = entry.copy() + entry["name"] = self._relpath(entry["name"]) + out.append(entry) + return out + + return self._relpath(ret) + + def ls(self, path, detail=True, **kwargs): + ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy() + if detail: + out = [] + for entry in ret: + entry = entry.copy() + entry["name"] = self._relpath(entry["name"]) + out.append(entry) + return out + + return self._relpath(ret) + + async def _walk(self, path, *args, **kwargs): + async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs): + yield self._relpath(root), dirs, files + + def walk(self, path, *args, **kwargs): + for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs): + yield self._relpath(root), dirs, files + + async def _glob(self, path, **kwargs): + detail = kwargs.get("detail", False) + ret = await self.fs._glob(self._join(path), **kwargs) + if detail: + return {self._relpath(path): info for path, info in ret.items()} + return self._relpath(ret) + + def glob(self, path, **kwargs): + detail = kwargs.get("detail", False) + ret = self.fs.glob(self._join(path), **kwargs) + if detail: + return {self._relpath(path): info for path, info in ret.items()} + return self._relpath(ret) + + async def _du(self, path, *args, **kwargs): + total = kwargs.get("total", True) + ret = 
await self.fs._du(self._join(path), *args, **kwargs) + if total: + return ret + + return {self._relpath(path): size for path, size in ret.items()} + + def du(self, path, *args, **kwargs): + total = kwargs.get("total", True) + ret = self.fs.du(self._join(path), *args, **kwargs) + if total: + return ret + + return {self._relpath(path): size for path, size in ret.items()} + + async def _find(self, path, *args, **kwargs): + detail = kwargs.get("detail", False) + ret = await self.fs._find(self._join(path), *args, **kwargs) + if detail: + return {self._relpath(path): info for path, info in ret.items()} + return self._relpath(ret) + + def find(self, path, *args, **kwargs): + detail = kwargs.get("detail", False) + ret = self.fs.find(self._join(path), *args, **kwargs) + if detail: + return {self._relpath(path): info for path, info in ret.items()} + return self._relpath(ret) + + async def _expand_path(self, path, *args, **kwargs): + return self._relpath( + await self.fs._expand_path(self._join(path), *args, **kwargs) + ) + + def expand_path(self, path, *args, **kwargs): + return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs)) + + async def _mkdir(self, path, *args, **kwargs): + return await self.fs._mkdir(self._join(path), *args, **kwargs) + + def mkdir(self, path, *args, **kwargs): + return self.fs.mkdir(self._join(path), *args, **kwargs) + + async def _makedirs(self, path, *args, **kwargs): + return await self.fs._makedirs(self._join(path), *args, **kwargs) + + def makedirs(self, path, *args, **kwargs): + return self.fs.makedirs(self._join(path), *args, **kwargs) + + def rmdir(self, path): + return self.fs.rmdir(self._join(path)) + + def mv(self, path1, path2, **kwargs): + return self.fs.mv( + self._join(path1), + self._join(path2), + **kwargs, + ) + + def touch(self, path, **kwargs): + return self.fs.touch(self._join(path), **kwargs) + + def created(self, path): + return self.fs.created(self._join(path)) + + def modified(self, path): + return self.fs.modified(self._join(path)) + + def sign(self, path, *args, **kwargs): + return self.fs.sign(self._join(path), *args, **kwargs) + + def __repr__(self): + return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})" + + def open( + self, + path, + *args, + **kwargs, + ): + return self.fs.open( + self._join(path), + *args, + **kwargs, + ) + + async def open_async( + self, + path, + *args, + **kwargs, + ): + return await self.fs.open_async( + self._join(path), + *args, + **kwargs, + ) diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/jupyter.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/jupyter.py new file mode 100644 index 0000000000000000000000000000000000000000..2839f4c1feea56dddd54bdc00f0b884c8461d29e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/jupyter.py @@ -0,0 +1,124 @@ +import base64 +import io +import re + +import requests + +import fsspec + + +class JupyterFileSystem(fsspec.AbstractFileSystem): + """View of the files as seen by a Jupyter server (notebook or lab)""" + + protocol = ("jupyter", "jlab") + + def __init__(self, url, tok=None, **kwargs): + """ + + Parameters + ---------- + url : str + Base URL of the server, like "http://127.0.0.1:8888". May include + token in the string, which is given by the process when starting up + tok : str + If the token is obtained separately, can be given here + kwargs + """ + if "?" 
in url: + if tok is None: + try: + tok = re.findall("token=([a-z0-9]+)", url)[0] + except IndexError as e: + raise ValueError("Could not determine token") from e + url = url.split("?", 1)[0] + self.url = url.rstrip("/") + "/api/contents" + self.session = requests.Session() + if tok: + self.session.headers["Authorization"] = f"token {tok}" + + super().__init__(**kwargs) + + def ls(self, path, detail=True, **kwargs): + path = self._strip_protocol(path) + r = self.session.get(f"{self.url}/{path}") + if r.status_code == 404: + return FileNotFoundError(path) + r.raise_for_status() + out = r.json() + + if out["type"] == "directory": + out = out["content"] + else: + out = [out] + for o in out: + o["name"] = o.pop("path") + o.pop("content") + if o["type"] == "notebook": + o["type"] = "file" + if detail: + return out + return [o["name"] for o in out] + + def cat_file(self, path, start=None, end=None, **kwargs): + path = self._strip_protocol(path) + r = self.session.get(f"{self.url}/{path}") + if r.status_code == 404: + return FileNotFoundError(path) + r.raise_for_status() + out = r.json() + if out["format"] == "text": + # data should be binary + b = out["content"].encode() + else: + b = base64.b64decode(out["content"]) + return b[start:end] + + def pipe_file(self, path, value, **_): + path = self._strip_protocol(path) + json = { + "name": path.rsplit("/", 1)[-1], + "path": path, + "size": len(value), + "content": base64.b64encode(value).decode(), + "format": "base64", + "type": "file", + } + self.session.put(f"{self.url}/{path}", json=json) + + def mkdir(self, path, create_parents=True, **kwargs): + path = self._strip_protocol(path) + if create_parents and "/" in path: + self.mkdir(path.rsplit("/", 1)[0], True) + json = { + "name": path.rsplit("/", 1)[-1], + "path": path, + "size": None, + "content": None, + "type": "directory", + } + self.session.put(f"{self.url}/{path}", json=json) + + def _rm(self, path): + path = self._strip_protocol(path) + self.session.delete(f"{self.url}/{path}") + + def _open(self, path, mode="rb", **kwargs): + path = self._strip_protocol(path) + if mode == "rb": + data = self.cat_file(path) + return io.BytesIO(data) + else: + return SimpleFileWriter(self, path, mode="wb") + + +class SimpleFileWriter(fsspec.spec.AbstractBufferedFile): + def _upload_chunk(self, final=False): + """Never uploads a chunk until file is done + + Not suitable for large files + """ + if final is False: + return False + self.buffer.seek(0) + data = self.buffer.read() + self.fs.pipe_file(self.path, data) diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/local.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/local.py new file mode 100644 index 0000000000000000000000000000000000000000..4c588232fe4260db6bad0603daef02af0cceaa3e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/local.py @@ -0,0 +1,476 @@ +import datetime +import io +import logging +import os +import os.path as osp +import shutil +import stat +import tempfile + +from fsspec import AbstractFileSystem +from fsspec.compression import compr +from fsspec.core import get_compression +from fsspec.utils import isfilelike, stringify_path + +logger = logging.getLogger("fsspec.local") + + +class LocalFileSystem(AbstractFileSystem): + """Interface to files on local storage + + Parameters + ---------- + auto_mkdir: bool + Whether, when opening a file, the directory containing it should + be created (if it doesn't already exist). This is assumed by pyarrow + code. 
+ """ + + root_marker = "/" + protocol = "file", "local" + local_file = True + + def __init__(self, auto_mkdir=False, **kwargs): + super().__init__(**kwargs) + self.auto_mkdir = auto_mkdir + + @property + def fsid(self): + return "local" + + def mkdir(self, path, create_parents=True, **kwargs): + path = self._strip_protocol(path) + if self.exists(path): + raise FileExistsError(path) + if create_parents: + self.makedirs(path, exist_ok=True) + else: + os.mkdir(path, **kwargs) + + def makedirs(self, path, exist_ok=False): + path = self._strip_protocol(path) + os.makedirs(path, exist_ok=exist_ok) + + def rmdir(self, path): + path = self._strip_protocol(path) + os.rmdir(path) + + def ls(self, path, detail=False, **kwargs): + path = self._strip_protocol(path) + info = self.info(path) + if info["type"] == "directory": + with os.scandir(path) as it: + infos = [] + for f in it: + try: + infos.append(self.info(f)) + except FileNotFoundError: + pass + else: + infos = [info] + + if not detail: + return [i["name"] for i in infos] + return infos + + def info(self, path, **kwargs): + if isinstance(path, os.DirEntry): + # scandir DirEntry + out = path.stat(follow_symlinks=False) + link = path.is_symlink() + if path.is_dir(follow_symlinks=False): + t = "directory" + elif path.is_file(follow_symlinks=False): + t = "file" + else: + t = "other" + + size = out.st_size + if link: + try: + out2 = path.stat(follow_symlinks=True) + size = out2.st_size + except OSError: + size = 0 + path = self._strip_protocol(path.path) + else: + # str or path-like + path = self._strip_protocol(path) + out = os.stat(path, follow_symlinks=False) + link = stat.S_ISLNK(out.st_mode) + if link: + out = os.stat(path, follow_symlinks=True) + size = out.st_size + if stat.S_ISDIR(out.st_mode): + t = "directory" + elif stat.S_ISREG(out.st_mode): + t = "file" + else: + t = "other" + result = { + "name": path, + "size": size, + "type": t, + "created": out.st_ctime, + "islink": link, + } + for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]: + result[field] = getattr(out, f"st_{field}") + if link: + result["destination"] = os.readlink(path) + return result + + def lexists(self, path, **kwargs): + return osp.lexists(path) + + def cp_file(self, path1, path2, **kwargs): + path1 = self._strip_protocol(path1) + path2 = self._strip_protocol(path2) + if self.auto_mkdir: + self.makedirs(self._parent(path2), exist_ok=True) + if self.isfile(path1): + shutil.copyfile(path1, path2) + elif self.isdir(path1): + self.mkdirs(path2, exist_ok=True) + else: + raise FileNotFoundError(path1) + + def isfile(self, path): + path = self._strip_protocol(path) + return os.path.isfile(path) + + def isdir(self, path): + path = self._strip_protocol(path) + return os.path.isdir(path) + + def get_file(self, path1, path2, callback=None, **kwargs): + if isfilelike(path2): + with open(path1, "rb") as f: + shutil.copyfileobj(f, path2) + else: + return self.cp_file(path1, path2, **kwargs) + + def put_file(self, path1, path2, callback=None, **kwargs): + return self.cp_file(path1, path2, **kwargs) + + def mv(self, path1, path2, **kwargs): + path1 = self._strip_protocol(path1) + path2 = self._strip_protocol(path2) + shutil.move(path1, path2) + + def link(self, src, dst, **kwargs): + src = self._strip_protocol(src) + dst = self._strip_protocol(dst) + os.link(src, dst, **kwargs) + + def symlink(self, src, dst, **kwargs): + src = self._strip_protocol(src) + dst = self._strip_protocol(dst) + os.symlink(src, dst, **kwargs) + + def islink(self, path) -> bool: + return 
os.path.islink(self._strip_protocol(path)) + + def rm_file(self, path): + os.remove(self._strip_protocol(path)) + + def rm(self, path, recursive=False, maxdepth=None): + if not isinstance(path, list): + path = [path] + + for p in path: + p = self._strip_protocol(p) + if self.isdir(p): + if not recursive: + raise ValueError("Cannot delete directory, set recursive=True") + if osp.abspath(p) == os.getcwd(): + raise ValueError("Cannot delete current working directory") + shutil.rmtree(p) + else: + os.remove(p) + + def unstrip_protocol(self, name): + name = self._strip_protocol(name) # normalise for local/win/... + return f"file://{name}" + + def _open(self, path, mode="rb", block_size=None, **kwargs): + path = self._strip_protocol(path) + if self.auto_mkdir and "w" in mode: + self.makedirs(self._parent(path), exist_ok=True) + return LocalFileOpener(path, mode, fs=self, **kwargs) + + def touch(self, path, truncate=True, **kwargs): + path = self._strip_protocol(path) + if self.auto_mkdir: + self.makedirs(self._parent(path), exist_ok=True) + if self.exists(path): + os.utime(path, None) + else: + open(path, "a").close() + if truncate: + os.truncate(path, 0) + + def created(self, path): + info = self.info(path=path) + return datetime.datetime.fromtimestamp( + info["created"], tz=datetime.timezone.utc + ) + + def modified(self, path): + info = self.info(path=path) + return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc) + + @classmethod + def _parent(cls, path): + path = cls._strip_protocol(path) + if os.sep == "/": + # posix native + return path.rsplit("/", 1)[0] or "/" + else: + # NT + path_ = path.rsplit("/", 1)[0] + if len(path_) <= 3: + if path_[1:2] == ":": + # nt root (something like c:/) + return path_[0] + ":/" + # More cases may be required here + return path_ + + @classmethod + def _strip_protocol(cls, path): + path = stringify_path(path) + if path.startswith("file://"): + path = path[7:] + elif path.startswith("file:"): + path = path[5:] + elif path.startswith("local://"): + path = path[8:] + elif path.startswith("local:"): + path = path[6:] + + path = make_path_posix(path) + if os.sep != "/": + # This code-path is a stripped down version of + # > drive, path = ntpath.splitdrive(path) + if path[1:2] == ":": + # Absolute drive-letter path, e.g. X:\Windows + # Relative path with drive, e.g. X:Windows + drive, path = path[:2], path[2:] + elif path[:2] == "//": + # UNC drives, e.g. \\server\share or \\?\UNC\server\share + # Device drives, e.g. \\.\device or \\?\device + if (index1 := path.find("/", 2)) == -1 or ( + index2 := path.find("/", index1 + 1) + ) == -1: + drive, path = path, "" + else: + drive, path = path[:index2], path[index2:] + else: + # Relative path, e.g. Windows + drive = "" + + path = path.rstrip("/") or cls.root_marker + return drive + path + + else: + return path.rstrip("/") or cls.root_marker + + def _isfilestore(self): + # Inheriting from DaskFileSystem makes this False (S3, etc. were) + # the original motivation. But we are a posix-like file system. 
+ # See https://github.com/dask/dask/issues/5526 + return True + + def chmod(self, path, mode): + path = stringify_path(path) + return os.chmod(path, mode) + + +def make_path_posix(path): + """Make path generic and absolute for current OS""" + if not isinstance(path, str): + if isinstance(path, (list, set, tuple)): + return type(path)(make_path_posix(p) for p in path) + else: + path = stringify_path(path) + if not isinstance(path, str): + raise TypeError(f"could not convert {path!r} to string") + if os.sep == "/": + # Native posix + if path.startswith("/"): + # most common fast case for posix + return path + elif path.startswith("~"): + return osp.expanduser(path) + elif path.startswith("./"): + path = path[2:] + elif path == ".": + path = "" + return f"{os.getcwd()}/{path}" + else: + # NT handling + if path[0:1] == "/" and path[2:3] == ":": + # path is like "/c:/local/path" + path = path[1:] + if path[1:2] == ":": + # windows full path like "C:\\local\\path" + if len(path) <= 3: + # nt root (something like c:/) + return path[0] + ":/" + path = path.replace("\\", "/") + return path + elif path[0:1] == "~": + return make_path_posix(osp.expanduser(path)) + elif path.startswith(("\\\\", "//")): + # windows UNC/DFS-style paths + return "//" + path[2:].replace("\\", "/") + elif path.startswith(("\\", "/")): + # windows relative path with root + path = path.replace("\\", "/") + return f"{osp.splitdrive(os.getcwd())[0]}{path}" + else: + path = path.replace("\\", "/") + if path.startswith("./"): + path = path[2:] + elif path == ".": + path = "" + return f"{make_path_posix(os.getcwd())}/{path}" + + +def trailing_sep(path): + """Return True if the path ends with a path separator. + + A forward slash is always considered a path separator, even on Operating + Systems that normally use a backslash. + """ + # TODO: if all incoming paths were posix-compliant then separator would + # always be a forward slash, simplifying this function. + # See https://github.com/fsspec/filesystem_spec/pull/1250 + return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep)) + + +class LocalFileOpener(io.IOBase): + def __init__( + self, path, mode, autocommit=True, fs=None, compression=None, **kwargs + ): + logger.debug("open file: %s", path) + self.path = path + self.mode = mode + self.fs = fs + self.f = None + self.autocommit = autocommit + self.compression = get_compression(path, compression) + self.blocksize = io.DEFAULT_BUFFER_SIZE + self._open() + + def _open(self): + if self.f is None or self.f.closed: + if self.autocommit or "w" not in self.mode: + self.f = open(self.path, mode=self.mode) + if self.compression: + compress = compr[self.compression] + self.f = compress(self.f, mode=self.mode) + else: + # TODO: check if path is writable? 
+ i, name = tempfile.mkstemp() + os.close(i) # we want normal open and normal buffered file + self.temp = name + self.f = open(name, mode=self.mode) + if "w" not in self.mode: + self.size = self.f.seek(0, 2) + self.f.seek(0) + self.f.size = self.size + + def _fetch_range(self, start, end): + # probably only used by cached FS + if "r" not in self.mode: + raise ValueError + self._open() + self.f.seek(start) + return self.f.read(end - start) + + def __setstate__(self, state): + self.f = None + loc = state.pop("loc", None) + self.__dict__.update(state) + if "r" in state["mode"]: + self.f = None + self._open() + self.f.seek(loc) + + def __getstate__(self): + d = self.__dict__.copy() + d.pop("f") + if "r" in self.mode: + d["loc"] = self.f.tell() + else: + if not self.f.closed: + raise ValueError("Cannot serialise open write-mode local file") + return d + + def commit(self): + if self.autocommit: + raise RuntimeError("Can only commit if not already set to autocommit") + shutil.move(self.temp, self.path) + + def discard(self): + if self.autocommit: + raise RuntimeError("Cannot discard if set to autocommit") + os.remove(self.temp) + + def readable(self) -> bool: + return True + + def writable(self) -> bool: + return "r" not in self.mode + + def read(self, *args, **kwargs): + return self.f.read(*args, **kwargs) + + def write(self, *args, **kwargs): + return self.f.write(*args, **kwargs) + + def tell(self, *args, **kwargs): + return self.f.tell(*args, **kwargs) + + def seek(self, *args, **kwargs): + return self.f.seek(*args, **kwargs) + + def seekable(self, *args, **kwargs): + return self.f.seekable(*args, **kwargs) + + def readline(self, *args, **kwargs): + return self.f.readline(*args, **kwargs) + + def readlines(self, *args, **kwargs): + return self.f.readlines(*args, **kwargs) + + def close(self): + return self.f.close() + + def truncate(self, size=None) -> int: + return self.f.truncate(size) + + @property + def closed(self): + return self.f.closed + + def fileno(self): + return self.raw.fileno() + + def flush(self) -> None: + self.f.flush() + + def __iter__(self): + return self.f.__iter__() + + def __getattr__(self, item): + return getattr(self.f, item) + + def __enter__(self): + self._incontext = True + return self + + def __exit__(self, exc_type, exc_value, traceback): + self._incontext = False + self.f.__exit__(exc_type, exc_value, traceback) diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/reference.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/reference.py new file mode 100644 index 0000000000000000000000000000000000000000..9cf529b2329d3d62a94c162e7aacea6ba4699c8e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/reference.py @@ -0,0 +1,1306 @@ +import base64 +import collections +import io +import itertools +import logging +import math +import os +from functools import lru_cache +from itertools import chain +from typing import TYPE_CHECKING, Literal + +import fsspec.core +from fsspec.spec import AbstractBufferedFile + +try: + import ujson as json +except ImportError: + if not TYPE_CHECKING: + import json + +from fsspec.asyn import AsyncFileSystem +from fsspec.callbacks import DEFAULT_CALLBACK +from fsspec.core import filesystem, open, split_protocol +from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper +from fsspec.utils import isfilelike, merge_offset_ranges, other_paths + +logger = logging.getLogger("fsspec.reference") + + +class ReferenceNotReachable(RuntimeError): + def __init__(self, reference, target, 
*args): + super().__init__(*args) + self.reference = reference + self.target = target + + def __str__(self): + return f'Reference "{self.reference}" failed to fetch target {self.target}' + + +def _first(d): + return next(iter(d.values())) + + +def _prot_in_references(path, references): + ref = references.get(path) + if isinstance(ref, (list, tuple)) and isinstance(ref[0], str): + return split_protocol(ref[0])[0] if ref[0] else ref[0] + + +def _protocol_groups(paths, references): + if isinstance(paths, str): + return {_prot_in_references(paths, references): [paths]} + out = {} + for path in paths: + protocol = _prot_in_references(path, references) + out.setdefault(protocol, []).append(path) + return out + + +class RefsValuesView(collections.abc.ValuesView): + def __iter__(self): + for val in self._mapping.zmetadata.values(): + yield json.dumps(val).encode() + yield from self._mapping._items.values() + for field in self._mapping.listdir(): + chunk_sizes = self._mapping._get_chunk_sizes(field) + if len(chunk_sizes) == 0: + yield self._mapping[field + "/0"] + continue + yield from self._mapping._generate_all_records(field) + + +class RefsItemsView(collections.abc.ItemsView): + def __iter__(self): + return zip(self._mapping.keys(), self._mapping.values()) + + +def ravel_multi_index(idx, sizes): + val = 0 + mult = 1 + for i, s in zip(idx[::-1], sizes[::-1]): + val += i * mult + mult *= s + return val + + +class LazyReferenceMapper(collections.abc.MutableMapping): + """This interface can be used to read/write references from Parquet stores. + It is not intended for other types of references. + It can be used with Kerchunk's MultiZarrToZarr method to combine + references into a parquet store. + Examples of this use-case can be found here: + https://fsspec.github.io/kerchunk/advanced.html?highlight=parquet#parquet-storage""" + + # import is class level to prevent numpy dep requirement for fsspec + @property + def np(self): + import numpy as np + + return np + + @property + def pd(self): + import pandas as pd + + return pd + + def __init__( + self, + root, + fs=None, + out_root=None, + cache_size=128, + categorical_threshold=10, + engine: Literal["fastparquet", "pyarrow"] = "fastparquet", + ): + """ + + This instance will be writable, storing changes in memory until full partitions + are accumulated or .flush() is called. + + To create an empty lazy store, use .create() + + Parameters + ---------- + root : str + Root of parquet store + fs : fsspec.AbstractFileSystem + fsspec filesystem object, default is local filesystem. + cache_size : int, default=128 + Maximum size of LRU cache, where cache_size*record_size denotes + the total number of references that can be loaded in memory at once. + categorical_threshold : int + Encode urls as pandas.Categorical to reduce memory footprint if the ratio + of the number of unique urls to total number of refs for each variable + is greater than or equal to this number. (default 10) + engine: Literal["fastparquet","pyarrow"] + Engine choice for reading parquet files. 
(default is "fastparquet") + """ + + self.root = root + self.chunk_sizes = {} + self.out_root = out_root or self.root + self.cat_thresh = categorical_threshold + self.engine = engine + self.cache_size = cache_size + self.url = self.root + "/{field}/refs.{record}.parq" + # TODO: derive fs from `root` + self.fs = fsspec.filesystem("file") if fs is None else fs + + from importlib.util import find_spec + + if self.engine == "pyarrow" and find_spec("pyarrow") is None: + raise ImportError("engine choice `pyarrow` is not installed.") + + def __getattr__(self, item): + if item in ("_items", "record_size", "zmetadata"): + self.setup() + # avoid possible recursion if setup fails somehow + return self.__dict__[item] + raise AttributeError(item) + + def setup(self): + self._items = {} + self._items[".zmetadata"] = self.fs.cat_file( + "/".join([self.root, ".zmetadata"]) + ) + met = json.loads(self._items[".zmetadata"]) + self.record_size = met["record_size"] + self.zmetadata = met["metadata"] + + # Define function to open and decompress refs + @lru_cache(maxsize=self.cache_size) + def open_refs(field, record): + """cached parquet file loader""" + path = self.url.format(field=field, record=record) + data = io.BytesIO(self.fs.cat_file(path)) + try: + df = self.pd.read_parquet(data, engine=self.engine) + refs = {c: df[c].to_numpy() for c in df.columns} + except OSError: + refs = None + return refs + + self.open_refs = open_refs + + @staticmethod + def create(root, storage_options=None, fs=None, record_size=10000, **kwargs): + """Make empty parquet reference set + + First deletes the contents of the given directory, if it exists. + + Parameters + ---------- + root: str + Directory to contain the output; will be created + storage_options: dict | None + For making the filesystem to use for writing is fs is None + fs: FileSystem | None + Filesystem for writing + record_size: int + Number of references per parquet file + kwargs: passed to __init__ + + Returns + ------- + LazyReferenceMapper instance + """ + met = {"metadata": {}, "record_size": record_size} + if fs is None: + fs, root = fsspec.core.url_to_fs(root, **(storage_options or {})) + if fs.exists(root): + fs.rm(root, recursive=True) + fs.makedirs(root, exist_ok=True) + fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode()) + return LazyReferenceMapper(root, fs, **kwargs) + + @lru_cache() + def listdir(self): + """List top-level directories""" + dirs = (p.rsplit("/", 1)[0] for p in self.zmetadata if not p.startswith(".z")) + return set(dirs) + + def ls(self, path="", detail=True): + """Shortcut file listings""" + path = path.rstrip("/") + pathdash = path + "/" if path else "" + dirnames = self.listdir() + dirs = [ + d + for d in dirnames + if d.startswith(pathdash) and "/" not in d.lstrip(pathdash) + ] + if dirs: + others = { + f + for f in chain( + [".zmetadata"], + (name for name in self.zmetadata), + (name for name in self._items), + ) + if f.startswith(pathdash) and "/" not in f.lstrip(pathdash) + } + if detail is False: + others.update(dirs) + return sorted(others) + dirinfo = [{"name": name, "type": "directory", "size": 0} for name in dirs] + fileinfo = [ + { + "name": name, + "type": "file", + "size": len( + json.dumps(self.zmetadata[name]) + if name in self.zmetadata + else self._items[name] + ), + } + for name in others + ] + return sorted(dirinfo + fileinfo, key=lambda s: s["name"]) + field = path + others = set( + [name for name in self.zmetadata if name.startswith(f"{path}/")] + + [name for name in self._items if 
name.startswith(f"{path}/")] + ) + fileinfo = [ + { + "name": name, + "type": "file", + "size": len( + json.dumps(self.zmetadata[name]) + if name in self.zmetadata + else self._items[name] + ), + } + for name in others + ] + keys = self._keys_in_field(field) + + if detail is False: + return list(others) + list(keys) + recs = self._generate_all_records(field) + recinfo = [ + {"name": name, "type": "file", "size": rec[-1]} + for name, rec in zip(keys, recs) + if rec[0] # filters out path==None, deleted/missing + ] + return fileinfo + recinfo + + def _load_one_key(self, key): + """Get the reference for one key + + Returns bytes, one-element list or three-element list. + """ + if key in self._items: + return self._items[key] + elif key in self.zmetadata: + return json.dumps(self.zmetadata[key]).encode() + elif "/" not in key or self._is_meta(key): + raise KeyError(key) + field, _ = key.rsplit("/", 1) + record, ri, chunk_size = self._key_to_record(key) + maybe = self._items.get((field, record), {}).get(ri, False) + if maybe is None: + # explicitly deleted + raise KeyError + elif maybe: + return maybe + elif chunk_size == 0: + return b"" + + # Chunk keys can be loaded from row group and cached in LRU cache + try: + refs = self.open_refs(field, record) + except (ValueError, TypeError, FileNotFoundError) as exc: + raise KeyError(key) from exc + columns = ["path", "offset", "size", "raw"] + selection = [refs[c][ri] if c in refs else None for c in columns] + raw = selection[-1] + if raw is not None: + return raw + if selection[0] is None: + raise KeyError("This reference does not exist or has been deleted") + if selection[1:3] == [0, 0]: + # URL only + return selection[:1] + # URL, offset, size + return selection[:3] + + @lru_cache(4096) + def _key_to_record(self, key): + """Details needed to construct a reference for one key""" + field, chunk = key.rsplit("/", 1) + chunk_sizes = self._get_chunk_sizes(field) + if len(chunk_sizes) == 0: + return 0, 0, 0 + chunk_idx = [int(c) for c in chunk.split(".")] + chunk_number = ravel_multi_index(chunk_idx, chunk_sizes) + record = chunk_number // self.record_size + ri = chunk_number % self.record_size + return record, ri, len(chunk_sizes) + + def _get_chunk_sizes(self, field): + """The number of chunks along each axis for a given field""" + if field not in self.chunk_sizes: + zarray = self.zmetadata[f"{field}/.zarray"] + size_ratio = [ + math.ceil(s / c) for s, c in zip(zarray["shape"], zarray["chunks"]) + ] + self.chunk_sizes[field] = size_ratio or [1] + return self.chunk_sizes[field] + + def _generate_record(self, field, record): + """The references for a given parquet file of a given field""" + refs = self.open_refs(field, record) + it = iter(zip(*refs.values())) + if len(refs) == 3: + # All urls + return (list(t) for t in it) + elif len(refs) == 1: + # All raws + return refs["raw"] + else: + # Mix of urls and raws + return (list(t[:3]) if not t[3] else t[3] for t in it) + + def _generate_all_records(self, field): + """Load all the references within a field by iterating over the parquet files""" + nrec = 1 + for ch in self._get_chunk_sizes(field): + nrec *= ch + nrec = math.ceil(nrec / self.record_size) + for record in range(nrec): + yield from self._generate_record(field, record) + + def values(self): + return RefsValuesView(self) + + def items(self): + return RefsItemsView(self) + + def __hash__(self): + return id(self) + + def __getitem__(self, key): + return self._load_one_key(key) + + def __setitem__(self, key, value): + if "/" in key and not 
self._is_meta(key): + field, chunk = key.rsplit("/", 1) + record, i, _ = self._key_to_record(key) + subdict = self._items.setdefault((field, record), {}) + subdict[i] = value + if len(subdict) == self.record_size: + self.write(field, record) + else: + # metadata or top-level + if hasattr(value, "to_bytes"): + val = value.to_bytes().decode() + elif isinstance(value, bytes): + val = value.decode() + else: + val = value + self._items[key] = val + new_value = json.loads(val) + self.zmetadata[key] = {**self.zmetadata.get(key, {}), **new_value} + + @staticmethod + def _is_meta(key): + return key.startswith(".z") or "/.z" in key + + def __delitem__(self, key): + if key in self._items: + del self._items[key] + elif key in self.zmetadata: + del self.zmetadata[key] + else: + if "/" in key and not self._is_meta(key): + field, _ = key.rsplit("/", 1) + record, i, _ = self._key_to_record(key) + subdict = self._items.setdefault((field, record), {}) + subdict[i] = None + if len(subdict) == self.record_size: + self.write(field, record) + else: + # metadata or top-level + self._items[key] = None + + def write(self, field, record, base_url=None, storage_options=None): + # extra requirements if writing + import kerchunk.df + import numpy as np + import pandas as pd + + partition = self._items[(field, record)] + original = False + if len(partition) < self.record_size: + try: + original = self.open_refs(field, record) + except OSError: + pass + + if original: + paths = original["path"] + offsets = original["offset"] + sizes = original["size"] + raws = original["raw"] + else: + paths = np.full(self.record_size, np.nan, dtype="O") + offsets = np.zeros(self.record_size, dtype="int64") + sizes = np.zeros(self.record_size, dtype="int64") + raws = np.full(self.record_size, np.nan, dtype="O") + for j, data in partition.items(): + if isinstance(data, list): + if ( + str(paths.dtype) == "category" + and data[0] not in paths.dtype.categories + ): + paths = paths.add_categories(data[0]) + paths[j] = data[0] + if len(data) > 1: + offsets[j] = data[1] + sizes[j] = data[2] + elif data is None: + # delete + paths[j] = None + offsets[j] = 0 + sizes[j] = 0 + raws[j] = None + else: + # this is the only call into kerchunk, could remove + raws[j] = kerchunk.df._proc_raw(data) + # TODO: only save needed columns + df = pd.DataFrame( + { + "path": paths, + "offset": offsets, + "size": sizes, + "raw": raws, + }, + copy=False, + ) + if df.path.count() / (df.path.nunique() or 1) > self.cat_thresh: + df["path"] = df["path"].astype("category") + object_encoding = {"raw": "bytes", "path": "utf8"} + has_nulls = ["path", "raw"] + + fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq" + self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True) + + if self.engine == "pyarrow": + df_backend_kwargs = {"write_statistics": False} + elif self.engine == "fastparquet": + df_backend_kwargs = { + "stats": False, + "object_encoding": object_encoding, + "has_nulls": has_nulls, + } + else: + raise NotImplementedError(f"{self.engine} not supported") + + df.to_parquet( + fn, + engine=self.engine, + storage_options=storage_options + or getattr(self.fs, "storage_options", None), + compression="zstd", + index=False, + **df_backend_kwargs, + ) + + partition.clear() + self._items.pop((field, record)) + + def flush(self, base_url=None, storage_options=None): + """Output any modified or deleted keys + + Parameters + ---------- + base_url: str + Location of the output + """ + + # write what we have so far and clear sub chunks + for thing in 
list(self._items): + if isinstance(thing, tuple): + field, record = thing + self.write( + field, + record, + base_url=base_url, + storage_options=storage_options, + ) + + # gather .zmetadata from self._items and write that too + for k in list(self._items): + if k != ".zmetadata" and ".z" in k: + self.zmetadata[k] = json.loads(self._items.pop(k)) + met = {"metadata": self.zmetadata, "record_size": self.record_size} + self._items.clear() + self._items[".zmetadata"] = json.dumps(met).encode() + self.fs.pipe( + "/".join([base_url or self.out_root, ".zmetadata"]), + self._items[".zmetadata"], + ) + + # TODO: only clear those that we wrote to? + self.open_refs.cache_clear() + + def __len__(self): + # Caveat: This counts expected references, not actual - but is fast + count = 0 + for field in self.listdir(): + if field.startswith("."): + count += 1 + else: + count += math.prod(self._get_chunk_sizes(field)) + count += len(self.zmetadata) # all metadata keys + # any other files not in reference partitions + count += sum(1 for _ in self._items if not isinstance(_, tuple)) + return count + + def __iter__(self): + # Caveat: returns only existing keys, so the number of these does not + # match len(self) + metas = set(self.zmetadata) + metas.update(self._items) + for bit in metas: + if isinstance(bit, str): + yield bit + for field in self.listdir(): + for k in self._keys_in_field(field): + if k in self: + yield k + + def __contains__(self, item): + try: + self._load_one_key(item) + return True + except KeyError: + return False + + def _keys_in_field(self, field): + """List key names in given field + + Produces strings like "field/x.y" appropriate from the chunking of the array + """ + chunk_sizes = self._get_chunk_sizes(field) + if len(chunk_sizes) == 0: + yield field + "/0" + return + inds = itertools.product(*(range(i) for i in chunk_sizes)) + for ind in inds: + yield field + "/" + ".".join([str(c) for c in ind]) + + +class ReferenceFileSystem(AsyncFileSystem): + """View byte ranges of some other file as a file system + Initial version: single file system target, which must support + async, and must allow start and end args in _cat_file. Later versions + may allow multiple arbitrary URLs for the targets. + This FileSystem is read-only. It is designed to be used with async + targets (for now). We do not get original file details from the target FS. + Configuration is by passing a dict of references at init, or a URL to + a JSON file containing the same; this dict + can also contain concrete data for some set of paths. + Reference dict format: + {path0: bytes_data, path1: (target_url, offset, size)} + https://github.com/fsspec/kerchunk/blob/main/README.md + """ + + protocol = "reference" + cachable = False + + def __init__( + self, + fo, + target=None, + ref_storage_args=None, + target_protocol=None, + target_options=None, + remote_protocol=None, + remote_options=None, + fs=None, + template_overrides=None, + simple_templates=True, + max_gap=64_000, + max_block=256_000_000, + cache_size=128, + **kwargs, + ): + """ + Parameters + ---------- + fo : dict or str + The set of references to use for this instance, with a structure as above. + If str referencing a JSON file, will use fsspec.open, in conjunction + with target_options and target_protocol to open and parse JSON at this + location. If a directory, then assume references are a set of parquet + files to be loaded lazily. 
+ target : str + For any references having target_url as None, this is the default file + target to use + ref_storage_args : dict + If references is a str, use these kwargs for loading the JSON file. + Deprecated: use target_options instead. + target_protocol : str + Used for loading the reference file, if it is a path. If None, protocol + will be derived from the given path + target_options : dict + Extra FS options for loading the reference file ``fo``, if given as a path + remote_protocol : str + The protocol of the filesystem on which the references will be evaluated + (unless fs is provided). If not given, will be derived from the first + URL that has a protocol in the templates or in the references, in that + order. + remote_options : dict + kwargs to go with remote_protocol + fs : AbstractFileSystem | dict(str, (AbstractFileSystem | dict)) + Directly provide a file system(s): + - a single filesystem instance + - a dict of protocol:filesystem, where each value is either a filesystem + instance, or a dict of kwargs that can be used to create in + instance for the given protocol + + If this is given, remote_options and remote_protocol are ignored. + template_overrides : dict + Swap out any templates in the references file with these - useful for + testing. + simple_templates: bool + Whether templates can be processed with simple replace (True) or if + jinja is needed (False, much slower). All reference sets produced by + ``kerchunk`` are simple in this sense, but the spec allows for complex. + max_gap, max_block: int + For merging multiple concurrent requests to the same remote file. + Neighboring byte ranges will only be merged when their + inter-range gap is <= ``max_gap``. Default is 64KB. Set to 0 + to only merge when it requires no extra bytes. Pass a negative + number to disable merging, appropriate for local target files. + Neighboring byte ranges will only be merged when the size of + the aggregated range is <= ``max_block``. Default is 256MB. + cache_size : int + Maximum size of LRU cache, where cache_size*record_size denotes + the total number of references that can be loaded in memory at once. + Only used for lazily loaded references. 
+ kwargs : passed to parent class + """ + super().__init__(**kwargs) + self.target = target + self.template_overrides = template_overrides + self.simple_templates = simple_templates + self.templates = {} + self.fss = {} + self._dircache = {} + self.max_gap = max_gap + self.max_block = max_block + if isinstance(fo, str): + dic = dict( + **(ref_storage_args or target_options or {}), protocol=target_protocol + ) + ref_fs, fo2 = fsspec.core.url_to_fs(fo, **dic) + if ref_fs.isfile(fo2): + # text JSON + with fsspec.open(fo, "rb", **dic) as f: + logger.info("Read reference from URL %s", fo) + text = json.load(f) + self._process_references(text, template_overrides) + else: + # Lazy parquet refs + logger.info("Open lazy reference dict from URL %s", fo) + self.references = LazyReferenceMapper( + fo2, + fs=ref_fs, + cache_size=cache_size, + ) + else: + # dictionaries + self._process_references(fo, template_overrides) + if isinstance(fs, dict): + self.fss = { + k: ( + fsspec.filesystem(k.split(":", 1)[0], **opts) + if isinstance(opts, dict) + else opts + ) + for k, opts in fs.items() + } + if None not in self.fss: + self.fss[None] = filesystem("file") + return + if fs is not None: + # single remote FS + remote_protocol = ( + fs.protocol[0] if isinstance(fs.protocol, tuple) else fs.protocol + ) + self.fss[remote_protocol] = fs + + if remote_protocol is None: + # get single protocol from any templates + for ref in self.templates.values(): + if callable(ref): + ref = ref() + protocol, _ = fsspec.core.split_protocol(ref) + if protocol and protocol not in self.fss: + fs = filesystem(protocol, **(remote_options or {})) + self.fss[protocol] = fs + if remote_protocol is None: + # get single protocol from references + # TODO: warning here, since this can be very expensive? 
+ for ref in self.references.values(): + if callable(ref): + ref = ref() + if isinstance(ref, list) and ref[0]: + protocol, _ = fsspec.core.split_protocol(ref[0]) + if protocol not in self.fss: + fs = filesystem(protocol, **(remote_options or {})) + self.fss[protocol] = fs + # only use first remote URL + break + + if remote_protocol and remote_protocol not in self.fss: + fs = filesystem(remote_protocol, **(remote_options or {})) + self.fss[remote_protocol] = fs + + self.fss[None] = fs or filesystem("file") # default one + # Wrap any non-async filesystems to ensure async methods are available below + for k, f in self.fss.items(): + if not f.async_impl: + self.fss[k] = AsyncFileSystemWrapper(f) + elif self.asynchronous ^ f.asynchronous: + raise ValueError( + "Reference-FS's target filesystem must have same value" + "of asynchronous" + ) + + def _cat_common(self, path, start=None, end=None): + path = self._strip_protocol(path) + logger.debug(f"cat: {path}") + try: + part = self.references[path] + except KeyError as exc: + raise FileNotFoundError(path) from exc + if isinstance(part, str): + part = part.encode() + if hasattr(part, "to_bytes"): + part = part.to_bytes() + if isinstance(part, bytes): + logger.debug(f"Reference: {path}, type bytes") + if part.startswith(b"base64:"): + part = base64.b64decode(part[7:]) + return part, None, None + + if len(part) == 1: + logger.debug(f"Reference: {path}, whole file => {part}") + url = part[0] + start1, end1 = start, end + else: + url, start0, size = part + logger.debug(f"Reference: {path} => {url}, offset {start0}, size {size}") + end0 = start0 + size + + if start is not None: + if start >= 0: + start1 = start0 + start + else: + start1 = end0 + start + else: + start1 = start0 + if end is not None: + if end >= 0: + end1 = start0 + end + else: + end1 = end0 + end + else: + end1 = end0 + if url is None: + url = self.target + return url, start1, end1 + + async def _cat_file(self, path, start=None, end=None, **kwargs): + part_or_url, start0, end0 = self._cat_common(path, start=start, end=end) + if isinstance(part_or_url, bytes): + return part_or_url[start:end] + protocol, _ = split_protocol(part_or_url) + try: + return await self.fss[protocol]._cat_file( + part_or_url, start=start0, end=end0 + ) + except Exception as e: + raise ReferenceNotReachable(path, part_or_url) from e + + def cat_file(self, path, start=None, end=None, **kwargs): + part_or_url, start0, end0 = self._cat_common(path, start=start, end=end) + if isinstance(part_or_url, bytes): + return part_or_url[start:end] + protocol, _ = split_protocol(part_or_url) + try: + return self.fss[protocol].cat_file(part_or_url, start=start0, end=end0) + except Exception as e: + raise ReferenceNotReachable(path, part_or_url) from e + + def pipe_file(self, path, value, **_): + """Temporarily add binary data or reference as a file""" + self.references[path] = value + + async def _get_file(self, rpath, lpath, **kwargs): + if self.isdir(rpath): + return os.makedirs(lpath, exist_ok=True) + data = await self._cat_file(rpath) + with open(lpath, "wb") as f: + f.write(data) + + def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, **kwargs): + if self.isdir(rpath): + return os.makedirs(lpath, exist_ok=True) + data = self.cat_file(rpath, **kwargs) + callback.set_size(len(data)) + if isfilelike(lpath): + lpath.write(data) + else: + with open(lpath, "wb") as f: + f.write(data) + callback.absolute_update(len(data)) + + def get(self, rpath, lpath, recursive=False, **kwargs): + if recursive: + # trigger directory 
build + self.ls("") + rpath = self.expand_path(rpath, recursive=recursive) + fs = fsspec.filesystem("file", auto_mkdir=True) + targets = other_paths(rpath, lpath) + if recursive: + data = self.cat([r for r in rpath if not self.isdir(r)]) + else: + data = self.cat(rpath) + for remote, local in zip(rpath, targets): + if remote in data: + fs.pipe_file(local, data[remote]) + + def cat(self, path, recursive=False, on_error="raise", **kwargs): + if isinstance(path, str) and recursive: + raise NotImplementedError + if isinstance(path, list) and (recursive or any("*" in p for p in path)): + raise NotImplementedError + # TODO: if references is lazy, pre-fetch all paths in batch before access + proto_dict = _protocol_groups(path, self.references) + out = {} + for proto, paths in proto_dict.items(): + fs = self.fss[proto] + urls, starts, ends, valid_paths = [], [], [], [] + for p in paths: + # find references or label not-found. Early exit if any not + # found and on_error is "raise" + try: + u, s, e = self._cat_common(p) + if not isinstance(u, (bytes, str)): + # nan/None from parquet + continue + except FileNotFoundError as err: + if on_error == "raise": + raise + if on_error != "omit": + out[p] = err + else: + urls.append(u) + starts.append(s) + ends.append(e) + valid_paths.append(p) + + # process references into form for merging + urls2 = [] + starts2 = [] + ends2 = [] + paths2 = [] + whole_files = set() + for u, s, e, p in zip(urls, starts, ends, valid_paths): + if isinstance(u, bytes): + # data + out[p] = u + elif s is None: + # whole file - limits are None, None, but no further + # entries take for this file + whole_files.add(u) + urls2.append(u) + starts2.append(s) + ends2.append(e) + paths2.append(p) + for u, s, e, p in zip(urls, starts, ends, valid_paths): + # second run to account for files that are to be loaded whole + if s is not None and u not in whole_files: + urls2.append(u) + starts2.append(s) + ends2.append(e) + paths2.append(p) + + # merge and fetch consolidated ranges + new_paths, new_starts, new_ends = merge_offset_ranges( + list(urls2), + list(starts2), + list(ends2), + sort=True, + max_gap=self.max_gap, + max_block=self.max_block, + ) + bytes_out = fs.cat_ranges(new_paths, new_starts, new_ends) + + # unbundle from merged bytes - simple approach + for u, s, e, p in zip(urls, starts, ends, valid_paths): + if p in out: + continue # was bytes, already handled + for np, ns, ne, b in zip(new_paths, new_starts, new_ends, bytes_out): + if np == u and (ns is None or ne is None): + if isinstance(b, Exception): + out[p] = b + else: + out[p] = b[s:e] + elif np == u and s >= ns and e <= ne: + if isinstance(b, Exception): + out[p] = b + else: + out[p] = b[s - ns : (e - ne) or None] + + for k, v in out.copy().items(): + # these were valid references, but fetch failed, so transform exc + if isinstance(v, Exception) and k in self.references: + ex = out[k] + new_ex = ReferenceNotReachable(k, self.references[k]) + new_ex.__cause__ = ex + if on_error == "raise": + raise new_ex + elif on_error != "omit": + out[k] = new_ex + + if len(out) == 1 and isinstance(path, str) and "*" not in path: + return _first(out) + return out + + def _process_references(self, references, template_overrides=None): + vers = references.get("version", None) + if vers is None: + self._process_references0(references) + elif vers == 1: + self._process_references1(references, template_overrides=template_overrides) + else: + raise ValueError(f"Unknown reference spec version: {vers}") + # TODO: we make dircache by iterating over 
all entries, but for Spec >= 1, + # can replace with programmatic. Is it even needed for mapper interface? + + def _process_references0(self, references): + """Make reference dict for Spec Version 0""" + if isinstance(references, dict): + # do not do this for lazy/parquet backend, which will not make dicts, + # but must remain writable in the original object + references = { + key: json.dumps(val) if isinstance(val, dict) else val + for key, val in references.items() + } + self.references = references + + def _process_references1(self, references, template_overrides=None): + if not self.simple_templates or self.templates: + import jinja2 + self.references = {} + self._process_templates(references.get("templates", {})) + + @lru_cache(1000) + def _render_jinja(u): + return jinja2.Template(u).render(**self.templates) + + for k, v in references.get("refs", {}).items(): + if isinstance(v, str): + if v.startswith("base64:"): + self.references[k] = base64.b64decode(v[7:]) + self.references[k] = v + elif isinstance(v, dict): + self.references[k] = json.dumps(v) + elif self.templates: + u = v[0] + if "{{" in u: + if self.simple_templates: + u = ( + u.replace("{{", "{") + .replace("}}", "}") + .format(**self.templates) + ) + else: + u = _render_jinja(u) + self.references[k] = [u] if len(v) == 1 else [u, v[1], v[2]] + else: + self.references[k] = v + self.references.update(self._process_gen(references.get("gen", []))) + + def _process_templates(self, tmp): + self.templates = {} + if self.template_overrides is not None: + tmp.update(self.template_overrides) + for k, v in tmp.items(): + if "{{" in v: + import jinja2 + + self.templates[k] = lambda temp=v, **kwargs: jinja2.Template( + temp + ).render(**kwargs) + else: + self.templates[k] = v + + def _process_gen(self, gens): + out = {} + for gen in gens: + dimension = { + k: ( + v + if isinstance(v, list) + else range(v.get("start", 0), v["stop"], v.get("step", 1)) + ) + for k, v in gen["dimensions"].items() + } + products = ( + dict(zip(dimension.keys(), values)) + for values in itertools.product(*dimension.values()) + ) + for pr in products: + import jinja2 + + key = jinja2.Template(gen["key"]).render(**pr, **self.templates) + url = jinja2.Template(gen["url"]).render(**pr, **self.templates) + if ("offset" in gen) and ("length" in gen): + offset = int( + jinja2.Template(gen["offset"]).render(**pr, **self.templates) + ) + length = int( + jinja2.Template(gen["length"]).render(**pr, **self.templates) + ) + out[key] = [url, offset, length] + elif ("offset" in gen) ^ ("length" in gen): + raise ValueError( + "Both 'offset' and 'length' are required for a " + "reference generator entry if either is provided." 
+ ) + else: + out[key] = [url] + return out + + def _dircache_from_items(self): + self.dircache = {"": []} + it = self.references.items() + for path, part in it: + if isinstance(part, (bytes, str)) or hasattr(part, "to_bytes"): + size = len(part) + elif len(part) == 1: + size = None + else: + _, _, size = part + par = path.rsplit("/", 1)[0] if "/" in path else "" + par0 = par + subdirs = [par0] + while par0 and par0 not in self.dircache: + # collect parent directories + par0 = self._parent(par0) + subdirs.append(par0) + + subdirs.reverse() + for parent, child in zip(subdirs, subdirs[1:]): + # register newly discovered directories + assert child not in self.dircache + assert parent in self.dircache + self.dircache[parent].append( + {"name": child, "type": "directory", "size": 0} + ) + self.dircache[child] = [] + + self.dircache[par].append({"name": path, "type": "file", "size": size}) + + def _open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs): + part_or_url, start0, end0 = self._cat_common(path) + # This logic is kept outside `ReferenceFile` to avoid unnecessary redirection. + # That does mean `_cat_common` gets called twice if it eventually reaches `ReferenceFile`. + if isinstance(part_or_url, bytes): + return io.BytesIO(part_or_url[start0:end0]) + + protocol, _ = split_protocol(part_or_url) + if start0 is None and end0 is None: + return self.fss[protocol]._open( + part_or_url, + mode, + block_size=block_size, + cache_options=cache_options, + **kwargs, + ) + + return ReferenceFile( + self, + path, + mode, + block_size=block_size, + cache_options=cache_options, + **kwargs, + ) + + def ls(self, path, detail=True, **kwargs): + logger.debug("list %s", path) + path = self._strip_protocol(path) + if isinstance(self.references, LazyReferenceMapper): + try: + return self.references.ls(path, detail) + except KeyError: + pass + raise FileNotFoundError(f"'{path}' is not a known key") + if not self.dircache: + self._dircache_from_items() + out = self._ls_from_cache(path) + if out is None: + raise FileNotFoundError(path) + if detail: + return out + return [o["name"] for o in out] + + def exists(self, path, **kwargs): # overwrite auto-sync version + return self.isdir(path) or self.isfile(path) + + def isdir(self, path): # overwrite auto-sync version + if self.dircache: + return path in self.dircache + elif isinstance(self.references, LazyReferenceMapper): + return path in self.references.listdir() + else: + # this may be faster than building dircache for single calls, but + # by looping will be slow for many calls; could cache it? 
+ return any(_.startswith(f"{path}/") for _ in self.references) + + def isfile(self, path): # overwrite auto-sync version + return path in self.references + + async def _ls(self, path, detail=True, **kwargs): # calls fast sync code + return self.ls(path, detail, **kwargs) + + def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs): + if withdirs: + return super().find( + path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs + ) + if path: + path = self._strip_protocol(path) + r = sorted(k for k in self.references if k.startswith(path)) + else: + r = sorted(self.references) + if detail: + if not self.dircache: + self._dircache_from_items() + return {k: self._ls_from_cache(k)[0] for k in r} + else: + return r + + def info(self, path, **kwargs): + out = self.references.get(path) + if out is not None: + if isinstance(out, (str, bytes)): + # decode base64 here + return {"name": path, "type": "file", "size": len(out)} + elif len(out) > 1: + return {"name": path, "type": "file", "size": out[2]} + else: + out0 = [{"name": path, "type": "file", "size": None}] + else: + out = self.ls(path, True) + out0 = [o for o in out if o["name"] == path] + if not out0: + return {"name": path, "type": "directory", "size": 0} + if out0[0]["size"] is None: + # if this is a whole remote file, update size using remote FS + prot, _ = split_protocol(self.references[path][0]) + out0[0]["size"] = self.fss[prot].size(self.references[path][0]) + return out0[0] + + async def _info(self, path, **kwargs): # calls fast sync code + return self.info(path) + + async def _rm_file(self, path, **kwargs): + self.references.pop( + path, None + ) # ignores FileNotFound, just as well for directories + self.dircache.clear() # this is a bit heavy handed + + async def _pipe_file(self, path, data, mode="overwrite", **kwargs): + if mode == "create" and self.exists(path): + raise FileExistsError + # can be str or bytes + self.references[path] = data + self.dircache.clear() # this is a bit heavy handed + + async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs): + # puts binary + if mode == "create" and self.exists(rpath): + raise FileExistsError + with open(lpath, "rb") as f: + self.references[rpath] = f.read() + self.dircache.clear() # this is a bit heavy handed + + def save_json(self, url, **storage_options): + """Write modified references into new location""" + out = {} + for k, v in self.references.items(): + if isinstance(v, bytes): + try: + out[k] = v.decode("ascii") + except UnicodeDecodeError: + out[k] = (b"base64:" + base64.b64encode(v)).decode() + else: + out[k] = v + with fsspec.open(url, "wb", **storage_options) as f: + f.write(json.dumps({"version": 1, "refs": out}).encode()) + + +class ReferenceFile(AbstractBufferedFile): + def __init__( + self, + fs, + path, + mode="rb", + block_size="default", + autocommit=True, + cache_type="readahead", + cache_options=None, + size=None, + **kwargs, + ): + super().__init__( + fs, + path, + mode=mode, + block_size=block_size, + autocommit=autocommit, + size=size, + cache_type=cache_type, + cache_options=cache_options, + **kwargs, + ) + part_or_url, self.start, self.end = self.fs._cat_common(self.path) + protocol, _ = split_protocol(part_or_url) + self.src_fs = self.fs.fss[protocol] + self.src_path = part_or_url + self._f = None + + @property + def f(self): + if self._f is None or self._f.closed: + self._f = self.src_fs._open( + self.src_path, + mode=self.mode, + block_size=self.blocksize, + autocommit=self.autocommit, + cache_type="none", + 
**self.kwargs, + ) + return self._f + + def close(self): + if self._f is not None: + self._f.close() + return super().close() + + def _fetch_range(self, start, end): + start = start + self.start + end = min(end + self.start, self.end) + self.f.seek(start) + return self.f.read(end - start) diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/sftp.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/sftp.py new file mode 100644 index 0000000000000000000000000000000000000000..77f7b370cd246f9a9bfd34141afc3edd728d13c3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/sftp.py @@ -0,0 +1,180 @@ +import datetime +import logging +import os +import types +import uuid +from stat import S_ISDIR, S_ISLNK + +import paramiko + +from .. import AbstractFileSystem +from ..utils import infer_storage_options + +logger = logging.getLogger("fsspec.sftp") + + +class SFTPFileSystem(AbstractFileSystem): + """Files over SFTP/SSH + + Peer-to-peer filesystem over SSH using paramiko. + + Note: if using this with the ``open`` or ``open_files``, with full URLs, + there is no way to tell if a path is relative, so all paths are assumed + to be absolute. + """ + + protocol = "sftp", "ssh" + + def __init__(self, host, **ssh_kwargs): + """ + + Parameters + ---------- + host: str + Hostname or IP as a string + temppath: str + Location on the server to put files, when within a transaction + ssh_kwargs: dict + Parameters passed on to connection. See details in + https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect + May include port, username, password... + """ + if self._cached: + return + super().__init__(**ssh_kwargs) + self.temppath = ssh_kwargs.pop("temppath", "/tmp") # remote temp directory + self.host = host + self.ssh_kwargs = ssh_kwargs + self._connect() + + def _connect(self): + logger.debug("Connecting to SFTP server %s", self.host) + self.client = paramiko.SSHClient() + self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + self.client.connect(self.host, **self.ssh_kwargs) + self.ftp = self.client.open_sftp() + + @classmethod + def _strip_protocol(cls, path): + return infer_storage_options(path)["path"] + + @staticmethod + def _get_kwargs_from_urls(urlpath): + out = infer_storage_options(urlpath) + out.pop("path", None) + out.pop("protocol", None) + return out + + def mkdir(self, path, create_parents=True, mode=511): + logger.debug("Creating folder %s", path) + if self.exists(path): + raise FileExistsError(f"File exists: {path}") + + if create_parents: + self.makedirs(path) + else: + self.ftp.mkdir(path, mode) + + def makedirs(self, path, exist_ok=False, mode=511): + if self.exists(path) and not exist_ok: + raise FileExistsError(f"File exists: {path}") + + parts = path.split("/") + new_path = "/" if path[:1] == "/" else "" + + for part in parts: + if part: + new_path = f"{new_path}/{part}" if new_path else part + if not self.exists(new_path): + self.ftp.mkdir(new_path, mode) + + def rmdir(self, path): + logger.debug("Removing folder %s", path) + self.ftp.rmdir(path) + + def info(self, path): + stat = self._decode_stat(self.ftp.stat(path)) + stat["name"] = path + return stat + + @staticmethod + def _decode_stat(stat, parent_path=None): + if S_ISDIR(stat.st_mode): + t = "directory" + elif S_ISLNK(stat.st_mode): + t = "link" + else: + t = "file" + out = { + "name": "", + "size": stat.st_size, + "type": t, + "uid": stat.st_uid, + "gid": stat.st_gid, + "time": datetime.datetime.fromtimestamp( + stat.st_atime, 
tz=datetime.timezone.utc + ), + "mtime": datetime.datetime.fromtimestamp( + stat.st_mtime, tz=datetime.timezone.utc + ), + } + if parent_path: + out["name"] = "/".join([parent_path.rstrip("/"), stat.filename]) + return out + + def ls(self, path, detail=False): + logger.debug("Listing folder %s", path) + stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)] + if detail: + return stats + else: + paths = [stat["name"] for stat in stats] + return sorted(paths) + + def put(self, lpath, rpath, callback=None, **kwargs): + logger.debug("Put file %s into %s", lpath, rpath) + self.ftp.put(lpath, rpath) + + def get_file(self, rpath, lpath, **kwargs): + if self.isdir(rpath): + os.makedirs(lpath, exist_ok=True) + else: + self.ftp.get(self._strip_protocol(rpath), lpath) + + def _open(self, path, mode="rb", block_size=None, **kwargs): + """ + block_size: int or None + If 0, no buffering, if 1, line buffering, if >1, buffer that many + bytes, if None use default from paramiko. + """ + logger.debug("Opening file %s", path) + if kwargs.get("autocommit", True) is False: + # writes to temporary file, move on commit + path2 = "/".join([self.temppath, str(uuid.uuid4())]) + f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1) + f.temppath = path2 + f.targetpath = path + f.fs = self + f.commit = types.MethodType(commit_a_file, f) + f.discard = types.MethodType(discard_a_file, f) + else: + f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1) + return f + + def _rm(self, path): + if self.isdir(path): + self.ftp.rmdir(path) + else: + self.ftp.remove(path) + + def mv(self, old, new): + logger.debug("Renaming %s into %s", old, new) + self.ftp.posix_rename(old, new) + + +def commit_a_file(self): + self.fs.mv(self.temppath, self.targetpath) + + +def discard_a_file(self): + self.fs._rm(self.temppath) diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/tar.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/tar.py new file mode 100644 index 0000000000000000000000000000000000000000..412e5ba4d2cdea7db090dc96412e697909a38d78 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/tar.py @@ -0,0 +1,124 @@ +import logging +import tarfile + +import fsspec +from fsspec.archive import AbstractArchiveFileSystem +from fsspec.compression import compr +from fsspec.utils import infer_compression + +typemap = {b"0": "file", b"5": "directory"} + +logger = logging.getLogger("tar") + + +class TarFileSystem(AbstractArchiveFileSystem): + """Compressed Tar archives as a file-system (read-only) + + Supports the following formats: + tar.gz, tar.bz2, tar.xz + """ + + root_marker = "" + protocol = "tar" + cachable = False + + def __init__( + self, + fo="", + index_store=None, + target_options=None, + target_protocol=None, + compression=None, + **kwargs, + ): + super().__init__(**kwargs) + target_options = target_options or {} + + if isinstance(fo, str): + self.of = fsspec.open(fo, protocol=target_protocol, **target_options) + fo = self.of.open() # keep the reference + + # Try to infer compression. + if compression is None: + name = None + + # Try different ways to get hold of the filename. `fo` might either + # be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an + # `fsspec.AbstractFileSystem` instance. + try: + # Amended io.BufferedReader or similar. 
+ # This uses a "protocol extension" where original filenames are + # propagated to archive-like filesystems in order to let them + # infer the right compression appropriately. + if hasattr(fo, "original"): + name = fo.original + + # fsspec.LocalFileOpener + elif hasattr(fo, "path"): + name = fo.path + + # io.BufferedReader + elif hasattr(fo, "name"): + name = fo.name + + # fsspec.AbstractFileSystem + elif hasattr(fo, "info"): + name = fo.info()["name"] + + except Exception as ex: + logger.warning( + f"Unable to determine file name, not inferring compression: {ex}" + ) + + if name is not None: + compression = infer_compression(name) + logger.info(f"Inferred compression {compression} from file name {name}") + + if compression is not None: + # TODO: tarfile already implements compression with modes like "'r:gz'", + # but then would seek to offset in the file work? + fo = compr[compression](fo) + + self._fo_ref = fo + self.fo = fo # the whole instance is a context + self.tar = tarfile.TarFile(fileobj=self.fo) + self.dir_cache = None + + self.index_store = index_store + self.index = None + self._index() + + def _index(self): + # TODO: load and set saved index, if exists + out = {} + for ti in self.tar: + info = ti.get_info() + info["type"] = typemap.get(info["type"], "file") + name = ti.get_info()["name"].rstrip("/") + out[name] = (info, ti.offset_data) + + self.index = out + # TODO: save index to self.index_store here, if set + + def _get_dirs(self): + if self.dir_cache is not None: + return + + # This enables ls to get directories as children as well as files + self.dir_cache = { + dirname: {"name": dirname, "size": 0, "type": "directory"} + for dirname in self._all_dirnames(self.tar.getnames()) + } + for member in self.tar.getmembers(): + info = member.get_info() + info["name"] = info["name"].rstrip("/") + info["type"] = typemap.get(info["type"], "file") + self.dir_cache[info["name"]] = info + + def _open(self, path, mode="rb", **kwargs): + if mode != "rb": + raise ValueError("Read-only filesystem implementation") + details, offset = self.index[path] + if details["type"] != "file": + raise ValueError("Can only handle regular files") + return self.tar.extractfile(path) diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/webhdfs.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/webhdfs.py new file mode 100644 index 0000000000000000000000000000000000000000..c6e0d8446f875ec9578cccf055aeb47cd1c2e996 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/webhdfs.py @@ -0,0 +1,485 @@ +# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html + +import logging +import os +import secrets +import shutil +import tempfile +import uuid +from contextlib import suppress +from urllib.parse import quote + +import requests + +from ..spec import AbstractBufferedFile, AbstractFileSystem +from ..utils import infer_storage_options, tokenize + +logger = logging.getLogger("webhdfs") + + +class WebHDFS(AbstractFileSystem): + """ + Interface to HDFS over HTTP using the WebHDFS API. Supports also HttpFS gateways. + + Four auth mechanisms are supported: + + insecure: no auth is done, and the user is assumed to be whoever they + say they are (parameter ``user``), or a predefined value such as + "dr.who" if not given + spnego: when kerberos authentication is enabled, auth is negotiated by + requests_kerberos https://github.com/requests/requests-kerberos . 
+        This establishes a session based on existing kinit login and/or
+        specified principal/password; parameters are passed with ``kerb_kwargs``
+    token: uses an existing Hadoop delegation token from another secured
+        service. Indeed, this client can also generate such tokens when
+        not insecure. Note that tokens expire, but can be renewed (by a
+        previously specified user) and may allow for proxying.
+    basic-auth: used when both parameter ``user`` and parameter ``password``
+        are provided.
+
+    """
+
+    tempdir = str(tempfile.gettempdir())
+    protocol = "webhdfs", "webHDFS"
+
+    def __init__(
+        self,
+        host,
+        port=50070,
+        kerberos=False,
+        token=None,
+        user=None,
+        password=None,
+        proxy_to=None,
+        kerb_kwargs=None,
+        data_proxy=None,
+        use_https=False,
+        session_cert=None,
+        session_verify=True,
+        **kwargs,
+    ):
+        """
+        Parameters
+        ----------
+        host: str
+            Name-node address
+        port: int
+            Port for webHDFS
+        kerberos: bool
+            Whether to authenticate with kerberos for this connection
+        token: str or None
+            If given, use this token on every call to authenticate. A user
+            and user-proxy may be encoded in the token and should not be also
+            given
+        user: str or None
+            If given, assert the user name to connect with
+        password: str or None
+            If given, assert the password to use for basic auth. If password
+            is provided, user must be provided also
+        proxy_to: str or None
+            If given, the user has the authority to proxy, and this value is
+            the user in whose name actions are taken
+        kerb_kwargs: dict
+            Any extra arguments for HTTPKerberosAuth; see the requests_kerberos
+            project (https://github.com/requests/requests-kerberos)
+        data_proxy: dict, callable or None
+            If given, map data-node addresses. This can be necessary if the
+            HDFS cluster is behind a proxy, running on Docker or otherwise has
+            a mismatch between the host-names given by the name-node and the
+            address by which to refer to them from the client. If a dict,
+            maps host names ``host->data_proxy[host]``; if a callable, full
+            URLs are passed, and function must conform to
+            ``url->data_proxy(url)``.
+        use_https: bool
+            Whether to connect to the Name-node using HTTPS instead of HTTP
+        session_cert: str or Tuple[str, str] or None
+            Path to a certificate file, or tuple of (cert, key) files to use
+            for the requests.Session
+        session_verify: str, bool or None
+            Path to a certificate file to use for verifying the requests.Session.
+        kwargs
+        """
+        if self._cached:
+            return
+        super().__init__(**kwargs)
+        self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"
+        self.kerb = kerberos
+        self.kerb_kwargs = kerb_kwargs or {}
+        self.pars = {}
+        self.proxy = data_proxy or {}
+        if token is not None:
+            if user is not None or proxy_to is not None:
+                raise ValueError(
+                    "If passing a delegation token, must not set "
+                    "user or proxy_to, as these are encoded in the"
+                    " token"
+                )
+            self.pars["delegation"] = token
+        self.user = user
+        self.password = password
+
+        if password is not None:
+            if user is None:
+                raise ValueError(
+                    "If passing a password, the user must also be "
+                    "set in order to set up the basic-auth"
+                )
+        else:
+            if user is not None:
+                self.pars["user.name"] = user
+
+        if proxy_to is not None:
+            self.pars["doas"] = proxy_to
+        if kerberos and user is not None:
+            raise ValueError(
+                "If using Kerberos auth, do not specify the "
+                "user, this is handled by kinit."
+ ) + + self.session_cert = session_cert + self.session_verify = session_verify + + self._connect() + + self._fsid = f"webhdfs_{tokenize(host, port)}" + + @property + def fsid(self): + return self._fsid + + def _connect(self): + self.session = requests.Session() + + if self.session_cert: + self.session.cert = self.session_cert + + self.session.verify = self.session_verify + + if self.kerb: + from requests_kerberos import HTTPKerberosAuth + + self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs) + + if self.user is not None and self.password is not None: + from requests.auth import HTTPBasicAuth + + self.session.auth = HTTPBasicAuth(self.user, self.password) + + def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs): + path = self._strip_protocol(path) if path is not None else "" + url = self._apply_proxy(self.url + quote(path, safe="/=")) + args = kwargs.copy() + args.update(self.pars) + args["op"] = op.upper() + logger.debug("sending %s with %s", url, method) + out = self.session.request( + method=method.upper(), + url=url, + params=args, + data=data, + allow_redirects=redirect, + ) + if out.status_code in [400, 401, 403, 404, 500]: + try: + err = out.json() + msg = err["RemoteException"]["message"] + exp = err["RemoteException"]["exception"] + except (ValueError, KeyError): + pass + else: + if exp in ["IllegalArgumentException", "UnsupportedOperationException"]: + raise ValueError(msg) + elif exp in ["SecurityException", "AccessControlException"]: + raise PermissionError(msg) + elif exp in ["FileNotFoundException"]: + raise FileNotFoundError(msg) + else: + raise RuntimeError(msg) + out.raise_for_status() + return out + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + replication=None, + permissions=None, + **kwargs, + ): + """ + + Parameters + ---------- + path: str + File location + mode: str + 'rb', 'wb', etc. 
+ block_size: int + Client buffer size for read-ahead or write buffer + autocommit: bool + If False, writes to temporary file that only gets put in final + location upon commit + replication: int + Number of copies of file on the cluster, write mode only + permissions: str or int + posix permissions, write mode only + kwargs + + Returns + ------- + WebHDFile instance + """ + block_size = block_size or self.blocksize + return WebHDFile( + self, + path, + mode=mode, + block_size=block_size, + tempdir=self.tempdir, + autocommit=autocommit, + replication=replication, + permissions=permissions, + ) + + @staticmethod + def _process_info(info): + info["type"] = info["type"].lower() + info["size"] = info["length"] + return info + + @classmethod + def _strip_protocol(cls, path): + return infer_storage_options(path)["path"] + + @staticmethod + def _get_kwargs_from_urls(urlpath): + out = infer_storage_options(urlpath) + out.pop("path", None) + out.pop("protocol", None) + if "username" in out: + out["user"] = out.pop("username") + return out + + def info(self, path): + out = self._call("GETFILESTATUS", path=path) + info = out.json()["FileStatus"] + info["name"] = path + return self._process_info(info) + + def ls(self, path, detail=False): + out = self._call("LISTSTATUS", path=path) + infos = out.json()["FileStatuses"]["FileStatus"] + for info in infos: + self._process_info(info) + info["name"] = path.rstrip("/") + "/" + info["pathSuffix"] + if detail: + return sorted(infos, key=lambda i: i["name"]) + else: + return sorted(info["name"] for info in infos) + + def content_summary(self, path): + """Total numbers of files, directories and bytes under path""" + out = self._call("GETCONTENTSUMMARY", path=path) + return out.json()["ContentSummary"] + + def ukey(self, path): + """Checksum info of file, giving method and result""" + out = self._call("GETFILECHECKSUM", path=path, redirect=False) + if "Location" in out.headers: + location = self._apply_proxy(out.headers["Location"]) + out2 = self.session.get(location) + out2.raise_for_status() + return out2.json()["FileChecksum"] + else: + out.raise_for_status() + return out.json()["FileChecksum"] + + def home_directory(self): + """Get user's home directory""" + out = self._call("GETHOMEDIRECTORY") + return out.json()["Path"] + + def get_delegation_token(self, renewer=None): + """Retrieve token which can give the same authority to other uses + + Parameters + ---------- + renewer: str or None + User who may use this token; if None, will be current user + """ + if renewer: + out = self._call("GETDELEGATIONTOKEN", renewer=renewer) + else: + out = self._call("GETDELEGATIONTOKEN") + t = out.json()["Token"] + if t is None: + raise ValueError("No token available for this user/security context") + return t["urlString"] + + def renew_delegation_token(self, token): + """Make token live longer. 
Returns new expiry time""" + out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token) + return out.json()["long"] + + def cancel_delegation_token(self, token): + """Stop the token from being useful""" + self._call("CANCELDELEGATIONTOKEN", method="put", token=token) + + def chmod(self, path, mod): + """Set the permission at path + + Parameters + ---------- + path: str + location to set (file or directory) + mod: str or int + posix epresentation or permission, give as oct string, e.g, '777' + or 0o777 + """ + self._call("SETPERMISSION", method="put", path=path, permission=mod) + + def chown(self, path, owner=None, group=None): + """Change owning user and/or group""" + kwargs = {} + if owner is not None: + kwargs["owner"] = owner + if group is not None: + kwargs["group"] = group + self._call("SETOWNER", method="put", path=path, **kwargs) + + def set_replication(self, path, replication): + """ + Set file replication factor + + Parameters + ---------- + path: str + File location (not for directories) + replication: int + Number of copies of file on the cluster. Should be smaller than + number of data nodes; normally 3 on most systems. + """ + self._call("SETREPLICATION", path=path, method="put", replication=replication) + + def mkdir(self, path, **kwargs): + self._call("MKDIRS", method="put", path=path) + + def makedirs(self, path, exist_ok=False): + if exist_ok is False and self.exists(path): + raise FileExistsError(path) + self.mkdir(path) + + def mv(self, path1, path2, **kwargs): + self._call("RENAME", method="put", path=path1, destination=path2) + + def rm(self, path, recursive=False, **kwargs): + self._call( + "DELETE", + method="delete", + path=path, + recursive="true" if recursive else "false", + ) + + def rm_file(self, path, **kwargs): + self.rm(path) + + def cp_file(self, lpath, rpath, **kwargs): + with self.open(lpath) as lstream: + tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"]) + # Perform an atomic copy (stream to a temporary file and + # move it to the actual destination). + try: + with self.open(tmp_fname, "wb") as rstream: + shutil.copyfileobj(lstream, rstream) + self.mv(tmp_fname, rpath) + except BaseException: + with suppress(FileNotFoundError): + self.rm(tmp_fname) + raise + + def _apply_proxy(self, location): + if self.proxy and callable(self.proxy): + location = self.proxy(location) + elif self.proxy: + # as a dict + for k, v in self.proxy.items(): + location = location.replace(k, v, 1) + return location + + +class WebHDFile(AbstractBufferedFile): + """A file living in HDFS over webHDFS""" + + def __init__(self, fs, path, **kwargs): + super().__init__(fs, path, **kwargs) + kwargs = kwargs.copy() + if kwargs.get("permissions", None) is None: + kwargs.pop("permissions", None) + if kwargs.get("replication", None) is None: + kwargs.pop("replication", None) + self.permissions = kwargs.pop("permissions", 511) + tempdir = kwargs.pop("tempdir") + if kwargs.pop("autocommit", False) is False: + self.target = self.path + self.path = os.path.join(tempdir, str(uuid.uuid4())) + + def _upload_chunk(self, final=False): + """Write one part of a multi-block file upload + + Parameters + ========== + final: bool + This is the last block, so should complete file, if + self.autocommit is True. 
+ """ + out = self.fs.session.post( + self.location, + data=self.buffer.getvalue(), + headers={"content-type": "application/octet-stream"}, + ) + out.raise_for_status() + return True + + def _initiate_upload(self): + """Create remote file/upload""" + kwargs = self.kwargs.copy() + if "a" in self.mode: + op, method = "APPEND", "POST" + else: + op, method = "CREATE", "PUT" + kwargs["overwrite"] = "true" + out = self.fs._call(op, method, self.path, redirect=False, **kwargs) + location = self.fs._apply_proxy(out.headers["Location"]) + if "w" in self.mode: + # create empty file to append to + out2 = self.fs.session.put( + location, headers={"content-type": "application/octet-stream"} + ) + out2.raise_for_status() + # after creating empty file, change location to append to + out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs) + self.location = self.fs._apply_proxy(out2.headers["Location"]) + + def _fetch_range(self, start, end): + start = max(start, 0) + end = min(self.size, end) + if start >= end or start >= self.size: + return b"" + out = self.fs._call( + "OPEN", path=self.path, offset=start, length=end - start, redirect=False + ) + out.raise_for_status() + if "Location" in out.headers: + location = out.headers["Location"] + out2 = self.fs.session.get(self.fs._apply_proxy(location)) + return out2.content + else: + return out.content + + def commit(self): + self.fs.mv(self.path, self.target) + + def discard(self): + self.fs.rm(self.path) diff --git a/.venv/lib/python3.11/site-packages/fsspec/json.py b/.venv/lib/python3.11/site-packages/fsspec/json.py new file mode 100644 index 0000000000000000000000000000000000000000..69cead04509a1ebc9175ec5ad449c8a866b60444 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/json.py @@ -0,0 +1,121 @@ +import json +from contextlib import suppress +from pathlib import PurePath +from typing import ( + Any, + Callable, + ClassVar, + Dict, + List, + Mapping, + Optional, + Sequence, + Tuple, +) + +from .registry import _import_class, get_filesystem_class +from .spec import AbstractFileSystem + + +class FilesystemJSONEncoder(json.JSONEncoder): + include_password: ClassVar[bool] = True + + def default(self, o: Any) -> Any: + if isinstance(o, AbstractFileSystem): + return o.to_dict(include_password=self.include_password) + if isinstance(o, PurePath): + cls = type(o) + return {"cls": f"{cls.__module__}.{cls.__name__}", "str": str(o)} + + return super().default(o) + + def make_serializable(self, obj: Any) -> Any: + """ + Recursively converts an object so that it can be JSON serialized via + :func:`json.dumps` and :func:`json.dump`, without actually calling + said functions. 
+ """ + if isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, Mapping): + return {k: self.make_serializable(v) for k, v in obj.items()} + if isinstance(obj, Sequence): + return [self.make_serializable(v) for v in obj] + + return self.default(obj) + + +class FilesystemJSONDecoder(json.JSONDecoder): + def __init__( + self, + *, + object_hook: Optional[Callable[[Dict[str, Any]], Any]] = None, + parse_float: Optional[Callable[[str], Any]] = None, + parse_int: Optional[Callable[[str], Any]] = None, + parse_constant: Optional[Callable[[str], Any]] = None, + strict: bool = True, + object_pairs_hook: Optional[Callable[[List[Tuple[str, Any]]], Any]] = None, + ) -> None: + self.original_object_hook = object_hook + + super().__init__( + object_hook=self.custom_object_hook, + parse_float=parse_float, + parse_int=parse_int, + parse_constant=parse_constant, + strict=strict, + object_pairs_hook=object_pairs_hook, + ) + + @classmethod + def try_resolve_path_cls(cls, dct: Dict[str, Any]): + with suppress(Exception): + fqp = dct["cls"] + + path_cls = _import_class(fqp) + + if issubclass(path_cls, PurePath): + return path_cls + + return None + + @classmethod + def try_resolve_fs_cls(cls, dct: Dict[str, Any]): + with suppress(Exception): + if "cls" in dct: + try: + fs_cls = _import_class(dct["cls"]) + if issubclass(fs_cls, AbstractFileSystem): + return fs_cls + except Exception: + if "protocol" in dct: # Fallback if cls cannot be imported + return get_filesystem_class(dct["protocol"]) + + raise + + return None + + def custom_object_hook(self, dct: Dict[str, Any]): + if "cls" in dct: + if (obj_cls := self.try_resolve_fs_cls(dct)) is not None: + return AbstractFileSystem.from_dict(dct) + if (obj_cls := self.try_resolve_path_cls(dct)) is not None: + return obj_cls(dct["str"]) + + if self.original_object_hook is not None: + return self.original_object_hook(dct) + + return dct + + def unmake_serializable(self, obj: Any) -> Any: + """ + Inverse function of :meth:`FilesystemJSONEncoder.make_serializable`. + """ + if isinstance(obj, dict): + obj = self.custom_object_hook(obj) + if isinstance(obj, dict): + return {k: self.unmake_serializable(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return [self.unmake_serializable(v) for v in obj] + + return obj diff --git a/.venv/lib/python3.11/site-packages/fsspec/mapping.py b/.venv/lib/python3.11/site-packages/fsspec/mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..752eef35273b13eded7297e2e801b58e436a25b1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/mapping.py @@ -0,0 +1,251 @@ +import array +import logging +import posixpath +import warnings +from collections.abc import MutableMapping +from functools import cached_property + +from fsspec.core import url_to_fs + +logger = logging.getLogger("fsspec.mapping") + + +class FSMap(MutableMapping): + """Wrap a FileSystem instance as a mutable wrapping. + + The keys of the mapping become files under the given root, and the + values (which must be bytes) the contents of those files. + + Parameters + ---------- + root: string + prefix for all the files + fs: FileSystem instance + check: bool (=True) + performs a touch at the location, to check for write access. 
+ + Examples + -------- + >>> fs = FileSystem(**parameters) # doctest: +SKIP + >>> d = FSMap('my-data/path/', fs) # doctest: +SKIP + or, more likely + >>> d = fs.get_mapper('my-data/path/') + + >>> d['loc1'] = b'Hello World' # doctest: +SKIP + >>> list(d.keys()) # doctest: +SKIP + ['loc1'] + >>> d['loc1'] # doctest: +SKIP + b'Hello World' + """ + + def __init__(self, root, fs, check=False, create=False, missing_exceptions=None): + self.fs = fs + self.root = fs._strip_protocol(root) + self._root_key_to_str = fs._strip_protocol(posixpath.join(root, "x"))[:-1] + if missing_exceptions is None: + missing_exceptions = ( + FileNotFoundError, + IsADirectoryError, + NotADirectoryError, + ) + self.missing_exceptions = missing_exceptions + self.check = check + self.create = create + if create: + if not self.fs.exists(root): + self.fs.mkdir(root) + if check: + if not self.fs.exists(root): + raise ValueError( + f"Path {root} does not exist. Create " + f" with the ``create=True`` keyword" + ) + self.fs.touch(root + "/a") + self.fs.rm(root + "/a") + + @cached_property + def dirfs(self): + """dirfs instance that can be used with the same keys as the mapper""" + from .implementations.dirfs import DirFileSystem + + return DirFileSystem(path=self._root_key_to_str, fs=self.fs) + + def clear(self): + """Remove all keys below root - empties out mapping""" + logger.info("Clear mapping at %s", self.root) + try: + self.fs.rm(self.root, True) + self.fs.mkdir(self.root) + except: # noqa: E722 + pass + + def getitems(self, keys, on_error="raise"): + """Fetch multiple items from the store + + If the backend is async-able, this might proceed concurrently + + Parameters + ---------- + keys: list(str) + They keys to be fetched + on_error : "raise", "omit", "return" + If raise, an underlying exception will be raised (converted to KeyError + if the type is in self.missing_exceptions); if omit, keys with exception + will simply not be included in the output; if "return", all keys are + included in the output, but the value will be bytes or an exception + instance. 
+ + Returns + ------- + dict(key, bytes|exception) + """ + keys2 = [self._key_to_str(k) for k in keys] + oe = on_error if on_error == "raise" else "return" + try: + out = self.fs.cat(keys2, on_error=oe) + if isinstance(out, bytes): + out = {keys2[0]: out} + except self.missing_exceptions as e: + raise KeyError from e + out = { + k: (KeyError() if isinstance(v, self.missing_exceptions) else v) + for k, v in out.items() + } + return { + key: out[k2] if on_error == "raise" else out.get(k2, KeyError(k2)) + for key, k2 in zip(keys, keys2) + if on_error == "return" or not isinstance(out[k2], BaseException) + } + + def setitems(self, values_dict): + """Set the values of multiple items in the store + + Parameters + ---------- + values_dict: dict(str, bytes) + """ + values = {self._key_to_str(k): maybe_convert(v) for k, v in values_dict.items()} + self.fs.pipe(values) + + def delitems(self, keys): + """Remove multiple keys from the store""" + self.fs.rm([self._key_to_str(k) for k in keys]) + + def _key_to_str(self, key): + """Generate full path for the key""" + if not isinstance(key, str): + # raise TypeError("key must be of type `str`, got `{type(key).__name__}`" + warnings.warn( + "from fsspec 2023.5 onward FSMap non-str keys will raise TypeError", + DeprecationWarning, + ) + if isinstance(key, list): + key = tuple(key) + key = str(key) + return f"{self._root_key_to_str}{key}".rstrip("/") + + def _str_to_key(self, s): + """Strip path of to leave key name""" + return s[len(self.root) :].lstrip("/") + + def __getitem__(self, key, default=None): + """Retrieve data""" + k = self._key_to_str(key) + try: + result = self.fs.cat(k) + except self.missing_exceptions as exc: + if default is not None: + return default + raise KeyError(key) from exc + return result + + def pop(self, key, default=None): + """Pop data""" + result = self.__getitem__(key, default) + try: + del self[key] + except KeyError: + pass + return result + + def __setitem__(self, key, value): + """Store value in key""" + key = self._key_to_str(key) + self.fs.mkdirs(self.fs._parent(key), exist_ok=True) + self.fs.pipe_file(key, maybe_convert(value)) + + def __iter__(self): + return (self._str_to_key(x) for x in self.fs.find(self.root)) + + def __len__(self): + return len(self.fs.find(self.root)) + + def __delitem__(self, key): + """Remove key""" + try: + self.fs.rm(self._key_to_str(key)) + except Exception as exc: + raise KeyError from exc + + def __contains__(self, key): + """Does key exist in mapping?""" + path = self._key_to_str(key) + return self.fs.isfile(path) + + def __reduce__(self): + return FSMap, (self.root, self.fs, False, False, self.missing_exceptions) + + +def maybe_convert(value): + if isinstance(value, array.array) or hasattr(value, "__array__"): + # bytes-like things + if hasattr(value, "dtype") and value.dtype.kind in "Mm": + # The buffer interface doesn't support datetime64/timdelta64 numpy + # arrays + value = value.view("int64") + value = bytes(memoryview(value)) + return value + + +def get_mapper( + url="", + check=False, + create=False, + missing_exceptions=None, + alternate_root=None, + **kwargs, +): + """Create key-value interface for given URL and options + + The URL will be of the form "protocol://location" and point to the root + of the mapper required. All keys will be file-names below this location, + and their values the contents of each key. + + Also accepts compound URLs like zip::s3://bucket/file.zip , see ``fsspec.open``. 
+ + Parameters + ---------- + url: str + Root URL of mapping + check: bool + Whether to attempt to read from the location before instantiation, to + check that the mapping does exist + create: bool + Whether to make the directory corresponding to the root before + instantiating + missing_exceptions: None or tuple + If given, these exception types will be regarded as missing keys and + return KeyError when trying to read data. By default, you get + (FileNotFoundError, IsADirectoryError, NotADirectoryError) + alternate_root: None or str + In cases of complex URLs, the parser may fail to pick the correct part + for the mapper root, so this arg can override + + Returns + ------- + ``FSMap`` instance, the dict-like key-value store. + """ + # Removing protocol here - could defer to each open() on the backend + fs, urlpath = url_to_fs(url, **kwargs) + root = alternate_root if alternate_root is not None else urlpath + return FSMap(root, fs, check, create, missing_exceptions=missing_exceptions) diff --git a/.venv/lib/python3.11/site-packages/fsspec/parquet.py b/.venv/lib/python3.11/site-packages/fsspec/parquet.py new file mode 100644 index 0000000000000000000000000000000000000000..faedb7b9e0aa90b6fb7cba33e794c4b4fb35eb77 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/parquet.py @@ -0,0 +1,541 @@ +import io +import json +import warnings + +from .core import url_to_fs +from .utils import merge_offset_ranges + +# Parquet-Specific Utilities for fsspec +# +# Most of the functions defined in this module are NOT +# intended for public consumption. The only exception +# to this is `open_parquet_file`, which should be used +# place of `fs.open()` to open parquet-formatted files +# on remote file systems. + + +def open_parquet_file( + path, + mode="rb", + fs=None, + metadata=None, + columns=None, + row_groups=None, + storage_options=None, + strict=False, + engine="auto", + max_gap=64_000, + max_block=256_000_000, + footer_sample_size=1_000_000, + **kwargs, +): + """ + Return a file-like object for a single Parquet file. + + The specified parquet `engine` will be used to parse the + footer metadata, and determine the required byte ranges + from the file. The target path will then be opened with + the "parts" (`KnownPartsOfAFile`) caching strategy. + + Note that this method is intended for usage with remote + file systems, and is unlikely to improve parquet-read + performance on local file systems. + + Parameters + ---------- + path: str + Target file path. + mode: str, optional + Mode option to be passed through to `fs.open`. Default is "rb". + metadata: Any, optional + Parquet metadata object. Object type must be supported + by the backend parquet engine. For now, only the "fastparquet" + engine supports an explicit `ParquetFile` metadata object. + If a metadata object is supplied, the remote footer metadata + will not need to be transferred into local memory. + fs: AbstractFileSystem, optional + Filesystem object to use for opening the file. If nothing is + specified, an `AbstractFileSystem` object will be inferred. + engine : str, default "auto" + Parquet engine to use for metadata parsing. Allowed options + include "fastparquet", "pyarrow", and "auto". The specified + engine must be installed in the current environment. If + "auto" is specified, and both engines are installed, + "fastparquet" will take precedence over "pyarrow". + columns: list, optional + List of all column names that may be read from the file. 
+ row_groups : list, optional + List of all row-groups that may be read from the file. This + may be a list of row-group indices (integers), or it may be + a list of `RowGroup` metadata objects (if the "fastparquet" + engine is used). + storage_options : dict, optional + Used to generate an `AbstractFileSystem` object if `fs` was + not specified. + strict : bool, optional + Whether the resulting `KnownPartsOfAFile` cache should + fetch reads that go beyond a known byte-range boundary. + If `False` (the default), any read that ends outside a + known part will be zero padded. Note that using + `strict=True` may be useful for debugging. + max_gap : int, optional + Neighboring byte ranges will only be merged when their + inter-range gap is <= `max_gap`. Default is 64KB. + max_block : int, optional + Neighboring byte ranges will only be merged when the size of + the aggregated range is <= `max_block`. Default is 256MB. + footer_sample_size : int, optional + Number of bytes to read from the end of the path to look + for the footer metadata. If the sampled bytes do not contain + the footer, a second read request will be required, and + performance will suffer. Default is 1MB. + **kwargs : + Optional key-word arguments to pass to `fs.open` + """ + + # Make sure we have an `AbstractFileSystem` object + # to work with + if fs is None: + fs = url_to_fs(path, **(storage_options or {}))[0] + + # For now, `columns == []` not supported. Just use + # default `open` command with `path` input + if columns is not None and len(columns) == 0: + return fs.open(path, mode=mode) + + # Set the engine + engine = _set_engine(engine) + + # Fetch the known byte ranges needed to read + # `columns` and/or `row_groups` + data = _get_parquet_byte_ranges( + [path], + fs, + metadata=metadata, + columns=columns, + row_groups=row_groups, + engine=engine, + max_gap=max_gap, + max_block=max_block, + footer_sample_size=footer_sample_size, + ) + + # Extract file name from `data` + fn = next(iter(data)) if data else path + + # Call self.open with "parts" caching + options = kwargs.pop("cache_options", {}).copy() + return fs.open( + fn, + mode=mode, + cache_type="parts", + cache_options={ + **options, + "data": data.get(fn, {}), + "strict": strict, + }, + **kwargs, + ) + + +def _get_parquet_byte_ranges( + paths, + fs, + metadata=None, + columns=None, + row_groups=None, + max_gap=64_000, + max_block=256_000_000, + footer_sample_size=1_000_000, + engine="auto", +): + """Get a dictionary of the known byte ranges needed + to read a specific column/row-group selection from a + Parquet dataset. Each value in the output dictionary + is intended for use as the `data` argument for the + `KnownPartsOfAFile` caching strategy of a single path. + """ + + # Set engine if necessary + if isinstance(engine, str): + engine = _set_engine(engine) + + # Pass to specialized function if metadata is defined + if metadata is not None: + # Use the provided parquet metadata object + # to avoid transferring/parsing footer metadata + return _get_parquet_byte_ranges_from_metadata( + metadata, + fs, + engine, + columns=columns, + row_groups=row_groups, + max_gap=max_gap, + max_block=max_block, + ) + + # Get file sizes asynchronously + file_sizes = fs.sizes(paths) + + # Populate global paths, starts, & ends + result = {} + data_paths = [] + data_starts = [] + data_ends = [] + add_header_magic = True + if columns is None and row_groups is None: + # We are NOT selecting specific columns or row-groups. 
+ # + # We can avoid sampling the footers, and just transfer + # all file data with cat_ranges + for i, path in enumerate(paths): + result[path] = {} + for b in range(0, file_sizes[i], max_block): + data_paths.append(path) + data_starts.append(b) + data_ends.append(min(b + max_block, file_sizes[i])) + add_header_magic = False # "Magic" should already be included + else: + # We ARE selecting specific columns or row-groups. + # + # Gather file footers. + # We just take the last `footer_sample_size` bytes of each + # file (or the entire file if it is smaller than that) + footer_starts = [] + footer_ends = [] + for i, path in enumerate(paths): + footer_ends.append(file_sizes[i]) + sample_size = max(0, file_sizes[i] - footer_sample_size) + footer_starts.append(sample_size) + footer_samples = fs.cat_ranges(paths, footer_starts, footer_ends) + + # Check our footer samples and re-sample if necessary. + missing_footer_starts = footer_starts.copy() + large_footer = 0 + for i, path in enumerate(paths): + footer_size = int.from_bytes(footer_samples[i][-8:-4], "little") + real_footer_start = file_sizes[i] - (footer_size + 8) + if real_footer_start < footer_starts[i]: + missing_footer_starts[i] = real_footer_start + large_footer = max(large_footer, (footer_size + 8)) + if large_footer: + warnings.warn( + f"Not enough data was used to sample the parquet footer. " + f"Try setting footer_sample_size >= {large_footer}." + ) + for i, block in enumerate( + fs.cat_ranges( + paths, + missing_footer_starts, + footer_starts, + ) + ): + footer_samples[i] = block + footer_samples[i] + footer_starts[i] = missing_footer_starts[i] + + # Calculate required byte ranges for each path + for i, path in enumerate(paths): + # Deal with small-file case. + # Just include all remaining bytes of the file + # in a single range. + if file_sizes[i] < max_block: + if footer_starts[i] > 0: + # Only need to transfer the data if the + # footer sample isn't already the whole file + data_paths.append(path) + data_starts.append(0) + data_ends.append(footer_starts[i]) + continue + + # Use "engine" to collect data byte ranges + path_data_starts, path_data_ends = engine._parquet_byte_ranges( + columns, + row_groups=row_groups, + footer=footer_samples[i], + footer_start=footer_starts[i], + ) + + data_paths += [path] * len(path_data_starts) + data_starts += path_data_starts + data_ends += path_data_ends + + # Merge adjacent offset ranges + data_paths, data_starts, data_ends = merge_offset_ranges( + data_paths, + data_starts, + data_ends, + max_gap=max_gap, + max_block=max_block, + sort=False, # Should already be sorted + ) + + # Start by populating `result` with footer samples + for i, path in enumerate(paths): + result[path] = {(footer_starts[i], footer_ends[i]): footer_samples[i]} + + # Transfer the data byte-ranges into local memory + _transfer_ranges(fs, result, data_paths, data_starts, data_ends) + + # Add b"PAR1" to header if necessary + if add_header_magic: + _add_header_magic(result) + + return result + + +def _get_parquet_byte_ranges_from_metadata( + metadata, + fs, + engine, + columns=None, + row_groups=None, + max_gap=64_000, + max_block=256_000_000, +): + """Simplified version of `_get_parquet_byte_ranges` for + the case that an engine-specific `metadata` object is + provided, and the remote footer metadata does not need to + be transferred before calculating the required byte ranges. 
+    """
+
+    # Use "engine" to collect data byte ranges
+    data_paths, data_starts, data_ends = engine._parquet_byte_ranges(
+        columns,
+        row_groups=row_groups,
+        metadata=metadata,
+    )
+
+    # Merge adjacent offset ranges
+    data_paths, data_starts, data_ends = merge_offset_ranges(
+        data_paths,
+        data_starts,
+        data_ends,
+        max_gap=max_gap,
+        max_block=max_block,
+        sort=False,  # Should be sorted
+    )
+
+    # Transfer the data byte-ranges into local memory
+    result = {fn: {} for fn in list(set(data_paths))}
+    _transfer_ranges(fs, result, data_paths, data_starts, data_ends)
+
+    # Add b"PAR1" to header
+    _add_header_magic(result)
+
+    return result
+
+
+def _transfer_ranges(fs, blocks, paths, starts, ends):
+    # Use cat_ranges to gather the data byte_ranges
+    ranges = (paths, starts, ends)
+    for path, start, stop, data in zip(*ranges, fs.cat_ranges(*ranges)):
+        blocks[path][(start, stop)] = data
+
+
+def _add_header_magic(data):
+    # Add b"PAR1" to file headers
+    for path in list(data.keys()):
+        add_magic = True
+        for k in data[path]:
+            if k[0] == 0 and k[1] >= 4:
+                add_magic = False
+                break
+        if add_magic:
+            data[path][(0, 4)] = b"PAR1"
+
+
+def _set_engine(engine_str):
+    # Define a list of parquet engines to try
+    if engine_str == "auto":
+        try_engines = ("fastparquet", "pyarrow")
+    elif not isinstance(engine_str, str):
+        raise ValueError(
+            "Failed to set parquet engine! "
+            "Please pass 'fastparquet', 'pyarrow', or 'auto'"
+        )
+    elif engine_str not in ("fastparquet", "pyarrow"):
+        raise ValueError(f"{engine_str} engine not supported by `fsspec.parquet`")
+    else:
+        try_engines = [engine_str]
+
+    # Try importing the engines in `try_engines`,
+    # and choose the first one that succeeds
+    for engine in try_engines:
+        try:
+            if engine == "fastparquet":
+                return FastparquetEngine()
+            elif engine == "pyarrow":
+                return PyarrowEngine()
+        except ImportError:
+            pass
+
+    # Raise an error if a supported parquet engine
+    # was not found
+    raise ImportError(
+        f"The following parquet engines are not installed "
+        f"in your python environment: {try_engines}. "
+        f"Please install 'fastparquet' or 'pyarrow' to "
+        f"utilize the `fsspec.parquet` module."
+    )
+
+
+class FastparquetEngine:
+    # The purpose of the FastparquetEngine class is
+    # to check if fastparquet can be imported (on initialization)
+    # and to define a `_parquet_byte_ranges` method. In the
+    # future, this class may also be used to define other
+    # methods/logic that are specific to fastparquet.
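+    #
+    # A minimal usage sketch of the public entry point defined above,
+    # `fsspec.parquet.open_parquet_file`, which selects this engine when
+    # fastparquet is installed (and by default under engine="auto"). The
+    # S3 URL and column names are hypothetical placeholders, and reading
+    # such a path would additionally require s3fs:
+    #
+    #     from fsspec.parquet import open_parquet_file
+    #
+    #     with open_parquet_file(
+    #         "s3://example-bucket/data.parquet",  # hypothetical path
+    #         columns=["a", "b"],
+    #         engine="fastparquet",
+    #     ) as f:
+    #         ...  # hand `f` to a parquet reader; reads hit the "parts" cache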
+ + def __init__(self): + import fastparquet as fp + + self.fp = fp + + def _row_group_filename(self, row_group, pf): + return pf.row_group_filename(row_group) + + def _parquet_byte_ranges( + self, + columns, + row_groups=None, + metadata=None, + footer=None, + footer_start=None, + ): + # Initialize offset ranges and define ParqetFile metadata + pf = metadata + data_paths, data_starts, data_ends = [], [], [] + if pf is None: + pf = self.fp.ParquetFile(io.BytesIO(footer)) + + # Convert columns to a set and add any index columns + # specified in the pandas metadata (just in case) + column_set = None if columns is None else set(columns) + if column_set is not None and hasattr(pf, "pandas_metadata"): + md_index = [ + ind + for ind in pf.pandas_metadata.get("index_columns", []) + # Ignore RangeIndex information + if not isinstance(ind, dict) + ] + column_set |= set(md_index) + + # Check if row_groups is a list of integers + # or a list of row-group metadata + if row_groups and not isinstance(row_groups[0], int): + # Input row_groups contains row-group metadata + row_group_indices = None + else: + # Input row_groups contains row-group indices + row_group_indices = row_groups + row_groups = pf.row_groups + + # Loop through column chunks to add required byte ranges + for r, row_group in enumerate(row_groups): + # Skip this row-group if we are targeting + # specific row-groups + if row_group_indices is None or r in row_group_indices: + # Find the target parquet-file path for `row_group` + fn = self._row_group_filename(row_group, pf) + + for column in row_group.columns: + name = column.meta_data.path_in_schema[0] + # Skip this column if we are targeting a + # specific columns + if column_set is None or name in column_set: + file_offset0 = column.meta_data.dictionary_page_offset + if file_offset0 is None: + file_offset0 = column.meta_data.data_page_offset + num_bytes = column.meta_data.total_compressed_size + if footer_start is None or file_offset0 < footer_start: + data_paths.append(fn) + data_starts.append(file_offset0) + data_ends.append( + min( + file_offset0 + num_bytes, + footer_start or (file_offset0 + num_bytes), + ) + ) + + if metadata: + # The metadata in this call may map to multiple + # file paths. Need to include `data_paths` + return data_paths, data_starts, data_ends + return data_starts, data_ends + + +class PyarrowEngine: + # The purpose of the PyarrowEngine class is + # to check if pyarrow can be imported (on initialization) + # and to define a `_parquet_byte_ranges` method. In the + # future, this class may also be used to define other + # methods/logic that are specific to pyarrow. 
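+    #
+    # A minimal sketch of reaching this engine instead by requesting
+    # ``engine="pyarrow"`` from `open_parquet_file`; the HTTPS URL and the
+    # row-group selection are hypothetical placeholders:
+    #
+    #     from fsspec.parquet import open_parquet_file
+    #
+    #     with open_parquet_file(
+    #         "https://example.org/data.parquet",  # hypothetical path
+    #         row_groups=[0],
+    #         engine="pyarrow",
+    #     ) as f:
+    #         ...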
+ + def __init__(self): + import pyarrow.parquet as pq + + self.pq = pq + + def _row_group_filename(self, row_group, metadata): + raise NotImplementedError + + def _parquet_byte_ranges( + self, + columns, + row_groups=None, + metadata=None, + footer=None, + footer_start=None, + ): + if metadata is not None: + raise ValueError("metadata input not supported for PyarrowEngine") + + data_starts, data_ends = [], [] + md = self.pq.ParquetFile(io.BytesIO(footer)).metadata + + # Convert columns to a set and add any index columns + # specified in the pandas metadata (just in case) + column_set = None if columns is None else set(columns) + if column_set is not None: + schema = md.schema.to_arrow_schema() + has_pandas_metadata = ( + schema.metadata is not None and b"pandas" in schema.metadata + ) + if has_pandas_metadata: + md_index = [ + ind + for ind in json.loads( + schema.metadata[b"pandas"].decode("utf8") + ).get("index_columns", []) + # Ignore RangeIndex information + if not isinstance(ind, dict) + ] + column_set |= set(md_index) + + # Loop through column chunks to add required byte ranges + for r in range(md.num_row_groups): + # Skip this row-group if we are targeting + # specific row-groups + if row_groups is None or r in row_groups: + row_group = md.row_group(r) + for c in range(row_group.num_columns): + column = row_group.column(c) + name = column.path_in_schema + # Skip this column if we are targeting a + # specific columns + split_name = name.split(".")[0] + if ( + column_set is None + or name in column_set + or split_name in column_set + ): + file_offset0 = column.dictionary_page_offset + if file_offset0 is None: + file_offset0 = column.data_page_offset + num_bytes = column.total_compressed_size + if file_offset0 < footer_start: + data_starts.append(file_offset0) + data_ends.append( + min(file_offset0 + num_bytes, footer_start) + ) + return data_starts, data_ends diff --git a/.venv/lib/python3.11/site-packages/fsspec/registry.py b/.venv/lib/python3.11/site-packages/fsspec/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..5d104f266e273aa45084de869d04737ad1d5d56a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/registry.py @@ -0,0 +1,315 @@ +from __future__ import annotations + +import importlib +import types +import warnings + +__all__ = ["registry", "get_filesystem_class", "default"] + +# internal, mutable +_registry: dict[str, type] = {} + +# external, immutable +registry = types.MappingProxyType(_registry) +default = "file" + + +def register_implementation(name, cls, clobber=False, errtxt=None): + """Add implementation class to the registry + + Parameters + ---------- + name: str + Protocol name to associate with the class + cls: class or str + if a class: fsspec-compliant implementation class (normally inherits from + ``fsspec.AbstractFileSystem``, gets added straight to the registry. If a + str, the full path to an implementation class like package.module.class, + which gets added to known_implementations, + so the import is deferred until the filesystem is actually used. + clobber: bool (optional) + Whether to overwrite a protocol with the same name; if False, will raise + instead. + errtxt: str (optional) + If given, then a failure to import the given class will result in this + text being given. 
+ """ + if isinstance(cls, str): + if name in known_implementations and clobber is False: + if cls != known_implementations[name]["class"]: + raise ValueError( + f"Name ({name}) already in the known_implementations and clobber " + f"is False" + ) + else: + known_implementations[name] = { + "class": cls, + "err": errtxt or f"{cls} import failed for protocol {name}", + } + + else: + if name in registry and clobber is False: + if _registry[name] is not cls: + raise ValueError( + f"Name ({name}) already in the registry and clobber is False" + ) + else: + _registry[name] = cls + + +# protocols mapped to the class which implements them. This dict can be +# updated with register_implementation +known_implementations = { + "abfs": { + "class": "adlfs.AzureBlobFileSystem", + "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage", + }, + "adl": { + "class": "adlfs.AzureDatalakeFileSystem", + "err": "Install adlfs to access Azure Datalake Gen1", + }, + "arrow_hdfs": { + "class": "fsspec.implementations.arrow.HadoopFileSystem", + "err": "pyarrow and local java libraries required for HDFS", + }, + "asynclocal": { + "class": "morefs.asyn_local.AsyncLocalFileSystem", + "err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem", + }, + "az": { + "class": "adlfs.AzureBlobFileSystem", + "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage", + }, + "blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"}, + "box": { + "class": "boxfs.BoxFileSystem", + "err": "Please install boxfs to access BoxFileSystem", + }, + "cached": {"class": "fsspec.implementations.cached.CachingFileSystem"}, + "dask": { + "class": "fsspec.implementations.dask.DaskWorkerFileSystem", + "err": "Install dask distributed to access worker file system", + }, + "data": {"class": "fsspec.implementations.data.DataFileSystem"}, + "dbfs": { + "class": "fsspec.implementations.dbfs.DatabricksFileSystem", + "err": "Install the requests package to use the DatabricksFileSystem", + }, + "dir": {"class": "fsspec.implementations.dirfs.DirFileSystem"}, + "dropbox": { + "class": "dropboxdrivefs.DropboxDriveFileSystem", + "err": ( + 'DropboxFileSystem requires "dropboxdrivefs","requests" and "' + '"dropbox" to be installed' + ), + }, + "dvc": { + "class": "dvc.api.DVCFileSystem", + "err": "Install dvc to access DVCFileSystem", + }, + "file": {"class": "fsspec.implementations.local.LocalFileSystem"}, + "filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"}, + "ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"}, + "gcs": { + "class": "gcsfs.GCSFileSystem", + "err": "Please install gcsfs to access Google Storage", + }, + "gdrive": { + "class": "gdrivefs.GoogleDriveFileSystem", + "err": "Please install gdrivefs for access to Google Drive", + }, + "generic": {"class": "fsspec.generic.GenericFileSystem"}, + "git": { + "class": "fsspec.implementations.git.GitFileSystem", + "err": "Install pygit2 to browse local git repos", + }, + "github": { + "class": "fsspec.implementations.github.GithubFileSystem", + "err": "Install the requests package to use the github FS", + }, + "gs": { + "class": "gcsfs.GCSFileSystem", + "err": "Please install gcsfs to access Google Storage", + }, + "hdfs": { + "class": "fsspec.implementations.arrow.HadoopFileSystem", + "err": "pyarrow and local java libraries required for HDFS", + }, + "hf": { + "class": "huggingface_hub.HfFileSystem", + "err": "Install huggingface_hub to access HfFileSystem", + }, + "http": { + "class": 
"fsspec.implementations.http.HTTPFileSystem", + "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed', + }, + "https": { + "class": "fsspec.implementations.http.HTTPFileSystem", + "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed', + }, + "jlab": { + "class": "fsspec.implementations.jupyter.JupyterFileSystem", + "err": "Jupyter FS requires requests to be installed", + }, + "jupyter": { + "class": "fsspec.implementations.jupyter.JupyterFileSystem", + "err": "Jupyter FS requires requests to be installed", + }, + "lakefs": { + "class": "lakefs_spec.LakeFSFileSystem", + "err": "Please install lakefs-spec to access LakeFSFileSystem", + }, + "libarchive": { + "class": "fsspec.implementations.libarchive.LibArchiveFileSystem", + "err": "LibArchive requires to be installed", + }, + "local": {"class": "fsspec.implementations.local.LocalFileSystem"}, + "memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"}, + "oci": { + "class": "ocifs.OCIFileSystem", + "err": "Install ocifs to access OCI Object Storage", + }, + "ocilake": { + "class": "ocifs.OCIFileSystem", + "err": "Install ocifs to access OCI Data Lake", + }, + "oss": { + "class": "ossfs.OSSFileSystem", + "err": "Install ossfs to access Alibaba Object Storage System", + }, + "reference": {"class": "fsspec.implementations.reference.ReferenceFileSystem"}, + "root": { + "class": "fsspec_xrootd.XRootDFileSystem", + "err": ( + "Install fsspec-xrootd to access xrootd storage system. " + "Note: 'root' is the protocol name for xrootd storage systems, " + "not referring to root directories" + ), + }, + "s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"}, + "s3a": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"}, + "sftp": { + "class": "fsspec.implementations.sftp.SFTPFileSystem", + "err": 'SFTPFileSystem requires "paramiko" to be installed', + }, + "simplecache": {"class": "fsspec.implementations.cached.SimpleCacheFileSystem"}, + "smb": { + "class": "fsspec.implementations.smb.SMBFileSystem", + "err": 'SMB requires "smbprotocol" or "smbprotocol[kerberos]" installed', + }, + "ssh": { + "class": "fsspec.implementations.sftp.SFTPFileSystem", + "err": 'SFTPFileSystem requires "paramiko" to be installed', + }, + "tar": {"class": "fsspec.implementations.tar.TarFileSystem"}, + "tosfs": { + "class": "tosfs.TosFileSystem", + "err": "Install tosfs to access ByteDance volcano engine Tinder Object Storage", + }, + "wandb": {"class": "wandbfs.WandbFS", "err": "Install wandbfs to access wandb"}, + "webdav": { + "class": "webdav4.fsspec.WebdavFileSystem", + "err": "Install webdav4 to access WebDAV", + }, + "webhdfs": { + "class": "fsspec.implementations.webhdfs.WebHDFS", + "err": 'webHDFS access requires "requests" to be installed', + }, + "zip": {"class": "fsspec.implementations.zip.ZipFileSystem"}, +} + +assert list(known_implementations) == sorted(known_implementations), ( + "Not in alphabetical order" +) + + +def get_filesystem_class(protocol): + """Fetch named protocol implementation from the registry + + The dict ``known_implementations`` maps protocol names to the locations + of classes implementing the corresponding file-system. When used for the + first time, appropriate imports will happen and the class will be placed in + the registry. All subsequent calls will fetch directly from the registry. + + Some protocol implementations require additional dependencies, and so the + import may fail. 
In this case, the string in the "err" field of the + ``known_implementations`` will be given as the error message. + """ + if not protocol: + protocol = default + + if protocol not in registry: + if protocol not in known_implementations: + raise ValueError(f"Protocol not known: {protocol}") + bit = known_implementations[protocol] + try: + register_implementation(protocol, _import_class(bit["class"])) + except ImportError as e: + raise ImportError(bit["err"]) from e + cls = registry[protocol] + if getattr(cls, "protocol", None) in ("abstract", None): + cls.protocol = protocol + + return cls + + +s3_msg = """Your installed version of s3fs is very old and known to cause +severe performance issues, see also https://github.com/dask/dask/issues/10276 + +To fix, you should specify a lower version bound on s3fs, or +update the current installation. +""" + + +def _import_class(fqp: str): + """Take a fully-qualified path and return the imported class or identifier. + + ``fqp`` is of the form "package.module.klass" or + "package.module:subobject.klass". + + Warnings + -------- + This can import arbitrary modules. Make sure you haven't installed any modules + that may execute malicious code at import time. + """ + if ":" in fqp: + mod, name = fqp.rsplit(":", 1) + else: + mod, name = fqp.rsplit(".", 1) + + is_s3 = mod == "s3fs" + mod = importlib.import_module(mod) + if is_s3 and mod.__version__.split(".") < ["0", "5"]: + warnings.warn(s3_msg) + for part in name.split("."): + mod = getattr(mod, part) + + if not isinstance(mod, type): + raise TypeError(f"{fqp} is not a class") + + return mod + + +def filesystem(protocol, **storage_options): + """Instantiate filesystems for given protocol and arguments + + ``storage_options`` are specific to the protocol being chosen, and are + passed directly to the class. + """ + if protocol == "arrow_hdfs": + warnings.warn( + "The 'arrow_hdfs' protocol has been deprecated and will be " + "removed in the future. Specify it as 'hdfs'.", + DeprecationWarning, + ) + + cls = get_filesystem_class(protocol) + return cls(**storage_options) + + +def available_protocols(): + """Return a list of the implemented protocols. + + Note that any given protocol may require extra packages to be importable. + """ + return list(known_implementations) diff --git a/.venv/lib/python3.11/site-packages/fsspec/spec.py b/.venv/lib/python3.11/site-packages/fsspec/spec.py new file mode 100644 index 0000000000000000000000000000000000000000..d83cb099d02baca9d3fb665cbe992f22f3e0c1d0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/spec.py @@ -0,0 +1,2242 @@ +from __future__ import annotations + +import io +import json +import logging +import os +import threading +import warnings +import weakref +from errno import ESPIPE +from glob import has_magic +from hashlib import sha256 +from typing import Any, ClassVar + +from .callbacks import DEFAULT_CALLBACK +from .config import apply_config, conf +from .dircache import DirCache +from .transaction import Transaction +from .utils import ( + _unstrip_protocol, + glob_translate, + isfilelike, + other_paths, + read_block, + stringify_path, + tokenize, +) + +logger = logging.getLogger("fsspec") + + +def make_instance(cls, args, kwargs): + return cls(*args, **kwargs) + + +class _Cached(type): + """ + Metaclass for caching file system instances. + + Notes + ----- + Instances are cached according to + + * The values of the class attributes listed in `_extra_tokenize_attributes` + * The arguments passed to ``__init__``. 
+ + This creates an additional reference to the filesystem, which prevents the + filesystem from being garbage collected when all *user* references go away. + A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also* + be made for a filesystem instance to be garbage collected. + """ + + def __init__(cls, *args, **kwargs): + super().__init__(*args, **kwargs) + # Note: we intentionally create a reference here, to avoid garbage + # collecting instances when all other references are gone. To really + # delete a FileSystem, the cache must be cleared. + if conf.get("weakref_instance_cache"): # pragma: no cover + # debug option for analysing fork/spawn conditions + cls._cache = weakref.WeakValueDictionary() + else: + cls._cache = {} + cls._pid = os.getpid() + + def __call__(cls, *args, **kwargs): + kwargs = apply_config(cls, kwargs) + extra_tokens = tuple( + getattr(cls, attr, None) for attr in cls._extra_tokenize_attributes + ) + token = tokenize( + cls, cls._pid, threading.get_ident(), *args, *extra_tokens, **kwargs + ) + skip = kwargs.pop("skip_instance_cache", False) + if os.getpid() != cls._pid: + cls._cache.clear() + cls._pid = os.getpid() + if not skip and cls.cachable and token in cls._cache: + cls._latest = token + return cls._cache[token] + else: + obj = super().__call__(*args, **kwargs) + # Setting _fs_token here causes some static linters to complain. + obj._fs_token_ = token + obj.storage_args = args + obj.storage_options = kwargs + if obj.async_impl and obj.mirror_sync_methods: + from .asyn import mirror_sync_methods + + mirror_sync_methods(obj) + + if cls.cachable and not skip: + cls._latest = token + cls._cache[token] = obj + return obj + + +class AbstractFileSystem(metaclass=_Cached): + """ + An abstract super-class for pythonic file-systems + + Implementations are expected to be compatible with or, better, subclass + from here. + """ + + cachable = True # this class can be cached, instances reused + _cached = False + blocksize = 2**22 + sep = "/" + protocol: ClassVar[str | tuple[str, ...]] = "abstract" + _latest = None + async_impl = False + mirror_sync_methods = False + root_marker = "" # For some FSs, may require leading '/' or other character + transaction_type = Transaction + + #: Extra *class attributes* that should be considered when hashing. + _extra_tokenize_attributes = () + + # Set by _Cached metaclass + storage_args: tuple[Any, ...] + storage_options: dict[str, Any] + + def __init__(self, *args, **storage_options): + """Create and configure file-system instance + + Instances may be cachable, so if similar enough arguments are seen + a new instance is not required. The token attribute exists to allow + implementations to cache instances if they wish. + + A reasonable default should be provided if there are no arguments. + + Subclasses should call this method. + + Parameters + ---------- + use_listings_cache, listings_expiry_time, max_paths: + passed to ``DirCache``, if the implementation supports + directory listing caching. Pass use_listings_cache=False + to disable such caching. + skip_instance_cache: bool + If this is a cachable implementation, pass True here to force + creating a new instance even if a matching instance exists, and prevent + storing this instance. 
+ asynchronous: bool + loop: asyncio-compatible IOLoop or None + """ + if self._cached: + # reusing instance, don't change + return + self._cached = True + self._intrans = False + self._transaction = None + self._invalidated_caches_in_transaction = [] + self.dircache = DirCache(**storage_options) + + if storage_options.pop("add_docs", None): + warnings.warn("add_docs is no longer supported.", FutureWarning) + + if storage_options.pop("add_aliases", None): + warnings.warn("add_aliases has been removed.", FutureWarning) + # This is set in _Cached + self._fs_token_ = None + + @property + def fsid(self): + """Persistent filesystem id that can be used to compare filesystems + across sessions. + """ + raise NotImplementedError + + @property + def _fs_token(self): + return self._fs_token_ + + def __dask_tokenize__(self): + return self._fs_token + + def __hash__(self): + return int(self._fs_token, 16) + + def __eq__(self, other): + return isinstance(other, type(self)) and self._fs_token == other._fs_token + + def __reduce__(self): + return make_instance, (type(self), self.storage_args, self.storage_options) + + @classmethod + def _strip_protocol(cls, path): + """Turn path from fully-qualified to file-system-specific + + May require FS-specific handling, e.g., for relative paths or links. + """ + if isinstance(path, list): + return [cls._strip_protocol(p) for p in path] + path = stringify_path(path) + protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol + for protocol in protos: + if path.startswith(protocol + "://"): + path = path[len(protocol) + 3 :] + elif path.startswith(protocol + "::"): + path = path[len(protocol) + 2 :] + path = path.rstrip("/") + # use of root_marker to make minimum required path, e.g., "/" + return path or cls.root_marker + + def unstrip_protocol(self, name: str) -> str: + """Format FS-specific path to generic, including protocol""" + protos = (self.protocol,) if isinstance(self.protocol, str) else self.protocol + for protocol in protos: + if name.startswith(f"{protocol}://"): + return name + return f"{protos[0]}://{name}" + + @staticmethod + def _get_kwargs_from_urls(path): + """If kwargs can be encoded in the paths, extract them here + + This should happen before instantiation of the class; incoming paths + then should be amended to strip the options in methods. + + Examples may look like an sftp path "sftp://user@host:/my/path", where + the user and host should become kwargs and later get stripped. + """ + # by default, nothing happens + return {} + + @classmethod + def current(cls): + """Return the most recently instantiated FileSystem + + If no instance has been created, then create one with defaults + """ + if cls._latest in cls._cache: + return cls._cache[cls._latest] + return cls() + + @property + def transaction(self): + """A context within which files are committed together upon exit + + Requires the file class to implement `.commit()` and `.discard()` + for the normal and exception cases. + """ + if self._transaction is None: + self._transaction = self.transaction_type(self) + return self._transaction + + def start_transaction(self): + """Begin write transaction for deferring files, non-context version""" + self._intrans = True + self._transaction = self.transaction_type(self) + return self.transaction + + def end_transaction(self): + """Finish write transaction, non-context version""" + self.transaction.complete() + self._transaction = None + # The invalid cache must be cleared after the transaction is completed. 
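+        # These paths were queued by invalidate_cache() while the transaction
+        # was open; replay them now that the transaction has finished.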
+ for path in self._invalidated_caches_in_transaction: + self.invalidate_cache(path) + self._invalidated_caches_in_transaction.clear() + + def invalidate_cache(self, path=None): + """ + Discard any cached directory information + + Parameters + ---------- + path: string or None + If None, clear all listings cached else listings at or under given + path. + """ + # Not necessary to implement invalidation mechanism, may have no cache. + # But if have, you should call this method of parent class from your + # subclass to ensure expiring caches after transacations correctly. + # See the implementation of FTPFileSystem in ftp.py + if self._intrans: + self._invalidated_caches_in_transaction.append(path) + + def mkdir(self, path, create_parents=True, **kwargs): + """ + Create directory entry at path + + For systems that don't have true directories, may create an for + this instance only and not touch the real filesystem + + Parameters + ---------- + path: str + location + create_parents: bool + if True, this is equivalent to ``makedirs`` + kwargs: + may be permissions, etc. + """ + pass # not necessary to implement, may not have directories + + def makedirs(self, path, exist_ok=False): + """Recursively make directories + + Creates directory at path and any intervening required directories. + Raises exception if, for instance, the path already exists but is a + file. + + Parameters + ---------- + path: str + leaf directory name + exist_ok: bool (False) + If False, will error if the target already exists + """ + pass # not necessary to implement, may not have directories + + def rmdir(self, path): + """Remove a directory, if empty""" + pass # not necessary to implement, may not have directories + + def ls(self, path, detail=True, **kwargs): + """List objects at path. + + This should include subdirectories and files at that location. The + difference between a file and a directory must be clear when details + are requested. + + The specific keys, or perhaps a FileInfo class, or similar, is TBD, + but must be consistent across implementations. + Must include: + + - full path to the entry (without protocol) + - size of the entry, in bytes. If the value cannot be determined, will + be ``None``. + - type of entry, "file", "directory" or other + + Additional information + may be present, appropriate to the file-system, e.g., generation, + checksum, etc. + + May use refresh=True|False to allow use of self._ls_from_cache to + check for a saved listing and avoid calling the backend. This would be + common where listing may be expensive. + + Parameters + ---------- + path: str + detail: bool + if True, gives a list of dictionaries, where each is the same as + the result of ``info(path)``. If False, gives a list of paths + (str). + kwargs: may have additional backend-specific options, such as version + information + + Returns + ------- + List of strings if detail is False, or list of directory information + dicts if detail is True. + """ + raise NotImplementedError + + def _ls_from_cache(self, path): + """Check cache for listing + + Returns listing, if found (may be empty list for a directly that exists + but contains nothing), None if not in cache. 
+ """ + parent = self._parent(path) + try: + return self.dircache[path.rstrip("/")] + except KeyError: + pass + try: + files = [ + f + for f in self.dircache[parent] + if f["name"] == path + or (f["name"] == path.rstrip("/") and f["type"] == "directory") + ] + if len(files) == 0: + # parent dir was listed but did not contain this file + raise FileNotFoundError(path) + return files + except KeyError: + pass + + def walk(self, path, maxdepth=None, topdown=True, on_error="omit", **kwargs): + """Return all files under the given path. + + List all files, recursing into subdirectories; output is iterator-style, + like ``os.walk()``. For a simple list of files, ``find()`` is available. + + When topdown is True, the caller can modify the dirnames list in-place (perhaps + using del or slice assignment), and walk() will + only recurse into the subdirectories whose names remain in dirnames; + this can be used to prune the search, impose a specific order of visiting, + or even to inform walk() about directories the caller creates or renames before + it resumes walk() again. + Modifying dirnames when topdown is False has no effect. (see os.walk) + + Note that the "files" outputted will include anything that is not + a directory, such as links. + + Parameters + ---------- + path: str + Root to recurse into + maxdepth: int + Maximum recursion depth. None means limitless, but not recommended + on link-based file-systems. + topdown: bool (True) + Whether to walk the directory tree from the top downwards or from + the bottom upwards. + on_error: "omit", "raise", a callable + if omit (default), path with exception will simply be empty; + If raise, an underlying exception will be raised; + if callable, it will be called with a single OSError instance as argument + kwargs: passed to ``ls`` + """ + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + path = self._strip_protocol(path) + full_dirs = {} + dirs = {} + files = {} + + detail = kwargs.pop("detail", False) + try: + listing = self.ls(path, detail=True, **kwargs) + except (FileNotFoundError, OSError) as e: + if on_error == "raise": + raise + if callable(on_error): + on_error(e) + return + + for info in listing: + # each info name must be at least [path]/part , but here + # we check also for names like [path]/part/ + pathname = info["name"].rstrip("/") + name = pathname.rsplit("/", 1)[-1] + if info["type"] == "directory" and pathname != path: + # do not include "self" path + full_dirs[name] = pathname + dirs[name] = info + elif pathname == path: + # file-like with same name as give path + files[""] = info + else: + files[name] = info + + if not detail: + dirs = list(dirs) + files = list(files) + + if topdown: + # Yield before recursion if walking top down + yield path, dirs, files + + if maxdepth is not None: + maxdepth -= 1 + if maxdepth < 1: + if not topdown: + yield path, dirs, files + return + + for d in dirs: + yield from self.walk( + full_dirs[d], + maxdepth=maxdepth, + detail=detail, + topdown=topdown, + **kwargs, + ) + + if not topdown: + # Yield after recursion if walking bottom up + yield path, dirs, files + + def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs): + """List all files below path. + + Like posix ``find`` command without conditions + + Parameters + ---------- + path : str + maxdepth: int or None + If not None, the maximum number of levels to descend + withdirs: bool + Whether to include directory paths in the output. 
This is True + when used by glob, but users usually only want files. + kwargs are passed to ``ls``. + """ + # TODO: allow equivalent of -name parameter + path = self._strip_protocol(path) + out = {} + + # Add the root directory if withdirs is requested + # This is needed for posix glob compliance + if withdirs and path != "" and self.isdir(path): + out[path] = self.info(path) + + for _, dirs, files in self.walk(path, maxdepth, detail=True, **kwargs): + if withdirs: + files.update(dirs) + out.update({info["name"]: info for name, info in files.items()}) + if not out and self.isfile(path): + # walk works on directories, but find should also return [path] + # when path happens to be a file + out[path] = {} + names = sorted(out) + if not detail: + return names + else: + return {name: out[name] for name in names} + + def du(self, path, total=True, maxdepth=None, withdirs=False, **kwargs): + """Space used by files and optionally directories within a path + + Directory size does not include the size of its contents. + + Parameters + ---------- + path: str + total: bool + Whether to sum all the file sizes + maxdepth: int or None + Maximum number of directory levels to descend, None for unlimited. + withdirs: bool + Whether to include directory paths in the output. + kwargs: passed to ``find`` + + Returns + ------- + Dict of {path: size} if total=False, or int otherwise, where numbers + refer to bytes used. + """ + sizes = {} + if withdirs and self.isdir(path): + # Include top-level directory in output + info = self.info(path) + sizes[info["name"]] = info["size"] + for f in self.find(path, maxdepth=maxdepth, withdirs=withdirs, **kwargs): + info = self.info(f) + sizes[info["name"]] = info["size"] + if total: + return sum(sizes.values()) + else: + return sizes + + def glob(self, path, maxdepth=None, **kwargs): + """ + Find files by glob-matching. + + If the path ends with '/', only folders are returned. + + We support ``"**"``, + ``"?"`` and ``"[..]"``. We do not support ^ for pattern negation. + + The `maxdepth` option is applied on the first `**` found in the path. + + kwargs are passed to ``ls``. 
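+
+        Examples
+        --------
+        An illustrative call; the paths shown are hypothetical:
+
+        >>> fs.glob("data/**/*.csv")  # doctest: +SKIP
+        ['data/2023/jan.csv', 'data/2024/feb.csv']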
+ """ + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + import re + + seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,) + ends_with_sep = path.endswith(seps) # _strip_protocol strips trailing slash + path = self._strip_protocol(path) + append_slash_to_dirname = ends_with_sep or path.endswith( + tuple(sep + "**" for sep in seps) + ) + idx_star = path.find("*") if path.find("*") >= 0 else len(path) + idx_qmark = path.find("?") if path.find("?") >= 0 else len(path) + idx_brace = path.find("[") if path.find("[") >= 0 else len(path) + + min_idx = min(idx_star, idx_qmark, idx_brace) + + detail = kwargs.pop("detail", False) + + if not has_magic(path): + if self.exists(path, **kwargs): + if not detail: + return [path] + else: + return {path: self.info(path, **kwargs)} + else: + if not detail: + return [] # glob of non-existent returns empty + else: + return {} + elif "/" in path[:min_idx]: + min_idx = path[:min_idx].rindex("/") + root = path[: min_idx + 1] + depth = path[min_idx + 1 :].count("/") + 1 + else: + root = "" + depth = path[min_idx + 1 :].count("/") + 1 + + if "**" in path: + if maxdepth is not None: + idx_double_stars = path.find("**") + depth_double_stars = path[idx_double_stars:].count("/") + 1 + depth = depth - depth_double_stars + maxdepth + else: + depth = None + + allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs) + + pattern = glob_translate(path + ("/" if ends_with_sep else "")) + pattern = re.compile(pattern) + + out = { + p: info + for p, info in sorted(allpaths.items()) + if pattern.match( + p + "/" + if append_slash_to_dirname and info["type"] == "directory" + else p + ) + } + + if detail: + return out + else: + return list(out) + + def exists(self, path, **kwargs): + """Is there a file at the given path""" + try: + self.info(path, **kwargs) + return True + except: # noqa: E722 + # any exception allowed bar FileNotFoundError? + return False + + def lexists(self, path, **kwargs): + """If there is a file at the given path (including + broken links)""" + return self.exists(path) + + def info(self, path, **kwargs): + """Give details of entry at path + + Returns a single dictionary, with exactly the same information as ``ls`` + would with ``detail=True``. + + The default implementation calls ls and could be overridden by a + shortcut. kwargs are passed on to ```ls()``. + + Some file systems might not be able to measure the file's size, in + which case, the returned dict will include ``'size': None``. + + Returns + ------- + dict with keys: name (full path in the FS), size (in bytes), type (file, + directory, or something else) and other FS-specific keys. + """ + path = self._strip_protocol(path) + out = self.ls(self._parent(path), detail=True, **kwargs) + out = [o for o in out if o["name"].rstrip("/") == path] + if out: + return out[0] + out = self.ls(path, detail=True, **kwargs) + path = path.rstrip("/") + out1 = [o for o in out if o["name"].rstrip("/") == path] + if len(out1) == 1: + if "size" not in out1[0]: + out1[0]["size"] = None + return out1[0] + elif len(out1) > 1 or out: + return {"name": path, "size": 0, "type": "directory"} + else: + raise FileNotFoundError(path) + + def checksum(self, path): + """Unique value for current version of file + + If the checksum is the same from one moment to another, the contents + are guaranteed to be the same. If the checksum changes, the contents + *might* have changed. 
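+
+        For example, on a file that has not changed between two calls
+        (illustrative, hypothetical path):
+
+        >>> fs.checksum("data/a.csv") == fs.checksum("data/a.csv")  # doctest: +SKIP
+        True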
+ + This should normally be overridden; default will probably capture + creation/modification timestamp (which would be good) or maybe + access timestamp (which would be bad) + """ + return int(tokenize(self.info(path)), 16) + + def size(self, path): + """Size in bytes of file""" + return self.info(path).get("size", None) + + def sizes(self, paths): + """Size in bytes of each file in a list of paths""" + return [self.size(p) for p in paths] + + def isdir(self, path): + """Is this entry directory-like?""" + try: + return self.info(path)["type"] == "directory" + except OSError: + return False + + def isfile(self, path): + """Is this entry file-like?""" + try: + return self.info(path)["type"] == "file" + except: # noqa: E722 + return False + + def read_text(self, path, encoding=None, errors=None, newline=None, **kwargs): + """Get the contents of the file as a string. + + Parameters + ---------- + path: str + URL of file on this filesystems + encoding, errors, newline: same as `open`. + """ + with self.open( + path, + mode="r", + encoding=encoding, + errors=errors, + newline=newline, + **kwargs, + ) as f: + return f.read() + + def write_text( + self, path, value, encoding=None, errors=None, newline=None, **kwargs + ): + """Write the text to the given file. + + An existing file will be overwritten. + + Parameters + ---------- + path: str + URL of file on this filesystems + value: str + Text to write. + encoding, errors, newline: same as `open`. + """ + with self.open( + path, + mode="w", + encoding=encoding, + errors=errors, + newline=newline, + **kwargs, + ) as f: + return f.write(value) + + def cat_file(self, path, start=None, end=None, **kwargs): + """Get the content of a file + + Parameters + ---------- + path: URL of file on this filesystems + start, end: int + Bytes limits of the read. If negative, backwards from end, + like usual python slices. Either can be None for start or + end of file, respectively + kwargs: passed to ``open()``. + """ + # explicitly set buffering off? + with self.open(path, "rb", **kwargs) as f: + if start is not None: + if start >= 0: + f.seek(start) + else: + f.seek(max(0, f.size + start)) + if end is not None: + if end < 0: + end = f.size + end + return f.read(end - f.tell()) + return f.read() + + def pipe_file(self, path, value, mode="overwrite", **kwargs): + """Set the bytes of given file""" + if mode == "create" and self.exists(path): + # non-atomic but simple way; or could use "xb" in open(), which is likely + # not as well supported + raise FileExistsError + with self.open(path, "wb", **kwargs) as f: + f.write(value) + + def pipe(self, path, value=None, **kwargs): + """Put value into path + + (counterpart to ``cat``) + + Parameters + ---------- + path: string or dict(str, bytes) + If a string, a single remote location to put ``value`` bytes; if a dict, + a mapping of {path: bytesvalue}. + value: bytes, optional + If using a single path, these are the bytes to put there. Ignored if + ``path`` is a dict + """ + if isinstance(path, str): + self.pipe_file(self._strip_protocol(path), value, **kwargs) + elif isinstance(path, dict): + for k, v in path.items(): + self.pipe_file(self._strip_protocol(k), v, **kwargs) + else: + raise ValueError("path must be str or dict") + + def cat_ranges( + self, paths, starts, ends, max_gap=None, on_error="return", **kwargs + ): + """Get the contents of byte ranges from one or more files + + Parameters + ---------- + paths: list + A list of of filepaths on this filesystems + starts, ends: int or list + Bytes limits of the read. 
If using a single int, the same value will be + used to read all the specified files. + """ + if max_gap is not None: + raise NotImplementedError + if not isinstance(paths, list): + raise TypeError + if not isinstance(starts, list): + starts = [starts] * len(paths) + if not isinstance(ends, list): + ends = [ends] * len(paths) + if len(starts) != len(paths) or len(ends) != len(paths): + raise ValueError + out = [] + for p, s, e in zip(paths, starts, ends): + try: + out.append(self.cat_file(p, s, e)) + except Exception as e: + if on_error == "return": + out.append(e) + else: + raise + return out + + def cat(self, path, recursive=False, on_error="raise", **kwargs): + """Fetch (potentially multiple) paths' contents + + Parameters + ---------- + recursive: bool + If True, assume the path(s) are directories, and get all the + contained files + on_error : "raise", "omit", "return" + If raise, an underlying exception will be raised (converted to KeyError + if the type is in self.missing_exceptions); if omit, keys with exception + will simply not be included in the output; if "return", all keys are + included in the output, but the value will be bytes or an exception + instance. + kwargs: passed to cat_file + + Returns + ------- + dict of {path: contents} if there are multiple paths + or the path has been otherwise expanded + """ + paths = self.expand_path(path, recursive=recursive) + if ( + len(paths) > 1 + or isinstance(path, list) + or paths[0] != self._strip_protocol(path) + ): + out = {} + for path in paths: + try: + out[path] = self.cat_file(path, **kwargs) + except Exception as e: + if on_error == "raise": + raise + if on_error == "return": + out[path] = e + return out + else: + return self.cat_file(paths[0], **kwargs) + + def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, outfile=None, **kwargs): + """Copy single remote file to local""" + from .implementations.local import LocalFileSystem + + if isfilelike(lpath): + outfile = lpath + elif self.isdir(rpath): + os.makedirs(lpath, exist_ok=True) + return None + + fs = LocalFileSystem(auto_mkdir=True) + fs.makedirs(fs._parent(lpath), exist_ok=True) + + with self.open(rpath, "rb", **kwargs) as f1: + if outfile is None: + outfile = open(lpath, "wb") + + try: + callback.set_size(getattr(f1, "size", None)) + data = True + while data: + data = f1.read(self.blocksize) + segment_len = outfile.write(data) + if segment_len is None: + segment_len = len(data) + callback.relative_update(segment_len) + finally: + if not isfilelike(lpath): + outfile.close() + + def get( + self, + rpath, + lpath, + recursive=False, + callback=DEFAULT_CALLBACK, + maxdepth=None, + **kwargs, + ): + """Copy file(s) to local. + + Copies a specific file or tree of files (if recursive=True). If lpath + ends with a "/", it will be assumed to be a directory, and target files + will go within. Can submit a list of paths, which may be glob-patterns + and will be expanded. + + Calls get_file for each source. 
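+
+        For example (an illustrative sketch; the paths are hypothetical):
+
+        >>> fs.get("remote/data/", "local/data/", recursive=True)  # doctest: +SKIP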
+ """ + if isinstance(lpath, list) and isinstance(rpath, list): + # No need to expand paths when both source and destination + # are provided as lists + rpaths = rpath + lpaths = lpath + else: + from .implementations.local import ( + LocalFileSystem, + make_path_posix, + trailing_sep, + ) + + source_is_str = isinstance(rpath, str) + rpaths = self.expand_path(rpath, recursive=recursive, maxdepth=maxdepth) + if source_is_str and (not recursive or maxdepth is not None): + # Non-recursive glob does not copy directories + rpaths = [p for p in rpaths if not (trailing_sep(p) or self.isdir(p))] + if not rpaths: + return + + if isinstance(lpath, str): + lpath = make_path_posix(lpath) + + source_is_file = len(rpaths) == 1 + dest_is_dir = isinstance(lpath, str) and ( + trailing_sep(lpath) or LocalFileSystem().isdir(lpath) + ) + + exists = source_is_str and ( + (has_magic(rpath) and source_is_file) + or (not has_magic(rpath) and dest_is_dir and not trailing_sep(rpath)) + ) + lpaths = other_paths( + rpaths, + lpath, + exists=exists, + flatten=not source_is_str, + ) + + callback.set_size(len(lpaths)) + for lpath, rpath in callback.wrap(zip(lpaths, rpaths)): + with callback.branched(rpath, lpath) as child: + self.get_file(rpath, lpath, callback=child, **kwargs) + + def put_file( + self, lpath, rpath, callback=DEFAULT_CALLBACK, mode="overwrite", **kwargs + ): + """Copy single file to remote""" + if mode == "create" and self.exists(rpath): + raise FileExistsError + if os.path.isdir(lpath): + self.makedirs(rpath, exist_ok=True) + return None + + with open(lpath, "rb") as f1: + size = f1.seek(0, 2) + callback.set_size(size) + f1.seek(0) + + self.mkdirs(self._parent(os.fspath(rpath)), exist_ok=True) + with self.open(rpath, "wb", **kwargs) as f2: + while f1.tell() < size: + data = f1.read(self.blocksize) + segment_len = f2.write(data) + if segment_len is None: + segment_len = len(data) + callback.relative_update(segment_len) + + def put( + self, + lpath, + rpath, + recursive=False, + callback=DEFAULT_CALLBACK, + maxdepth=None, + **kwargs, + ): + """Copy file(s) from local. + + Copies a specific file or tree of files (if recursive=True). If rpath + ends with a "/", it will be assumed to be a directory, and target files + will go within. + + Calls put_file for each source. 
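+
+        For example (an illustrative sketch; the paths are hypothetical):
+
+        >>> fs.put("local/data/", "remote/data/", recursive=True)  # doctest: +SKIP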
+ """ + if isinstance(lpath, list) and isinstance(rpath, list): + # No need to expand paths when both source and destination + # are provided as lists + rpaths = rpath + lpaths = lpath + else: + from .implementations.local import ( + LocalFileSystem, + make_path_posix, + trailing_sep, + ) + + source_is_str = isinstance(lpath, str) + if source_is_str: + lpath = make_path_posix(lpath) + fs = LocalFileSystem() + lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth) + if source_is_str and (not recursive or maxdepth is not None): + # Non-recursive glob does not copy directories + lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))] + if not lpaths: + return + + source_is_file = len(lpaths) == 1 + dest_is_dir = isinstance(rpath, str) and ( + trailing_sep(rpath) or self.isdir(rpath) + ) + + rpath = ( + self._strip_protocol(rpath) + if isinstance(rpath, str) + else [self._strip_protocol(p) for p in rpath] + ) + exists = source_is_str and ( + (has_magic(lpath) and source_is_file) + or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath)) + ) + rpaths = other_paths( + lpaths, + rpath, + exists=exists, + flatten=not source_is_str, + ) + + callback.set_size(len(rpaths)) + for lpath, rpath in callback.wrap(zip(lpaths, rpaths)): + with callback.branched(lpath, rpath) as child: + self.put_file(lpath, rpath, callback=child, **kwargs) + + def head(self, path, size=1024): + """Get the first ``size`` bytes from file""" + with self.open(path, "rb") as f: + return f.read(size) + + def tail(self, path, size=1024): + """Get the last ``size`` bytes from file""" + with self.open(path, "rb") as f: + f.seek(max(-size, -f.size), 2) + return f.read() + + def cp_file(self, path1, path2, **kwargs): + raise NotImplementedError + + def copy( + self, path1, path2, recursive=False, maxdepth=None, on_error=None, **kwargs + ): + """Copy within two locations in the filesystem + + on_error : "raise", "ignore" + If raise, any not-found exceptions will be raised; if ignore any + not-found exceptions will cause the path to be skipped; defaults to + raise unless recursive is true, where the default is ignore + """ + if on_error is None and recursive: + on_error = "ignore" + elif on_error is None: + on_error = "raise" + + if isinstance(path1, list) and isinstance(path2, list): + # No need to expand paths when both source and destination + # are provided as lists + paths1 = path1 + paths2 = path2 + else: + from .implementations.local import trailing_sep + + source_is_str = isinstance(path1, str) + paths1 = self.expand_path(path1, recursive=recursive, maxdepth=maxdepth) + if source_is_str and (not recursive or maxdepth is not None): + # Non-recursive glob does not copy directories + paths1 = [p for p in paths1 if not (trailing_sep(p) or self.isdir(p))] + if not paths1: + return + + source_is_file = len(paths1) == 1 + dest_is_dir = isinstance(path2, str) and ( + trailing_sep(path2) or self.isdir(path2) + ) + + exists = source_is_str and ( + (has_magic(path1) and source_is_file) + or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1)) + ) + paths2 = other_paths( + paths1, + path2, + exists=exists, + flatten=not source_is_str, + ) + + for p1, p2 in zip(paths1, paths2): + try: + self.cp_file(p1, p2, **kwargs) + except FileNotFoundError: + if on_error == "raise": + raise + + def expand_path(self, path, recursive=False, maxdepth=None, **kwargs): + """Turn one or more globs or directories into a list of all matching paths + to files or directories. 
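+
+        For example (hypothetical paths):
+
+        >>> fs.expand_path("data/*.csv")  # doctest: +SKIP
+        ['data/a.csv', 'data/b.csv']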
+ + kwargs are passed to ``glob`` or ``find``, which may in turn call ``ls`` + """ + + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + if isinstance(path, (str, os.PathLike)): + out = self.expand_path([path], recursive, maxdepth) + else: + out = set() + path = [self._strip_protocol(p) for p in path] + for p in path: + if has_magic(p): + bit = set(self.glob(p, maxdepth=maxdepth, **kwargs)) + out |= bit + if recursive: + # glob call above expanded one depth so if maxdepth is defined + # then decrement it in expand_path call below. If it is zero + # after decrementing then avoid expand_path call. + if maxdepth is not None and maxdepth <= 1: + continue + out |= set( + self.expand_path( + list(bit), + recursive=recursive, + maxdepth=maxdepth - 1 if maxdepth is not None else None, + **kwargs, + ) + ) + continue + elif recursive: + rec = set( + self.find( + p, maxdepth=maxdepth, withdirs=True, detail=False, **kwargs + ) + ) + out |= rec + if p not in out and (recursive is False or self.exists(p)): + # should only check once, for the root + out.add(p) + if not out: + raise FileNotFoundError(path) + return sorted(out) + + def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs): + """Move file(s) from one location to another""" + if path1 == path2: + logger.debug("%s mv: The paths are the same, so no files were moved.", self) + else: + # explicitly raise exception to prevent data corruption + self.copy( + path1, path2, recursive=recursive, maxdepth=maxdepth, onerror="raise" + ) + self.rm(path1, recursive=recursive) + + def rm_file(self, path): + """Delete a file""" + self._rm(path) + + def _rm(self, path): + """Delete one file""" + # this is the old name for the method, prefer rm_file + raise NotImplementedError + + def rm(self, path, recursive=False, maxdepth=None): + """Delete files. + + Parameters + ---------- + path: str or list of str + File(s) to delete. + recursive: bool + If file(s) are directories, recursively delete contents and then + also remove the directory + maxdepth: int or None + Depth to pass to walk for finding files to delete, if recursive. + If None, there will be no limit and infinite recursion may be + possible. + """ + path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth) + for p in reversed(path): + self.rm_file(p) + + @classmethod + def _parent(cls, path): + path = cls._strip_protocol(path) + if "/" in path: + parent = path.rsplit("/", 1)[0].lstrip(cls.root_marker) + return cls.root_marker + parent + else: + return cls.root_marker + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + """Return raw bytes-mode file-like from the file-system""" + return AbstractBufferedFile( + self, + path, + mode, + block_size, + autocommit, + cache_options=cache_options, + **kwargs, + ) + + def open( + self, + path, + mode="rb", + block_size=None, + cache_options=None, + compression=None, + **kwargs, + ): + """ + Return a file-like object from the filesystem + + The resultant instance must function correctly in a context ``with`` + block. + + Parameters + ---------- + path: str + Target file + mode: str like 'rb', 'w' + See builtin ``open()`` + Mode "x" (exclusive write) may be implemented by the backend. Even if + it is, whether it is checked up front or on commit, and whether it is + atomic is implementation-dependent. 
+ block_size: int + Some indication of buffering - this is a value in bytes + cache_options : dict, optional + Extra arguments to pass through to the cache. + compression: string or None + If given, open file using compression codec. Can either be a compression + name (a key in ``fsspec.compression.compr``) or "infer" to guess the + compression from the filename suffix. + encoding, errors, newline: passed on to TextIOWrapper for text mode + """ + import io + + path = self._strip_protocol(path) + if "b" not in mode: + mode = mode.replace("t", "") + "b" + + text_kwargs = { + k: kwargs.pop(k) + for k in ["encoding", "errors", "newline"] + if k in kwargs + } + return io.TextIOWrapper( + self.open( + path, + mode, + block_size=block_size, + cache_options=cache_options, + compression=compression, + **kwargs, + ), + **text_kwargs, + ) + else: + ac = kwargs.pop("autocommit", not self._intrans) + f = self._open( + path, + mode=mode, + block_size=block_size, + autocommit=ac, + cache_options=cache_options, + **kwargs, + ) + if compression is not None: + from fsspec.compression import compr + from fsspec.core import get_compression + + compression = get_compression(path, compression) + compress = compr[compression] + f = compress(f, mode=mode[0]) + + if not ac and "r" not in mode: + self.transaction.files.append(f) + return f + + def touch(self, path, truncate=True, **kwargs): + """Create empty file, or update timestamp + + Parameters + ---------- + path: str + file location + truncate: bool + If True, always set file size to 0; if False, update timestamp and + leave file unchanged, if backend allows this + """ + if truncate or not self.exists(path): + with self.open(path, "wb", **kwargs): + pass + else: + raise NotImplementedError # update timestamp, if possible + + def ukey(self, path): + """Hash of file properties, to tell if it has changed""" + return sha256(str(self.info(path)).encode()).hexdigest() + + def read_block(self, fn, offset, length, delimiter=None): + """Read a block of bytes from + + Starting at ``offset`` of the file, read ``length`` bytes. If + ``delimiter`` is set then we ensure that the read starts and stops at + delimiter boundaries that follow the locations ``offset`` and ``offset + + length``. If ``offset`` is zero then we start at zero. The + bytestring returned WILL include the end delimiter string. + + If offset+length is beyond the eof, reads to eof. + + Parameters + ---------- + fn: string + Path to filename + offset: int + Byte offset to start read + length: int + Number of bytes to read. If None, read to end. + delimiter: bytes (optional) + Ensure reading starts and stops at delimiter bytestring + + Examples + -------- + >>> fs.read_block('data/file.csv', 0, 13) # doctest: +SKIP + b'Alice, 100\\nBo' + >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n') # doctest: +SKIP + b'Alice, 100\\nBob, 200\\n' + + Use ``length=None`` to read to the end of the file. + >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n') # doctest: +SKIP + b'Alice, 100\\nBob, 200\\nCharlie, 300' + + See Also + -------- + :func:`fsspec.utils.read_block` + """ + with self.open(fn, "rb") as f: + size = f.size + if length is None: + length = size + if size is not None and offset + length > size: + length = size - offset + return read_block(f, offset, length, delimiter) + + def to_json(self, *, include_password: bool = True) -> str: + """ + JSON representation of this filesystem instance. 
+ + Parameters + ---------- + include_password: bool, default True + Whether to include the password (if any) in the output. + + Returns + ------- + JSON string with keys ``cls`` (the python location of this class), + protocol (text name of this class's protocol, first one in case of + multiple), ``args`` (positional args, usually empty), and all other + keyword arguments as their own keys. + + Warnings + -------- + Serialized filesystems may contain sensitive information which have been + passed to the constructor, such as passwords and tokens. Make sure you + store and send them in a secure environment! + """ + from .json import FilesystemJSONEncoder + + return json.dumps( + self, + cls=type( + "_FilesystemJSONEncoder", + (FilesystemJSONEncoder,), + {"include_password": include_password}, + ), + ) + + @staticmethod + def from_json(blob: str) -> AbstractFileSystem: + """ + Recreate a filesystem instance from JSON representation. + + See ``.to_json()`` for the expected structure of the input. + + Parameters + ---------- + blob: str + + Returns + ------- + file system instance, not necessarily of this particular class. + + Warnings + -------- + This can import arbitrary modules (as determined by the ``cls`` key). + Make sure you haven't installed any modules that may execute malicious code + at import time. + """ + from .json import FilesystemJSONDecoder + + return json.loads(blob, cls=FilesystemJSONDecoder) + + def to_dict(self, *, include_password: bool = True) -> dict[str, Any]: + """ + JSON-serializable dictionary representation of this filesystem instance. + + Parameters + ---------- + include_password: bool, default True + Whether to include the password (if any) in the output. + + Returns + ------- + Dictionary with keys ``cls`` (the python location of this class), + protocol (text name of this class's protocol, first one in case of + multiple), ``args`` (positional args, usually empty), and all other + keyword arguments as their own keys. + + Warnings + -------- + Serialized filesystems may contain sensitive information which have been + passed to the constructor, such as passwords and tokens. Make sure you + store and send them in a secure environment! + """ + from .json import FilesystemJSONEncoder + + json_encoder = FilesystemJSONEncoder() + + cls = type(self) + proto = self.protocol + + storage_options = dict(self.storage_options) + if not include_password: + storage_options.pop("password", None) + + return dict( + cls=f"{cls.__module__}:{cls.__name__}", + protocol=proto[0] if isinstance(proto, (tuple, list)) else proto, + args=json_encoder.make_serializable(self.storage_args), + **json_encoder.make_serializable(storage_options), + ) + + @staticmethod + def from_dict(dct: dict[str, Any]) -> AbstractFileSystem: + """ + Recreate a filesystem instance from dictionary representation. + + See ``.to_dict()`` for the expected structure of the input. + + Parameters + ---------- + dct: Dict[str, Any] + + Returns + ------- + file system instance, not necessarily of this particular class. + + Warnings + -------- + This can import arbitrary modules (as determined by the ``cls`` key). + Make sure you haven't installed any modules that may execute malicious code + at import time. 
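+
+        Examples
+        --------
+        An illustrative round trip through ``to_dict``, using the bundled
+        ``memory`` filesystem:
+
+        >>> import fsspec
+        >>> fs = fsspec.filesystem("memory")
+        >>> fs2 = fsspec.AbstractFileSystem.from_dict(fs.to_dict())
+        >>> fs2.protocol
+        'memory'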
+ """ + from .json import FilesystemJSONDecoder + + json_decoder = FilesystemJSONDecoder() + + dct = dict(dct) # Defensive copy + + cls = FilesystemJSONDecoder.try_resolve_fs_cls(dct) + if cls is None: + raise ValueError("Not a serialized AbstractFileSystem") + + dct.pop("cls", None) + dct.pop("protocol", None) + + return cls( + *json_decoder.unmake_serializable(dct.pop("args", ())), + **json_decoder.unmake_serializable(dct), + ) + + def _get_pyarrow_filesystem(self): + """ + Make a version of the FS instance which will be acceptable to pyarrow + """ + # all instances already also derive from pyarrow + return self + + def get_mapper(self, root="", check=False, create=False, missing_exceptions=None): + """Create key/value store based on this file-system + + Makes a MutableMapping interface to the FS at the given root path. + See ``fsspec.mapping.FSMap`` for further details. + """ + from .mapping import FSMap + + return FSMap( + root, + self, + check=check, + create=create, + missing_exceptions=missing_exceptions, + ) + + @classmethod + def clear_instance_cache(cls): + """ + Clear the cache of filesystem instances. + + Notes + ----- + Unless overridden by setting the ``cachable`` class attribute to False, + the filesystem class stores a reference to newly created instances. This + prevents Python's normal rules around garbage collection from working, + since the instances refcount will not drop to zero until + ``clear_instance_cache`` is called. + """ + cls._cache.clear() + + def created(self, path): + """Return the created timestamp of a file as a datetime.datetime""" + raise NotImplementedError + + def modified(self, path): + """Return the modified timestamp of a file as a datetime.datetime""" + raise NotImplementedError + + def tree( + self, + path: str = "/", + recursion_limit: int = 2, + max_display: int = 25, + display_size: bool = False, + prefix: str = "", + is_last: bool = True, + first: bool = True, + indent_size: int = 4, + ) -> str: + """ + Return a tree-like structure of the filesystem starting from the given path as a string. + + Parameters + ---------- + path: Root path to start traversal from + recursion_limit: Maximum depth of directory traversal + max_display: Maximum number of items to display per directory + display_size: Whether to display file sizes + prefix: Current line prefix for visual tree structure + is_last: Whether current item is last in its level + first: Whether this is the first call (displays root path) + indent_size: Number of spaces by indent + + Returns + ------- + str: A string representing the tree structure. 
+ + Example + ------- + >>> from fsspec import filesystem + + >>> fs = filesystem('ftp', host='test.rebex.net', user='demo', password='password') + >>> tree = fs.tree(display_size=True, recursion_limit=3, indent_size=8, max_display=10) + >>> print(tree) + """ + + def format_bytes(n: int) -> str: + """Format bytes as text.""" + for prefix, k in ( + ("P", 2**50), + ("T", 2**40), + ("G", 2**30), + ("M", 2**20), + ("k", 2**10), + ): + if n >= 0.9 * k: + return f"{n / k:.2f} {prefix}b" + return f"{n}B" + + result = [] + + if first: + result.append(path) + + if recursion_limit: + indent = " " * indent_size + contents = self.ls(path, detail=True) + contents.sort( + key=lambda x: (x.get("type") != "directory", x.get("name", "")) + ) + + if max_display is not None and len(contents) > max_display: + displayed_contents = contents[:max_display] + remaining_count = len(contents) - max_display + else: + displayed_contents = contents + remaining_count = 0 + + for i, item in enumerate(displayed_contents): + is_last_item = (i == len(displayed_contents) - 1) and ( + remaining_count == 0 + ) + + branch = ( + "└" + ("─" * (indent_size - 2)) + if is_last_item + else "├" + ("─" * (indent_size - 2)) + ) + branch += " " + new_prefix = prefix + ( + indent if is_last_item else "│" + " " * (indent_size - 1) + ) + + name = os.path.basename(item.get("name", "")) + + if display_size and item.get("type") == "directory": + sub_contents = self.ls(item.get("name", ""), detail=True) + num_files = sum( + 1 for sub_item in sub_contents if sub_item.get("type") == "file" + ) + num_folders = sum( + 1 + for sub_item in sub_contents + if sub_item.get("type") == "directory" + ) + + if num_files == 0 and num_folders == 0: + size = " (empty folder)" + elif num_files == 0: + size = f" ({num_folders} subfolder{'s' if num_folders > 1 else ''})" + elif num_folders == 0: + size = f" ({num_files} file{'s' if num_files > 1 else ''})" + else: + size = f" ({num_files} file{'s' if num_files > 1 else ''}, {num_folders} subfolder{'s' if num_folders > 1 else ''})" + elif display_size and item.get("type") == "file": + size = f" ({format_bytes(item.get('size', 0))})" + else: + size = "" + + result.append(f"{prefix}{branch}{name}{size}") + + if item.get("type") == "directory" and recursion_limit > 0: + result.append( + self.tree( + path=item.get("name", ""), + recursion_limit=recursion_limit - 1, + max_display=max_display, + display_size=display_size, + prefix=new_prefix, + is_last=is_last_item, + first=False, + indent_size=indent_size, + ) + ) + + if remaining_count > 0: + more_message = f"{remaining_count} more item(s) not displayed." 
+ result.append( + f"{prefix}{'└' + ('─' * (indent_size - 2))} {more_message}" + ) + + return "\n".join(_ for _ in result if _) + + # ------------------------------------------------------------------------ + # Aliases + + def read_bytes(self, path, start=None, end=None, **kwargs): + """Alias of `AbstractFileSystem.cat_file`.""" + return self.cat_file(path, start=start, end=end, **kwargs) + + def write_bytes(self, path, value, **kwargs): + """Alias of `AbstractFileSystem.pipe_file`.""" + self.pipe_file(path, value, **kwargs) + + def makedir(self, path, create_parents=True, **kwargs): + """Alias of `AbstractFileSystem.mkdir`.""" + return self.mkdir(path, create_parents=create_parents, **kwargs) + + def mkdirs(self, path, exist_ok=False): + """Alias of `AbstractFileSystem.makedirs`.""" + return self.makedirs(path, exist_ok=exist_ok) + + def listdir(self, path, detail=True, **kwargs): + """Alias of `AbstractFileSystem.ls`.""" + return self.ls(path, detail=detail, **kwargs) + + def cp(self, path1, path2, **kwargs): + """Alias of `AbstractFileSystem.copy`.""" + return self.copy(path1, path2, **kwargs) + + def move(self, path1, path2, **kwargs): + """Alias of `AbstractFileSystem.mv`.""" + return self.mv(path1, path2, **kwargs) + + def stat(self, path, **kwargs): + """Alias of `AbstractFileSystem.info`.""" + return self.info(path, **kwargs) + + def disk_usage(self, path, total=True, maxdepth=None, **kwargs): + """Alias of `AbstractFileSystem.du`.""" + return self.du(path, total=total, maxdepth=maxdepth, **kwargs) + + def rename(self, path1, path2, **kwargs): + """Alias of `AbstractFileSystem.mv`.""" + return self.mv(path1, path2, **kwargs) + + def delete(self, path, recursive=False, maxdepth=None): + """Alias of `AbstractFileSystem.rm`.""" + return self.rm(path, recursive=recursive, maxdepth=maxdepth) + + def upload(self, lpath, rpath, recursive=False, **kwargs): + """Alias of `AbstractFileSystem.put`.""" + return self.put(lpath, rpath, recursive=recursive, **kwargs) + + def download(self, rpath, lpath, recursive=False, **kwargs): + """Alias of `AbstractFileSystem.get`.""" + return self.get(rpath, lpath, recursive=recursive, **kwargs) + + def sign(self, path, expiration=100, **kwargs): + """Create a signed URL representing the given path + + Some implementations allow temporary URLs to be generated, as a + way of delegating credentials. + + Parameters + ---------- + path : str + The path on the filesystem + expiration : int + Number of seconds to enable the URL for (if supported) + + Returns + ------- + URL : str + The signed URL + + Raises + ------ + NotImplementedError : if method is not implemented for a filesystem + """ + raise NotImplementedError("Sign is not implemented for this filesystem") + + def _isfilestore(self): + # Originally inherited from pyarrow DaskFileSystem. Keeping this + # here for backwards compatibility as long as pyarrow uses its + # legacy fsspec-compatible filesystems and thus accepts fsspec + # filesystems as well + return False + + +class AbstractBufferedFile(io.IOBase): + """Convenient class to derive from to provide buffering + + In the case that the backend does not provide a pythonic file-like object + already, this class contains much of the logic to build one. The only + methods that need to be overridden are ``_upload_chunk``, + ``_initiate_upload`` and ``_fetch_range``. 
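+
+    A minimal sketch of such a subclass (the ``_backend_*`` calls are
+    hypothetical placeholders, not real APIs):
+
+    >>> class MyBufferedFile(AbstractBufferedFile):  # doctest: +SKIP
+    ...     def _fetch_range(self, start, end):
+    ...         return self.fs._backend_read(self.path, start, end)
+    ...     def _initiate_upload(self):
+    ...         self.fs._backend_create(self.path)
+    ...     def _upload_chunk(self, final=False):
+    ...         self.fs._backend_append(self.path, self.buffer.getvalue())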
+ """ + + DEFAULT_BLOCK_SIZE = 5 * 2**20 + _details = None + + def __init__( + self, + fs, + path, + mode="rb", + block_size="default", + autocommit=True, + cache_type="readahead", + cache_options=None, + size=None, + **kwargs, + ): + """ + Template for files with buffered reading and writing + + Parameters + ---------- + fs: instance of FileSystem + path: str + location in file-system + mode: str + Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file + systems may be read-only, and some may not support append. + block_size: int + Buffer size for reading or writing, 'default' for class default + autocommit: bool + Whether to write to final destination; may only impact what + happens when file is being closed. + cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead" + Caching policy in read mode. See the definitions in ``core``. + cache_options : dict + Additional options passed to the constructor for the cache specified + by `cache_type`. + size: int + If given and in read mode, suppressed having to look up the file size + kwargs: + Gets stored as self.kwargs + """ + from .core import caches + + self.path = path + self.fs = fs + self.mode = mode + self.blocksize = ( + self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size + ) + self.loc = 0 + self.autocommit = autocommit + self.end = None + self.start = None + self.closed = False + + if cache_options is None: + cache_options = {} + + if "trim" in kwargs: + warnings.warn( + "Passing 'trim' to control the cache behavior has been deprecated. " + "Specify it within the 'cache_options' argument instead.", + FutureWarning, + ) + cache_options["trim"] = kwargs.pop("trim") + + self.kwargs = kwargs + + if mode not in {"ab", "rb", "wb", "xb"}: + raise NotImplementedError("File mode not supported") + if mode == "rb": + if size is not None: + self.size = size + else: + self.size = self.details["size"] + self.cache = caches[cache_type]( + self.blocksize, self._fetch_range, self.size, **cache_options + ) + else: + self.buffer = io.BytesIO() + self.offset = None + self.forced = False + self.location = None + + @property + def details(self): + if self._details is None: + self._details = self.fs.info(self.path) + return self._details + + @details.setter + def details(self, value): + self._details = value + self.size = value["size"] + + @property + def full_name(self): + return _unstrip_protocol(self.path, self.fs) + + @property + def closed(self): + # get around this attr being read-only in IOBase + # use getattr here, since this can be called during del + return getattr(self, "_closed", True) + + @closed.setter + def closed(self, c): + self._closed = c + + def __hash__(self): + if "w" in self.mode: + return id(self) + else: + return int(tokenize(self.details), 16) + + def __eq__(self, other): + """Files are equal if they have the same checksum, only in read mode""" + if self is other: + return True + return ( + isinstance(other, type(self)) + and self.mode == "rb" + and other.mode == "rb" + and hash(self) == hash(other) + ) + + def commit(self): + """Move from temp to final destination""" + + def discard(self): + """Throw away temporary file""" + + def info(self): + """File information about this path""" + if self.readable(): + return self.details + else: + raise ValueError("Info not available while writing") + + def tell(self): + """Current file location""" + return self.loc + + def seek(self, loc, whence=0): + """Set current file location + + Parameters + ---------- + loc: int + byte location + whence: {0, 
1, 2} + from start of file, current location or end of file, resp. + """ + loc = int(loc) + if not self.mode == "rb": + raise OSError(ESPIPE, "Seek only available in read mode") + if whence == 0: + nloc = loc + elif whence == 1: + nloc = self.loc + loc + elif whence == 2: + nloc = self.size + loc + else: + raise ValueError(f"invalid whence ({whence}, should be 0, 1 or 2)") + if nloc < 0: + raise ValueError("Seek before start of file") + self.loc = nloc + return self.loc + + def write(self, data): + """ + Write data to buffer. + + Buffer only sent on flush() or if buffer is greater than + or equal to blocksize. + + Parameters + ---------- + data: bytes + Set of bytes to be written. + """ + if not self.writable(): + raise ValueError("File not in write mode") + if self.closed: + raise ValueError("I/O operation on closed file.") + if self.forced: + raise ValueError("This file has been force-flushed, can only close") + out = self.buffer.write(data) + self.loc += out + if self.buffer.tell() >= self.blocksize: + self.flush() + return out + + def flush(self, force=False): + """ + Write buffered data to backend store. + + Writes the current buffer, if it is larger than the block-size, or if + the file is being closed. + + Parameters + ---------- + force: bool + When closing, write the last block even if it is smaller than + blocks are allowed to be. Disallows further writing to this file. + """ + + if self.closed: + raise ValueError("Flush on closed file") + if force and self.forced: + raise ValueError("Force flush cannot be called more than once") + if force: + self.forced = True + + if self.readable(): + # no-op to flush on read-mode + return + + if not force and self.buffer.tell() < self.blocksize: + # Defer write on small block + return + + if self.offset is None: + # Initialize a multipart upload + self.offset = 0 + try: + self._initiate_upload() + except: + self.closed = True + raise + + if self._upload_chunk(final=force) is not False: + self.offset += self.buffer.seek(0, 2) + self.buffer = io.BytesIO() + + def _upload_chunk(self, final=False): + """Write one part of a multi-block file upload + + Parameters + ========== + final: bool + This is the last block, so should complete file, if + self.autocommit is True. + """ + # may not yet have been initialized, may need to call _initialize_upload + + def _initiate_upload(self): + """Create remote file/upload""" + pass + + def _fetch_range(self, start, end): + """Get the specified set of bytes from remote""" + return self.fs.cat_file(self.path, start=start, end=end) + + def read(self, length=-1): + """ + Return data from cache, or fetch pieces as necessary + + Parameters + ---------- + length: int (-1) + Number of bytes to read; if <0, all remaining bytes. 
+ """ + length = -1 if length is None else int(length) + if self.mode != "rb": + raise ValueError("File not in read mode") + if length < 0: + length = self.size - self.loc + if self.closed: + raise ValueError("I/O operation on closed file.") + if length == 0: + # don't even bother calling fetch + return b"" + out = self.cache._fetch(self.loc, self.loc + length) + + logger.debug( + "%s read: %i - %i %s", + self, + self.loc, + self.loc + length, + self.cache._log_stats(), + ) + self.loc += len(out) + return out + + def readinto(self, b): + """mirrors builtin file's readinto method + + https://docs.python.org/3/library/io.html#io.RawIOBase.readinto + """ + out = memoryview(b).cast("B") + data = self.read(out.nbytes) + out[: len(data)] = data + return len(data) + + def readuntil(self, char=b"\n", blocks=None): + """Return data between current position and first occurrence of char + + char is included in the output, except if the end of the tile is + encountered first. + + Parameters + ---------- + char: bytes + Thing to find + blocks: None or int + How much to read in each go. Defaults to file blocksize - which may + mean a new read on every call. + """ + out = [] + while True: + start = self.tell() + part = self.read(blocks or self.blocksize) + if len(part) == 0: + break + found = part.find(char) + if found > -1: + out.append(part[: found + len(char)]) + self.seek(start + found + len(char)) + break + out.append(part) + return b"".join(out) + + def readline(self): + """Read until and including the first occurrence of newline character + + Note that, because of character encoding, this is not necessarily a + true line ending. + """ + return self.readuntil(b"\n") + + def __next__(self): + out = self.readline() + if out: + return out + raise StopIteration + + def __iter__(self): + return self + + def readlines(self): + """Return all data, split by the newline character, including the newline character""" + data = self.read() + lines = data.split(b"\n") + out = [l + b"\n" for l in lines[:-1]] + if data.endswith(b"\n"): + return out + else: + return out + [lines[-1]] + # return list(self) ??? 
+ + def readinto1(self, b): + return self.readinto(b) + + def close(self): + """Close file + + Finalizes writes, discards cache + """ + if getattr(self, "_unclosable", False): + return + if self.closed: + return + try: + if self.mode == "rb": + self.cache = None + else: + if not self.forced: + self.flush(force=True) + + if self.fs is not None: + self.fs.invalidate_cache(self.path) + self.fs.invalidate_cache(self.fs._parent(self.path)) + finally: + self.closed = True + + def readable(self): + """Whether opened for reading""" + return "r" in self.mode and not self.closed + + def seekable(self): + """Whether is seekable (only in read mode)""" + return self.readable() + + def writable(self): + """Whether opened for writing""" + return self.mode in {"wb", "ab", "xb"} and not self.closed + + def __reduce__(self): + if self.mode != "rb": + raise RuntimeError("Pickling a writeable file is not supported") + + return reopen, ( + self.fs, + self.path, + self.mode, + self.blocksize, + self.loc, + self.size, + self.autocommit, + self.cache.name if self.cache else "none", + self.kwargs, + ) + + def __del__(self): + if not self.closed: + self.close() + + def __str__(self): + return f"" + + __repr__ = __str__ + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + +def reopen(fs, path, mode, blocksize, loc, size, autocommit, cache_type, kwargs): + file = fs.open( + path, + mode=mode, + block_size=blocksize, + autocommit=autocommit, + cache_type=cache_type, + size=size, + **kwargs, + ) + if loc > 0: + file.seek(loc) + return file diff --git a/.venv/lib/python3.11/site-packages/fsspec/transaction.py b/.venv/lib/python3.11/site-packages/fsspec/transaction.py new file mode 100644 index 0000000000000000000000000000000000000000..77293f63ecc5f611e19d849ef236d53e9c258efc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/transaction.py @@ -0,0 +1,90 @@ +from collections import deque + + +class Transaction: + """Filesystem transaction write context + + Gathers files for deferred commit or discard, so that several write + operations can be finalized semi-atomically. 
This works by having this + instance as the ``.transaction`` attribute of the given filesystem + """ + + def __init__(self, fs, **kwargs): + """ + Parameters + ---------- + fs: FileSystem instance + """ + self.fs = fs + self.files = deque() + + def __enter__(self): + self.start() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """End transaction and commit, if exit is not due to exception""" + # only commit if there was no exception + self.complete(commit=exc_type is None) + if self.fs: + self.fs._intrans = False + self.fs._transaction = None + self.fs = None + + def start(self): + """Start a transaction on this FileSystem""" + self.files = deque() # clean up after previous failed completions + self.fs._intrans = True + + def complete(self, commit=True): + """Finish transaction: commit or discard all deferred files""" + while self.files: + f = self.files.popleft() + if commit: + f.commit() + else: + f.discard() + self.fs._intrans = False + self.fs._transaction = None + self.fs = None + + +class FileActor: + def __init__(self): + self.files = [] + + def commit(self): + for f in self.files: + f.commit() + self.files.clear() + + def discard(self): + for f in self.files: + f.discard() + self.files.clear() + + def append(self, f): + self.files.append(f) + + +class DaskTransaction(Transaction): + def __init__(self, fs): + """ + Parameters + ---------- + fs: FileSystem instance + """ + import distributed + + super().__init__(fs) + client = distributed.default_client() + self.files = client.submit(FileActor, actor=True).result() + + def complete(self, commit=True): + """Finish transaction: commit or discard all deferred files""" + if commit: + self.files.commit().result() + else: + self.files.discard().result() + self.fs._intrans = False + self.fs = None diff --git a/.venv/lib/python3.11/site-packages/fsspec/utils.py b/.venv/lib/python3.11/site-packages/fsspec/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..917af8d19c560212a9757d11a16213f54ec77356 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/utils.py @@ -0,0 +1,739 @@ +from __future__ import annotations + +import contextlib +import logging +import math +import os +import re +import sys +import tempfile +from functools import partial +from hashlib import md5 +from importlib.metadata import version +from typing import ( + IO, + TYPE_CHECKING, + Any, + Callable, + Iterable, + Iterator, + Sequence, + TypeVar, +) +from urllib.parse import urlsplit + +if TYPE_CHECKING: + import pathlib + + from typing_extensions import TypeGuard + + from fsspec.spec import AbstractFileSystem + + +DEFAULT_BLOCK_SIZE = 5 * 2**20 + +T = TypeVar("T") + + +def infer_storage_options( + urlpath: str, inherit_storage_options: dict[str, Any] | None = None +) -> dict[str, Any]: + """Infer storage options from URL path and merge it with existing storage + options. + + Parameters + ---------- + urlpath: str or unicode + Either local absolute file path or URL (hdfs://namenode:8020/file.csv) + inherit_storage_options: dict (optional) + Its contents will get merged with the inferred information from the + given path + + Returns + ------- + Storage options dict. + + Examples + -------- + >>> infer_storage_options('/mnt/datasets/test.csv') # doctest: +SKIP + {"protocol": "file", "path", "/mnt/datasets/test.csv"} + >>> infer_storage_options( + ... 'hdfs://username:pwd@node:123/mnt/datasets/test.csv?q=1', + ... inherit_storage_options={'extra': 'value'}, + ... 
) # doctest: +SKIP + {"protocol": "hdfs", "username": "username", "password": "pwd", + "host": "node", "port": 123, "path": "/mnt/datasets/test.csv", + "url_query": "q=1", "extra": "value"} + """ + # Handle Windows paths including disk name in this special case + if ( + re.match(r"^[a-zA-Z]:[\\/]", urlpath) + or re.match(r"^[a-zA-Z0-9]+://", urlpath) is None + ): + return {"protocol": "file", "path": urlpath} + + parsed_path = urlsplit(urlpath) + protocol = parsed_path.scheme or "file" + if parsed_path.fragment: + path = "#".join([parsed_path.path, parsed_path.fragment]) + else: + path = parsed_path.path + if protocol == "file": + # Special case parsing file protocol URL on Windows according to: + # https://msdn.microsoft.com/en-us/library/jj710207.aspx + windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path) + if windows_path: + drive, path = windows_path.groups() + path = f"{drive}:{path}" + + if protocol in ["http", "https"]: + # for HTTP, we don't want to parse, as requests will anyway + return {"protocol": protocol, "path": urlpath} + + options: dict[str, Any] = {"protocol": protocol, "path": path} + + if parsed_path.netloc: + # Parse `hostname` from netloc manually because `parsed_path.hostname` + # lowercases the hostname which is not always desirable (e.g. in S3): + # https://github.com/dask/dask/issues/1417 + options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0] + + if protocol in ("s3", "s3a", "gcs", "gs"): + options["path"] = options["host"] + options["path"] + else: + options["host"] = options["host"] + if parsed_path.port: + options["port"] = parsed_path.port + if parsed_path.username: + options["username"] = parsed_path.username + if parsed_path.password: + options["password"] = parsed_path.password + + if parsed_path.query: + options["url_query"] = parsed_path.query + if parsed_path.fragment: + options["url_fragment"] = parsed_path.fragment + + if inherit_storage_options: + update_storage_options(options, inherit_storage_options) + + return options + + +def update_storage_options( + options: dict[str, Any], inherited: dict[str, Any] | None = None +) -> None: + if not inherited: + inherited = {} + collisions = set(options) & set(inherited) + if collisions: + for collision in collisions: + if options.get(collision) != inherited.get(collision): + raise KeyError( + f"Collision between inferred and specified storage " + f"option:\n{collision}" + ) + options.update(inherited) + + +# Compression extensions registered via fsspec.compression.register_compression +compressions: dict[str, str] = {} + + +def infer_compression(filename: str) -> str | None: + """Infer compression, if available, from filename. + + Infer a named compression type, if registered and available, from filename + extension. This includes builtin (gz, bz2, zip) compressions, as well as + optional compressions. See fsspec.compression.register_compression. 
+ """ + extension = os.path.splitext(filename)[-1].strip(".").lower() + if extension in compressions: + return compressions[extension] + return None + + +def build_name_function(max_int: float) -> Callable[[int], str]: + """Returns a function that receives a single integer + and returns it as a string padded by enough zero characters + to align with maximum possible integer + + >>> name_f = build_name_function(57) + + >>> name_f(7) + '07' + >>> name_f(31) + '31' + >>> build_name_function(1000)(42) + '0042' + >>> build_name_function(999)(42) + '042' + >>> build_name_function(0)(0) + '0' + """ + # handle corner cases max_int is 0 or exact power of 10 + max_int += 1e-8 + + pad_length = int(math.ceil(math.log10(max_int))) + + def name_function(i: int) -> str: + return str(i).zfill(pad_length) + + return name_function + + +def seek_delimiter(file: IO[bytes], delimiter: bytes, blocksize: int) -> bool: + r"""Seek current file to file start, file end, or byte after delimiter seq. + + Seeks file to next chunk delimiter, where chunks are defined on file start, + a delimiting sequence, and file end. Use file.tell() to see location afterwards. + Note that file start is a valid split, so must be at offset > 0 to seek for + delimiter. + + Parameters + ---------- + file: a file + delimiter: bytes + a delimiter like ``b'\n'`` or message sentinel, matching file .read() type + blocksize: int + Number of bytes to read from the file at once. + + + Returns + ------- + Returns True if a delimiter was found, False if at file start or end. + + """ + + if file.tell() == 0: + # beginning-of-file, return without seek + return False + + # Interface is for binary IO, with delimiter as bytes, but initialize last + # with result of file.read to preserve compatibility with text IO. + last: bytes | None = None + while True: + current = file.read(blocksize) + if not current: + # end-of-file without delimiter + return False + full = last + current if last else current + try: + if delimiter in full: + i = full.index(delimiter) + file.seek(file.tell() - (len(full) - i) + len(delimiter)) + return True + elif len(current) < blocksize: + # end-of-file without delimiter + return False + except (OSError, ValueError): + pass + last = full[-len(delimiter) :] + + +def read_block( + f: IO[bytes], + offset: int, + length: int | None, + delimiter: bytes | None = None, + split_before: bool = False, +) -> bytes: + """Read a block of bytes from a file + + Parameters + ---------- + f: File + Open file + offset: int + Byte offset to start read + length: int + Number of bytes to read, read through end of file if None + delimiter: bytes (optional) + Ensure reading starts and stops at delimiter bytestring + split_before: bool (optional) + Start/stop read *before* delimiter bytestring. + + + If using the ``delimiter=`` keyword argument we ensure that the read + starts and stops at delimiter boundaries that follow the locations + ``offset`` and ``offset + length``. If ``offset`` is zero then we + start at zero, regardless of delimiter. The bytestring returned WILL + include the terminating delimiter string. 
+ + Examples + -------- + + >>> from io import BytesIO # doctest: +SKIP + >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300') # doctest: +SKIP + >>> read_block(f, 0, 13) # doctest: +SKIP + b'Alice, 100\\nBo' + + >>> read_block(f, 0, 13, delimiter=b'\\n') # doctest: +SKIP + b'Alice, 100\\nBob, 200\\n' + + >>> read_block(f, 10, 10, delimiter=b'\\n') # doctest: +SKIP + b'Bob, 200\\nCharlie, 300' + """ + if delimiter: + f.seek(offset) + found_start_delim = seek_delimiter(f, delimiter, 2**16) + if length is None: + return f.read() + start = f.tell() + length -= start - offset + + f.seek(start + length) + found_end_delim = seek_delimiter(f, delimiter, 2**16) + end = f.tell() + + # Adjust split location to before delimiter if seek found the + # delimiter sequence, not start or end of file. + if found_start_delim and split_before: + start -= len(delimiter) + + if found_end_delim and split_before: + end -= len(delimiter) + + offset = start + length = end - start + + f.seek(offset) + + # TODO: allow length to be None and read to the end of the file? + assert length is not None + b = f.read(length) + return b + + +def tokenize(*args: Any, **kwargs: Any) -> str: + """Deterministic token + + (modified from dask.base) + + >>> tokenize([1, 2, '3']) + '9d71491b50023b06fc76928e6eddb952' + + >>> tokenize('Hello') == tokenize('Hello') + True + """ + if kwargs: + args += (kwargs,) + try: + h = md5(str(args).encode()) + except ValueError: + # FIPS systems: https://github.com/fsspec/filesystem_spec/issues/380 + h = md5(str(args).encode(), usedforsecurity=False) + return h.hexdigest() + + +def stringify_path(filepath: str | os.PathLike[str] | pathlib.Path) -> str: + """Attempt to convert a path-like object to a string. + + Parameters + ---------- + filepath: object to be converted + + Returns + ------- + filepath_str: maybe a string version of the object + + Notes + ----- + Objects supporting the fspath protocol are coerced according to its + __fspath__ method. + + For backwards compatibility with older Python version, pathlib.Path + objects are specially coerced. + + Any other object is passed through unchanged, which includes bytes, + strings, buffers, or anything else that's not even path-like. + """ + if isinstance(filepath, str): + return filepath + elif hasattr(filepath, "__fspath__"): + return filepath.__fspath__() + elif hasattr(filepath, "path"): + return filepath.path + else: + return filepath # type: ignore[return-value] + + +def make_instance( + cls: Callable[..., T], args: Sequence[Any], kwargs: dict[str, Any] +) -> T: + inst = cls(*args, **kwargs) + inst._determine_worker() # type: ignore[attr-defined] + return inst + + +def common_prefix(paths: Iterable[str]) -> str: + """For a list of paths, find the shortest prefix common to all""" + parts = [p.split("/") for p in paths] + lmax = min(len(p) for p in parts) + end = 0 + for i in range(lmax): + end = all(p[i] == parts[0][i] for p in parts) + if not end: + break + i += end + return "/".join(parts[0][:i]) + + +def other_paths( + paths: list[str], + path2: str | list[str], + exists: bool = False, + flatten: bool = False, +) -> list[str]: + """In bulk file operations, construct a new file tree from a list of files + + Parameters + ---------- + paths: list of str + The input file tree + path2: str or list of str + Root to construct the new list in. If this is already a list of str, we just + assert it has the right number of elements. 
+ exists: bool (optional) + For a str destination, it is already exists (and is a dir), files should + end up inside. + flatten: bool (optional) + Whether to flatten the input directory tree structure so that the output files + are in the same directory. + + Returns + ------- + list of str + """ + + if isinstance(path2, str): + path2 = path2.rstrip("/") + + if flatten: + path2 = ["/".join((path2, p.split("/")[-1])) for p in paths] + else: + cp = common_prefix(paths) + if exists: + cp = cp.rsplit("/", 1)[0] + if not cp and all(not s.startswith("/") for s in paths): + path2 = ["/".join([path2, p]) for p in paths] + else: + path2 = [p.replace(cp, path2, 1) for p in paths] + else: + assert len(paths) == len(path2) + return path2 + + +def is_exception(obj: Any) -> bool: + return isinstance(obj, BaseException) + + +def isfilelike(f: Any) -> TypeGuard[IO[bytes]]: + return all(hasattr(f, attr) for attr in ["read", "close", "tell"]) + + +def get_protocol(url: str) -> str: + url = stringify_path(url) + parts = re.split(r"(\:\:|\://)", url, maxsplit=1) + if len(parts) > 1: + return parts[0] + return "file" + + +def can_be_local(path: str) -> bool: + """Can the given URL be used with open_local?""" + from fsspec import get_filesystem_class + + try: + return getattr(get_filesystem_class(get_protocol(path)), "local_file", False) + except (ValueError, ImportError): + # not in registry or import failed + return False + + +def get_package_version_without_import(name: str) -> str | None: + """For given package name, try to find the version without importing it + + Import and package.__version__ is still the backup here, so an import + *might* happen. + + Returns either the version string, or None if the package + or the version was not readily found. + """ + if name in sys.modules: + mod = sys.modules[name] + if hasattr(mod, "__version__"): + return mod.__version__ + try: + return version(name) + except: # noqa: E722 + pass + try: + import importlib + + mod = importlib.import_module(name) + return mod.__version__ + except (ImportError, AttributeError): + return None + + +def setup_logging( + logger: logging.Logger | None = None, + logger_name: str | None = None, + level: str = "DEBUG", + clear: bool = True, +) -> logging.Logger: + if logger is None and logger_name is None: + raise ValueError("Provide either logger object or logger name") + logger = logger or logging.getLogger(logger_name) + handle = logging.StreamHandler() + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s -- %(message)s" + ) + handle.setFormatter(formatter) + if clear: + logger.handlers.clear() + logger.addHandler(handle) + logger.setLevel(level) + return logger + + +def _unstrip_protocol(name: str, fs: AbstractFileSystem) -> str: + return fs.unstrip_protocol(name) + + +def mirror_from( + origin_name: str, methods: Iterable[str] +) -> Callable[[type[T]], type[T]]: + """Mirror attributes and methods from the given + origin_name attribute of the instance to the + decorated class""" + + def origin_getter(method: str, self: Any) -> Any: + origin = getattr(self, origin_name) + return getattr(origin, method) + + def wrapper(cls: type[T]) -> type[T]: + for method in methods: + wrapped_method = partial(origin_getter, method) + setattr(cls, method, property(wrapped_method)) + return cls + + return wrapper + + +@contextlib.contextmanager +def nullcontext(obj: T) -> Iterator[T]: + yield obj + + +def merge_offset_ranges( + paths: list[str], + starts: list[int] | int, + ends: list[int] | int, + max_gap: int = 0, + 
max_block: int | None = None, + sort: bool = True, +) -> tuple[list[str], list[int], list[int]]: + """Merge adjacent byte-offset ranges when the inter-range + gap is <= `max_gap`, and when the merged byte range does not + exceed `max_block` (if specified). By default, this function + will re-order the input paths and byte ranges to ensure sorted + order. If the user can guarantee that the inputs are already + sorted, passing `sort=False` will skip the re-ordering. + """ + # Check input + if not isinstance(paths, list): + raise TypeError + if not isinstance(starts, list): + starts = [starts] * len(paths) + if not isinstance(ends, list): + ends = [ends] * len(paths) + if len(starts) != len(paths) or len(ends) != len(paths): + raise ValueError + + # Early Return + if len(starts) <= 1: + return paths, starts, ends + + starts = [s or 0 for s in starts] + # Sort by paths and then ranges if `sort=True` + if sort: + paths, starts, ends = ( + list(v) + for v in zip( + *sorted( + zip(paths, starts, ends), + ) + ) + ) + + if paths: + # Loop through the coupled `paths`, `starts`, and + # `ends`, and merge adjacent blocks when appropriate + new_paths = paths[:1] + new_starts = starts[:1] + new_ends = ends[:1] + for i in range(1, len(paths)): + if paths[i] == paths[i - 1] and new_ends[-1] is None: + continue + elif ( + paths[i] != paths[i - 1] + or ((starts[i] - new_ends[-1]) > max_gap) + or (max_block is not None and (ends[i] - new_starts[-1]) > max_block) + ): + # Cannot merge with previous block. + # Add new `paths`, `starts`, and `ends` elements + new_paths.append(paths[i]) + new_starts.append(starts[i]) + new_ends.append(ends[i]) + else: + # Merge with previous block by updating the + # last element of `ends` + new_ends[-1] = ends[i] + return new_paths, new_starts, new_ends + + # `paths` is empty. Just return input lists + return paths, starts, ends + + +def file_size(filelike: IO[bytes]) -> int: + """Find length of any open read-mode file-like""" + pos = filelike.tell() + try: + return filelike.seek(0, 2) + finally: + filelike.seek(pos) + + +@contextlib.contextmanager +def atomic_write(path: str, mode: str = "wb"): + """ + A context manager that opens a temporary file next to `path` and, on exit, + replaces `path` with the temporary file, thereby updating `path` + atomically. + """ + fd, fn = tempfile.mkstemp( + dir=os.path.dirname(path), prefix=os.path.basename(path) + "-" + ) + try: + with open(fd, mode) as fp: + yield fp + except BaseException: + with contextlib.suppress(FileNotFoundError): + os.unlink(fn) + raise + else: + os.replace(fn, path) + + +def _translate(pat, STAR, QUESTION_MARK): + # Copied from: https://github.com/python/cpython/pull/106703. + res: list[str] = [] + add = res.append + i, n = 0, len(pat) + while i < n: + c = pat[i] + i = i + 1 + if c == "*": + # compress consecutive `*` into one + if (not res) or res[-1] is not STAR: + add(STAR) + elif c == "?": + add(QUESTION_MARK) + elif c == "[": + j = i + if j < n and pat[j] == "!": + j = j + 1 + if j < n and pat[j] == "]": + j = j + 1 + while j < n and pat[j] != "]": + j = j + 1 + if j >= n: + add("\\[") + else: + stuff = pat[i:j] + if "-" not in stuff: + stuff = stuff.replace("\\", r"\\") + else: + chunks = [] + k = i + 2 if pat[i] == "!" else i + 1 + while True: + k = pat.find("-", k, j) + if k < 0: + break + chunks.append(pat[i:k]) + i = k + 1 + k = k + 3 + chunk = pat[i:j] + if chunk: + chunks.append(chunk) + else: + chunks[-1] += "-" + # Remove empty ranges -- invalid in RE. 
+ for k in range(len(chunks) - 1, 0, -1): + if chunks[k - 1][-1] > chunks[k][0]: + chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:] + del chunks[k] + # Escape backslashes and hyphens for set difference (--). + # Hyphens that create ranges shouldn't be escaped. + stuff = "-".join( + s.replace("\\", r"\\").replace("-", r"\-") for s in chunks + ) + # Escape set operations (&&, ~~ and ||). + stuff = re.sub(r"([&~|])", r"\\\1", stuff) + i = j + 1 + if not stuff: + # Empty range: never match. + add("(?!)") + elif stuff == "!": + # Negated empty range: match any character. + add(".") + else: + if stuff[0] == "!": + stuff = "^" + stuff[1:] + elif stuff[0] in ("^", "["): + stuff = "\\" + stuff + add(f"[{stuff}]") + else: + add(re.escape(c)) + assert i == n + return res + + +def glob_translate(pat): + # Copied from: https://github.com/python/cpython/pull/106703. + # The keyword parameters' values are fixed to: + # recursive=True, include_hidden=True, seps=None + """Translate a pathname with shell wildcards to a regular expression.""" + if os.path.altsep: + seps = os.path.sep + os.path.altsep + else: + seps = os.path.sep + escaped_seps = "".join(map(re.escape, seps)) + any_sep = f"[{escaped_seps}]" if len(seps) > 1 else escaped_seps + not_sep = f"[^{escaped_seps}]" + one_last_segment = f"{not_sep}+" + one_segment = f"{one_last_segment}{any_sep}" + any_segments = f"(?:.+{any_sep})?" + any_last_segments = ".*" + results = [] + parts = re.split(any_sep, pat) + last_part_idx = len(parts) - 1 + for idx, part in enumerate(parts): + if part == "*": + results.append(one_segment if idx < last_part_idx else one_last_segment) + continue + if part == "**": + results.append(any_segments if idx < last_part_idx else any_last_segments) + continue + elif "**" in part: + raise ValueError( + "Invalid pattern: '**' can only be an entire path component" + ) + if part: + results.extend(_translate(part, f"{not_sep}*", not_sep)) + if idx < last_part_idx: + results.append(any_sep) + res = "".join(results) + return rf"(?s:{res})\Z" diff --git a/.venv/lib/python3.11/site-packages/functorch/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5cee75f44126f5115f6000e9cf2449673857f09a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/_src/__init__.py b/.venv/lib/python3.11/site-packages/functorch/_src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/functorch/_src/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/_src/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c3a8f7adadc3f73bbf688b8ce765285788303c9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/_src/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__init__.py b/.venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..94f258df84ba8730208768fc44222bee4b3ebc33 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__init__.py @@ -0,0 +1,8 @@ +# This file has moved to under torch/_functorch. 
It is not public API. +# If you are not a PyTorch developer and you are relying on the following +# imports, please file an issue. +from torch._functorch.aot_autograd import ( + aot_autograd_decompositions, + KNOWN_TYPES, + PytreeThunk, +) diff --git a/.venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..defc7ffeacc51d925c3515de3ca74bceb50cc33c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__init__.py b/.venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6052b5548f4af3dbc6d9d45b0ffe72a8d5013d41 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__init__.py @@ -0,0 +1,7 @@ +# This file has moved to under torch/_functorch. It is not public API. +# If you are not a PyTorch developer and you are relying on the following +# imports, please file an issue. +from torch._functorch.eager_transforms import ( + _assert_wrapped_functional, + _unwrap_functional_tensor, +) diff --git a/.venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a6d61e909b945e4e8837f5dd2409ca0a3c5a8be Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/_src/make_functional/__init__.py b/.venv/lib/python3.11/site-packages/functorch/_src/make_functional/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3de7787df0c3304207b42b51e9fb62da9d33c7d0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/_src/make_functional/__init__.py @@ -0,0 +1,4 @@ +# This file has moved to under torch/_functorch. It is not public API. +# If you are not a PyTorch developer and you are relying on the following +# imports, please file an issue. +from torch._functorch.make_functional import _swap_state diff --git a/.venv/lib/python3.11/site-packages/functorch/_src/make_functional/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/_src/make_functional/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f01cc206ca3a333e6d9bbb6f862780e3c9203ce Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/_src/make_functional/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/_src/vmap/__init__.py b/.venv/lib/python3.11/site-packages/functorch/_src/vmap/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dc90517753e50f92362ba954248e31f69f7cfcd5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/_src/vmap/__init__.py @@ -0,0 +1,16 @@ +# This file has moved to under torch/_functorch. It is not public API. +# If you are not a PyTorch developer and you are relying on the following +# imports, please file an issue. 
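+# A hedged illustration (not part of this shim): code that still relies on
+# these private helpers should import them from the new location directly,
+# e.g.
+#
+#     from torch._functorch.vmap import tree_flatten, tree_unflatten
+#
+# instead of going through functorch._src.vmap.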
+from torch._functorch.vmap import ( + _add_batch_dim, + _broadcast_to_and_flatten, + _create_batched_inputs, + _get_name, + _process_batched_inputs, + _remove_batch_dim, + _unwrap_batched, + _validate_and_get_batch_size, + Tensor, + tree_flatten, + tree_unflatten, +) diff --git a/.venv/lib/python3.11/site-packages/functorch/_src/vmap/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/_src/vmap/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e041a47b3bc933a213d41ffe26f8fb1a96bb97b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/_src/vmap/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/compile/__init__.py b/.venv/lib/python3.11/site-packages/functorch/compile/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e7548a5ff6b91bae4fa561f0de7ad5d3492eda05 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/compile/__init__.py @@ -0,0 +1,30 @@ +from torch._functorch import config +from torch._functorch.aot_autograd import ( + aot_function, + aot_module, + aot_module_simplified, + compiled_function, + compiled_module, + get_aot_compilation_context, + get_aot_graph_name, + get_graph_being_compiled, + make_boxed_compiler, + make_boxed_func, +) +from torch._functorch.compilers import ( + debug_compile, + default_decompositions, + draw_graph_compile, + memory_efficient_fusion, + nnc_jit, + nop, + print_compile, + ts_compile, +) +from torch._functorch.fx_minifier import minifier +from torch._functorch.partitioners import ( + default_partition, + draw_graph, + min_cut_rematerialization_partition, +) +from torch._functorch.python_key import pythonkey_decompose diff --git a/.venv/lib/python3.11/site-packages/functorch/compile/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/compile/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be57d2d487eee870712174bce9d28342c3ea7cf5 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/compile/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/__init__.py b/.venv/lib/python3.11/site-packages/functorch/dim/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a6de6ad59e9501b13cf5ba934c810e53ad61f6a3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/dim/__init__.py @@ -0,0 +1,181 @@ +import dis +import inspect +from typing import Sequence, Union + +import functorch._C +import torch +from functorch._C import dim as _C + +from .tree_map import tree_flatten, tree_map +from .wrap_type import wrap_type + + +_C._patch_tensor_class() +dims, DimList, dimlists = _C.dims, _C.DimList, _C.dimlists + + +class DimensionMismatchError(Exception): + pass + + +class DimensionBindError(Exception): + pass + + +from . import op_properties + + +# use dict to avoid writing C++ bindings for set +pointwise = dict.fromkeys(op_properties.pointwise, True) + +use_c = True +if not use_c: + from . import reference + + +class _Tensor: + # fast path around slow wrapping/unwrapping logic for simply queries used + # by the implementation... 
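+    # Rough usage sketch of the first-class dimensions this class backs
+    # (illustration only; ``x`` stands for an ordinary torch.Tensor):
+    #
+    #     batch, channel = dims(2)            # two unbound Dim objects
+    #     y = x[batch, channel]               # bind dims by indexing
+    #     z = (y * 2).order(batch, channel)   # back to a positional tensor
+    #
+    # Pointwise operators on such tensors take the fast path recorded in
+    # ``pointwise`` above.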
+ + @property + def dims(self): + return tuple(d for d in self._levels if isinstance(d, Dim)) + + def dim(self): + return self.ndim + + if use_c: + __torch_function__ = classmethod(_C.__torch_function__) + expand = _C._instancemethod(_C.expand) + else: + __torch_function__ = reference.__torch_function__ + expand = reference.expand + + index = _C._instancemethod(_C.index) + + def __repr__(self): + tensor, levels, ndim = self._tensor, self._levels, self.ndim + return f"{tensor}\nwith dims={tuple(l + ndim if isinstance(l, int) else l for l in levels)} sizes={tuple(tensor.size())}" + + +TensorLike = (_Tensor, torch.Tensor) + + +class Dim(_C.Dim, _Tensor): + # note that _C.Dim comes before tensor because we want the Dim API for things like size to take precendence. + # Tensor defines format, but we want to print Dims with special formatting + __format__ = object.__format__ + + +class Tensor(_Tensor, _C.Tensor): + if not use_c: + from_batched = staticmethod(_C.Tensor_from_batched) + from_positional = staticmethod(_C.Tensor_from_positional) + sum = _C._instancemethod(_C.Tensor_sum) + + +def cat(tensors, dim, new_dim): + n = dims() + return stack(tensors, n, dim).index([n, dim], new_dim) + + +if use_c: + _wrap = _C._wrap + + def _def(name, *args, **kwargs): + orig = getattr(torch.Tensor, name) + setattr(_Tensor, name, _C._instancemethod(_wrap(orig, *args, **kwargs))) + + t__getitem__ = _C._instancemethod(_C.__getitem__) + stack = _C.stack + split = _C._instancemethod(_C.split) +else: + _wrap, _def = reference._wrap, reference._def + t__getitem__ = reference.t__getitem__ + stack = reference.stack + split = reference.split + +# note: there is no python reference +t__setitem__ = _C._instancemethod(_C.__setitem__) +# this is patched in the C API because otherwise torch.Tensor will +# no longer be considered a sequence and things will break +# torch.Tensor.__getitem__ = t__getitem__ + +_Tensor.__getitem__ = t__getitem__ +# torch.Tensor.__setitem__ = t__setitem__ +_Tensor.__setitem__ = t__setitem__ + +torch.Tensor.split = split +_Tensor.split = split +torch.Tensor.expand = _C._instancemethod(_C.expand) +torch.Tensor.index = _C._instancemethod(_C.index) +wrap_type(use_c, _Tensor, torch.Tensor, _Tensor.__torch_function__) +del _Tensor.ndim + +if use_c: + _Tensor.order = _C._instancemethod(_C.order) +else: + _Tensor.order = reference.positional + +_def("mean") +_def("sum") +_def("all") +_def("amax") +_def("amin") +_def("aminmax") +_def("any") +_def("count_nonzero") +_def("logsumexp") +_def("nanmean") +_def("nansum") +_def("prod") +_def("std", keepdim_offset=2) +_def("var", keepdim_offset=2) +_def("max", single_dim=True) +_def("min", single_dim=True) +_def("argmax", single_dim=True) +_def("argmin", single_dim=True) +_def("kthvalue", single_dim=True) +_def("median", single_dim=True) +_def("nanmedian", single_dim=True) +_def("mode", single_dim=True) +_def("sort", reduce=False) +_def("argsort", reduce=False) +_def("unbind", single_dim=True) +_def("chunk", dim_offset=1, reduce=False) +_def("cummax", single_dim=True, reduce=False) +_def("cummin", single_dim=True, reduce=False) +_def("cumprod", single_dim=True, reduce=False) +_def("cumprod_", single_dim=True, reduce=False) +_def("cumsum", single_dim=True, reduce=False) +_def("cumsum_", single_dim=True, reduce=False) +_def("logcumsumexp", single_dim=True, reduce=False) +_def("renorm", dim_offset=1, single_dim=True, reduce=False) +_def("softmax", single_dim=True, reduce=False) +softmax = _wrap(torch.nn.functional.softmax, single_dim=True, reduce=False) + +# stuff 
to handle in the future, because they require special +# binding logic for dims +# cross +# diag_embed +# diagonal +# diagonal_scatter +# diff +# nanquantile +# quantile +# roll +# rot90 +# topk (new dimes on output) +# should these all be subsumed by inplace indexing? +# index_add_ +# index_add +# index_copy +# index_copy_ +# index_fill +# index_fill_ +# index_select +# scatter +# scatter_ +# scatter_add +# scatter_add_ +# scatter_reduce diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..efddbc3d1e65baa4b021b977eba384d9f6ee594b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/batch_tensor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/batch_tensor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2bab4b883b1f03a770bd58586ee75559a7bb6e23 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/batch_tensor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/delayed_mul_tensor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/delayed_mul_tensor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d5506751768e7f5202bd932082739e042284b9c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/delayed_mul_tensor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/dim.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/dim.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c199da6dcebd8cf52b0932e0d6d439ca647c0b4a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/dim.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/magic_trace.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/magic_trace.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c54af62a25b464bc8d1a7a2eee155ad1e82674c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/magic_trace.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/op_properties.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/op_properties.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b93d3b7cafc04cabc40dccb3977fb978567d4e7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/op_properties.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/reference.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/reference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7a8c8996c2e5102a65f153503aa5a58c6ca8ec7a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/reference.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/tree_map.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/tree_map.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..daddf32ac83367bcc91a607bbeff1a08530c7f1f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/tree_map.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/wrap_type.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/wrap_type.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c431e1e329fac82f9e1d59d27fb14981b7b8581 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/wrap_type.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/batch_tensor.py b/.venv/lib/python3.11/site-packages/functorch/dim/batch_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..dae9b270896e988151741409b666a82ba1aba5a4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/dim/batch_tensor.py @@ -0,0 +1,26 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from contextlib import contextmanager + +from torch._C._functorch import _vmap_add_layers, _vmap_remove_layers + + +_enabled = False + + +@contextmanager +def _enable_layers(dims): + global _enabled + assert not _enabled + input = sorted((d._level, d.size) for d in dims if not isinstance(d, int)) + n = len(input) + try: + _vmap_add_layers(input) + _enabled = True + yield + finally: + _enabled = False + _vmap_remove_layers(n) diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/delayed_mul_tensor.py b/.venv/lib/python3.11/site-packages/functorch/dim/delayed_mul_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..3984a063885907141b56bdd2c6e8cc730c592cbb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/dim/delayed_mul_tensor.py @@ -0,0 +1,77 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import torch + +from . 
import _Tensor, Tensor +from .reference import _dims, _enable_layers, llist, ltuple + + +class DelayedMulTensor(_Tensor): + def __init__(self, lhs, rhs): + self._lhs, self._rhs = lhs, rhs + self._data = None + self._levels_data = None + self._has_device = lhs._has_device or rhs._has_device + self._batchtensor_data = None + self._tensor_data = None + + @property + def _levels(self): + if self._levels_data is None: + levels = llist(self._lhs._levels) + for l in self._rhs._levels: + if l not in levels: + levels.append(l) + self._levels_data = ltuple(levels) + return self._levels_data + + @property + def _batchtensor(self): + if self._batchtensor_data is None: + with _enable_layers(self._levels): + print("bt multiply fallback") + self._batchtensor_data = self._lhs._batchtensor * self._rhs._batchtensor + return self._batchtensor_data + + @property + def _tensor(self): + if self._tensor_data is None: + self._tensor_data = Tensor.from_batched( + self._batchtensor, self._has_device + )._tensor + return self._tensor_data + + @property + def ndim(self): + return self._batchtensor.ndim + + @property + def dims(self): + return ltuple(super().dims) + + def sum(self, dim): + dims = _dims(dim, 0, False, False) + n = ord("a") + all_levels = self._levels + + def to_char(d): + return chr(n + all_levels.index(d)) + + plhs, levelslhs = self._lhs._tensor, self._lhs._levels + prhs, levelsrhs = self._rhs._tensor, self._rhs._levels + new_dims = tuple(d for d in self.dims if d not in dims) + new_levels = [l for l in self._levels if l not in dims] + fmt = "".join( + [ + *(to_char(d) for d in levelslhs), + ",", + *(to_char(d) for d in levelsrhs), + "->", + *(to_char(d) for d in new_levels), + ] + ) + result_data = torch.einsum(fmt, (plhs, prhs)) + return Tensor.from_positional(result_data, new_levels, True) diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/dim.py b/.venv/lib/python3.11/site-packages/functorch/dim/dim.py new file mode 100644 index 0000000000000000000000000000000000000000..cbafce2f0ee0c433d82c880a81a1e34eba519a96 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/dim/dim.py @@ -0,0 +1,121 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import dis +import inspect +from dataclasses import dataclass +from typing import Union + +from . import DimList + + +_vmap_levels = [] + + +@dataclass +class LevelInfo: + level: int + alive: bool = True + + +class Dim: + def __init__(self, name: str, size: Union[None, int] = None): + self.name = name + self._size = None + self._vmap_level = None + if size is not None: + self.size = size + + def __del__(self): + if self._vmap_level is not None: + _vmap_active_levels[self._vmap_stack].alive = False # noqa: F821 + while ( + not _vmap_levels[-1].alive + and current_level() == _vmap_levels[-1].level # noqa: F821 + ): + _vmap_decrement_nesting() # noqa: F821 + _vmap_levels.pop() + + @property + def size(self): + assert self.is_bound + return self._size + + @size.setter + def size(self, size: int): + from . 
import DimensionBindError + + if self._size is None: + self._size = size + self._vmap_level = _vmap_increment_nesting(size, "same") # noqa: F821 + self._vmap_stack = len(_vmap_levels) + _vmap_levels.append(LevelInfo(self._vmap_level)) + + elif self._size != size: + raise DimensionBindError( + f"Dim '{self}' previously bound to a dimension of size {self._size} cannot bind to a dimension of size {size}" + ) + + @property + def is_bound(self): + return self._size is not None + + def __repr__(self): + return self.name + + +def extract_name(inst): + assert inst.opname == "STORE_FAST" or inst.opname == "STORE_NAME" + return inst.argval + + +_cache = {} + + +def dims(lists=0): + frame = inspect.currentframe() + assert frame is not None + calling_frame = frame.f_back + assert calling_frame is not None + code, lasti = calling_frame.f_code, calling_frame.f_lasti + key = (code, lasti) + if key not in _cache: + first = lasti // 2 + 1 + instructions = list(dis.get_instructions(calling_frame.f_code)) + unpack = instructions[first] + + if unpack.opname == "STORE_FAST" or unpack.opname == "STORE_NAME": + # just a single dim, not a list + name = unpack.argval + ctor = Dim if lists == 0 else DimList + _cache[key] = lambda: ctor(name=name) + else: + assert unpack.opname == "UNPACK_SEQUENCE" + ndims = unpack.argval + names = tuple( + extract_name(instructions[first + 1 + i]) for i in range(ndims) + ) + first_list = len(names) - lists + _cache[key] = lambda: tuple( + Dim(n) if i < first_list else DimList(name=n) + for i, n in enumerate(names) + ) + return _cache[key]() + + +def _dim_set(positional, arg): + def convert(a): + if isinstance(a, Dim): + return a + else: + assert isinstance(a, int) + return positional[a] + + if arg is None: + return positional + elif not isinstance(arg, (Dim, int)): + return tuple(convert(a) for a in arg) + else: + return (convert(arg),) diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/magic_trace.py b/.venv/lib/python3.11/site-packages/functorch/dim/magic_trace.py new file mode 100644 index 0000000000000000000000000000000000000000..5c962a898ca79cfe3d8af7432aacc3802d4f4ade --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/dim/magic_trace.py @@ -0,0 +1,42 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
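+# Rough usage sketch for the ``magic_trace`` context manager defined below
+# (illustration only; ``run_workload`` is a hypothetical callable):
+#
+#     with magic_trace(output="trace.fxt"):
+#         run_workload()
+#
+# On exit the attached magic-trace process receives SIGINT and the profile
+# is written to ``output``.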
+import os +import signal +import subprocess +from contextlib import contextmanager + + +@contextmanager +def magic_trace(output="trace.fxt", magic_trace_cache="/tmp/magic-trace"): + pid = os.getpid() + if not os.path.exists(magic_trace_cache): + print(f"Downloading magic_trace to: {magic_trace_cache}") + subprocess.run( + [ + "wget", + "-O", + magic_trace_cache, + "-q", + "https://github.com/janestreet/magic-trace/releases/download/v1.0.2/magic-trace", + ] + ) + subprocess.run(["chmod", "+x", magic_trace_cache]) + args = [magic_trace_cache, "attach", "-pid", str(pid), "-o", output] + p = subprocess.Popen(args, stderr=subprocess.PIPE, encoding="utf-8") + while True: + x = p.stderr.readline() + print(x) + if "Attached" in x: + break + try: + yield + finally: + p.send_signal(signal.SIGINT) + r = p.wait() + print(p.stderr.read()) + p.stderr.close() + if r != 0: + raise ValueError(f"magic_trace exited abnormally: {r}") diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/op_properties.py b/.venv/lib/python3.11/site-packages/functorch/dim/op_properties.py new file mode 100644 index 0000000000000000000000000000000000000000..01313f71f030d58ce76c15c7f8516c4a0bdcf48a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/dim/op_properties.py @@ -0,0 +1,312 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import torch + + +# pointwise operators can go through a faster pathway + +tensor_magic_methods = ["add", ""] +pointwise_magic_methods_with_reverse = ( + "add", + "sub", + "mul", + "floordiv", + "div", + "truediv", + "mod", + "pow", + "lshift", + "rshift", + "and", + "or", + "xor", +) +pointwise_magic_methods = ( + *(x for m in pointwise_magic_methods_with_reverse for x in (m, "r" + m)), + "eq", + "gt", + "le", + "lt", + "ge", + "gt", + "ne", + "neg", + "pos", + "abs", + "invert", + "iadd", + "isub", + "imul", + "ifloordiv", + "idiv", + "itruediv", + "imod", + "ipow", + "ilshift", + "irshift", + "iand", + "ior", + "ixor", + "int", + "long", + "float", + "complex", +) + +pointwise_methods = (*(f"__{m}__" for m in pointwise_magic_methods),) + +pointwise = ( + *(getattr(torch.Tensor, m) for m in pointwise_methods), + torch.nn.functional.dropout, + torch.where, + torch.Tensor.abs, + torch.abs, + torch.Tensor.acos, + torch.acos, + torch.Tensor.acosh, + torch.acosh, + torch.Tensor.add, + torch.add, + torch.Tensor.addcdiv, + torch.addcdiv, + torch.Tensor.addcmul, + torch.addcmul, + torch.Tensor.addr, + torch.addr, + torch.Tensor.angle, + torch.angle, + torch.Tensor.asin, + torch.asin, + torch.Tensor.asinh, + torch.asinh, + torch.Tensor.atan, + torch.atan, + torch.Tensor.atan2, + torch.atan2, + torch.Tensor.atanh, + torch.atanh, + torch.Tensor.bitwise_and, + torch.bitwise_and, + torch.Tensor.bitwise_left_shift, + torch.bitwise_left_shift, + torch.Tensor.bitwise_not, + torch.bitwise_not, + torch.Tensor.bitwise_or, + torch.bitwise_or, + torch.Tensor.bitwise_right_shift, + torch.bitwise_right_shift, + torch.Tensor.bitwise_xor, + torch.bitwise_xor, + torch.Tensor.ceil, + torch.ceil, + torch.celu, + torch.nn.functional.celu, + torch.Tensor.clamp, + torch.clamp, + torch.Tensor.clamp_max, + torch.clamp_max, + torch.Tensor.clamp_min, + torch.clamp_min, + torch.Tensor.copysign, + torch.copysign, + torch.Tensor.cos, + torch.cos, + torch.Tensor.cosh, + torch.cosh, + torch.Tensor.deg2rad, + torch.deg2rad, + torch.Tensor.digamma, + 
torch.digamma, + torch.Tensor.div, + torch.div, + torch.dropout, + torch.nn.functional.dropout, + torch.nn.functional.elu, + torch.Tensor.eq, + torch.eq, + torch.Tensor.erf, + torch.erf, + torch.Tensor.erfc, + torch.erfc, + torch.Tensor.erfinv, + torch.erfinv, + torch.Tensor.exp, + torch.exp, + torch.Tensor.exp2, + torch.exp2, + torch.Tensor.expm1, + torch.expm1, + torch.feature_dropout, + torch.Tensor.float_power, + torch.float_power, + torch.Tensor.floor, + torch.floor, + torch.Tensor.floor_divide, + torch.floor_divide, + torch.Tensor.fmod, + torch.fmod, + torch.Tensor.frac, + torch.frac, + torch.Tensor.frexp, + torch.frexp, + torch.Tensor.gcd, + torch.gcd, + torch.Tensor.ge, + torch.ge, + torch.nn.functional.gelu, + torch.nn.functional.glu, + torch.Tensor.gt, + torch.gt, + torch.Tensor.hardshrink, + torch.hardshrink, + torch.nn.functional.hardshrink, + torch.nn.functional.hardsigmoid, + torch.nn.functional.hardswish, + torch.nn.functional.hardtanh, + torch.Tensor.heaviside, + torch.heaviside, + torch.Tensor.hypot, + torch.hypot, + torch.Tensor.i0, + torch.i0, + torch.Tensor.igamma, + torch.igamma, + torch.Tensor.igammac, + torch.igammac, + torch.Tensor.isclose, + torch.isclose, + torch.Tensor.isfinite, + torch.isfinite, + torch.Tensor.isinf, + torch.isinf, + torch.Tensor.isnan, + torch.isnan, + torch.Tensor.isneginf, + torch.isneginf, + torch.Tensor.isposinf, + torch.isposinf, + torch.Tensor.isreal, + torch.isreal, + torch.Tensor.kron, + torch.kron, + torch.Tensor.lcm, + torch.lcm, + torch.Tensor.ldexp, + torch.ldexp, + torch.Tensor.le, + torch.le, + torch.nn.functional.leaky_relu, + torch.Tensor.lerp, + torch.lerp, + torch.Tensor.lgamma, + torch.lgamma, + torch.Tensor.log, + torch.log, + torch.Tensor.log10, + torch.log10, + torch.Tensor.log1p, + torch.log1p, + torch.Tensor.log2, + torch.log2, + torch.nn.functional.logsigmoid, + torch.Tensor.logical_and, + torch.logical_and, + torch.Tensor.logical_not, + torch.logical_not, + torch.Tensor.logical_or, + torch.logical_or, + torch.Tensor.logical_xor, + torch.logical_xor, + torch.Tensor.logit, + torch.logit, + torch.Tensor.lt, + torch.lt, + torch.Tensor.maximum, + torch.maximum, + torch.Tensor.minimum, + torch.minimum, + torch.nn.functional.mish, + torch.Tensor.mvlgamma, + torch.mvlgamma, + torch.Tensor.nan_to_num, + torch.nan_to_num, + torch.Tensor.ne, + torch.ne, + torch.Tensor.neg, + torch.neg, + torch.Tensor.nextafter, + torch.nextafter, + torch.Tensor.outer, + torch.outer, + torch.polar, + torch.Tensor.polygamma, + torch.polygamma, + torch.Tensor.positive, + torch.positive, + torch.Tensor.pow, + torch.pow, + torch.Tensor.prelu, + torch.prelu, + torch.nn.functional.prelu, + torch.Tensor.rad2deg, + torch.rad2deg, + torch.Tensor.reciprocal, + torch.reciprocal, + torch.Tensor.relu, + torch.relu, + torch.nn.functional.relu, + torch.nn.functional.relu6, + torch.Tensor.remainder, + torch.remainder, + torch.Tensor.round, + torch.round, + torch.rrelu, + torch.nn.functional.rrelu, + torch.Tensor.rsqrt, + torch.rsqrt, + torch.rsub, + torch.selu, + torch.nn.functional.selu, + torch.Tensor.sgn, + torch.sgn, + torch.Tensor.sigmoid, + torch.sigmoid, + torch.nn.functional.sigmoid, + torch.Tensor.sign, + torch.sign, + torch.Tensor.signbit, + torch.signbit, + torch.nn.functional.silu, + torch.Tensor.sin, + torch.sin, + torch.Tensor.sinc, + torch.sinc, + torch.Tensor.sinh, + torch.sinh, + torch.nn.functional.softplus, + torch.nn.functional.softshrink, + torch.Tensor.sqrt, + torch.sqrt, + torch.Tensor.square, + torch.square, + torch.Tensor.sub, + 
torch.sub, + torch.Tensor.tan, + torch.tan, + torch.Tensor.tanh, + torch.tanh, + torch.nn.functional.tanh, + torch.threshold, + torch.nn.functional.threshold, + torch.trapz, + torch.Tensor.true_divide, + torch.true_divide, + torch.Tensor.trunc, + torch.trunc, + torch.Tensor.xlogy, + torch.xlogy, + torch.rand_like, +) diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/reference.py b/.venv/lib/python3.11/site-packages/functorch/dim/reference.py new file mode 100644 index 0000000000000000000000000000000000000000..ed7275bf666addb09bb8f4ae498df6de91d91273 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/dim/reference.py @@ -0,0 +1,645 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# reference python implementations for C ops +import torch +from functorch._C import dim as _C + +from . import op_properties +from .batch_tensor import _enable_layers +from .tree_map import tree_flatten, tree_map + + +DimList = _C.DimList +import operator +from functools import reduce + + +# use dict to avoid writing C++ bindings for set +pointwise = set(op_properties.pointwise) + + +def prod(x): + return reduce(operator.mul, x, 1) + + +def _wrap_dim(d, N, keepdim): + from . import Dim + + if isinstance(d, Dim): + assert not keepdim, "cannot preserve first-class dimensions with keepdim=True" + return d + elif d >= 0: + return d - N + else: + return d + + +def _dims(d, N, keepdim, single_dim): + from . import Dim + + if isinstance(d, (Dim, int)): + return ltuple((_wrap_dim(d, N, keepdim),)) + assert not single_dim, f"expected a single dimension or int but found: {d}" + return ltuple(_wrap_dim(x, N, keepdim) for x in d) + + +def _bind_dims_to_size(lhs_size, rhs, lhs_debug): + from . import DimensionMismatchError + + not_bound = tuple((i, r) for i, r in enumerate(rhs) if not r.is_bound) + if len(not_bound) == 1: + idx, d = not_bound[0] + rhs_so_far = prod(r.size for r in rhs if r.is_bound) + if lhs_size % rhs_so_far != 0: + rhs_s = tuple("?" if not r.is_bound else str(r.size) for r in rhs) + raise DimensionMismatchError( + f"inferred dimension does not evenly fit into larger dimension: {lhs_size} vs {rhs_s}" + ) + new_size = lhs_size // rhs_so_far + d.size = new_size + elif len(not_bound) > 1: + rhs_s = tuple("?" if not r.is_bound else str(r.size) for r in rhs) + raise DimensionMismatchError( + f"cannot infer the size of two dimensions at once: {rhs} with sizes {rhs_s}" + ) + else: + rhs_size = prod(r.size for r in rhs) + if lhs_size != rhs_size: + raise DimensionMismatchError( + f"Dimension sizes to do not match ({lhs_size} != {rhs_size}) when matching {lhs_debug} to {rhs}" + ) + + +def _tensor_levels(inp): + from . 
import _Tensor + + if isinstance(inp, _Tensor): + return inp._tensor, llist(inp._levels), inp._has_device + else: + return inp, llist(range(-inp.ndim, 0)), True + + +def _match_levels(v, from_levels, to_levels): + view = [] + permute = [] + requires_view = False + size = v.size() + for t in to_levels: + try: + idx = from_levels.index(t) + permute.append(idx) + view.append(size[idx]) + except ValueError: + view.append(1) + requires_view = True + if permute != list(range(len(permute))): + v = v.permute(*permute) + if requires_view: + v = v.view(*view) + return v + + +# make a single dimension positional but do not permute it, +# used to do multi-tensor operators where the dim being acted on +# should not physically move if possible +def _positional_no_permute(self, dim, expand_dim=False): + from . import Tensor + + ptensor, levels = self._tensor, llist(self._levels) + try: + idx = levels.index(dim) + except ValueError: + if not expand_dim: + raise + idx = 0 + ptensor = ptensor.expand(dim.size, *ptensor.size()) + levels.insert(0, 0) + idx_batched = 0 + for i in range(idx): + if isinstance(levels[i], int): + levels[i] -= 1 + idx_batched += 1 + levels[idx] = -idx_batched - 1 + return Tensor.from_positional(ptensor, levels, self._has_device), idx_batched + + +def seq(a, b): + from . import Dim + + if isinstance(a, Dim) != isinstance(b, Dim): + return False + if isinstance(a, Dim): + return a is b + else: + return a == b + + +class isin: + def __contains__(self, item): + for x in self: + if seq(item, x): + return True + return False + + def index(self, item): + for i, x in enumerate(self): + if seq(item, x): + return i + raise ValueError + + +class llist(isin, list): + pass + + +class ltuple(isin, tuple): + pass + + +empty_dict = {} + + +@classmethod +def __torch_function__(self, orig, cls, args, kwargs=empty_dict): + from . 
import _Tensor, Tensor, TensorLike + from .delayed_mul_tensor import DelayedMulTensor + + if orig is torch.Tensor.__mul__: + lhs, rhs = args + if ( + isinstance(lhs, _Tensor) + and isinstance(rhs, _Tensor) + and lhs.ndim == 0 + and rhs.ndim == 0 + ): + return DelayedMulTensor(lhs, rhs) + all_dims = llist() + flat_args, unflatten = tree_flatten((args, kwargs)) + device_holding_tensor = None + for f in flat_args: + if isinstance(f, _Tensor): + if f._has_device: + device_holding_tensor = f._batchtensor + for d in f.dims: + if d not in all_dims: + all_dims.append(d) + + def unwrap(t): + if isinstance(t, _Tensor): + r = t._batchtensor + if device_holding_tensor is not None and not t._has_device: + r = r.to(device=device_holding_tensor.device) + return r + return t + + if orig in pointwise: + result_levels = llist() + arg_levels = llist() + to_expand = [] + for i, f in enumerate(flat_args): + if isinstance(f, TensorLike): + ptensor, levels, _ = _tensor_levels(f) + if ( + isinstance(f, _Tensor) + and not f._has_device + and device_holding_tensor is not None + ): + ptensor = ptensor.to(device=device_holding_tensor.device) + flat_args[i] = ptensor + for l in levels: + if l not in result_levels: + result_levels.append(l) + to_expand.append((i, levels)) + + for i, levels in to_expand: + flat_args[i] = _match_levels(flat_args[i], levels, result_levels) + args, kwargs = unflatten(flat_args) + result = orig(*args, **kwargs) + + def wrap(t): + if isinstance(t, TensorLike): + return Tensor.from_positional( + t, result_levels, device_holding_tensor is not None + ) + return t + + return tree_map(wrap, result) + else: + + def wrap(t): + if isinstance(t, TensorLike): + return Tensor.from_batched(t, device_holding_tensor is not None) + return t + + with _enable_layers(all_dims): + print(f"batch_tensor for {orig}") + args, kwargs = unflatten(unwrap(f) for f in flat_args) + result = orig(*args, **kwargs) + # print("END", orig) + return tree_map(wrap, result) + + +def positional(self, *dims): + from . import Dim, DimensionBindError, Tensor + + ptensor, levels = self._tensor, llist(self._levels) + flat_dims = llist() + view = [] + needs_view = False + ndim = self.ndim + for d in dims: + if isinstance(d, DimList): + flat_dims.extend(d) + view.extend(e.size for e in d) + elif isinstance(d, Dim): + flat_dims.append(d) + view.append(d.size) + elif isinstance(d, int): + d = _wrap_dim(d, ndim, False) + flat_dims.append(d) + view.append(ptensor.size(d)) + else: + flat_dims.extend(d) + view.append(prod(e.size for e in d)) + needs_view = True + + permute = list(range(len(levels))) + nflat = len(flat_dims) + for i, d in enumerate(flat_dims): + try: + idx = levels.index(d) + except ValueError as e: + raise DimensionBindError( + f"tensor of dimensions {self.dims} does not contain dim {d}" + ) from e + p = permute[idx] + del levels[idx] + del permute[idx] + levels.insert(i, 0) + permute.insert(i, p) + ptensor = ptensor.permute(*permute) + seen = 0 + for i in range(len(levels) - 1, -1, -1): + if isinstance(levels[i], int): + seen += 1 + levels[i] = -seen + result = Tensor.from_positional(ptensor, levels, self._has_device) + if needs_view: + result = result.reshape(*view, *result.size()[len(flat_dims) :]) + return result + + +def _contains_dim(input): + from . 
import Dim + + for i in input: + if isinstance(i, Dim): + return True + + +def expand(self, *sizes): + if not _contains_dim(sizes): + return self.__torch_function__(torch.Tensor.expand, None, (self, *sizes)) + dims = sizes + sizes = [d.size for d in dims] + [-1] * self.ndim + self = self.expand(*sizes) + return self[dims] + + +_not_present = object() + + +def _getarg(name, offset, args, kwargs, default): + if len(args) > offset: + return args[offset] + return kwargs.get(name, default) + + +def _patcharg(name, offset, args, kwargs, value): + if len(args) > offset: + args[offset] = value + else: + kwargs[name] = value + + +def _wrap( + orig, dim_offset=0, keepdim_offset=1, dim_name="dim", single_dim=False, reduce=True +): + from . import Dim, Tensor, TensorLike + + def fn(self, *args, **kwargs): + dim = _getarg(dim_name, dim_offset, args, kwargs, _not_present) + if dim is _not_present or (single_dim and not isinstance(dim, Dim)): + with _enable_layers(self.dims): + print(f"dim fallback batch_tensor for {orig}") + return Tensor.from_batched( + orig(self._batchtensor, *args, **kwargs), self._has_device + ) + keepdim = ( + _getarg("keepdim", keepdim_offset, args, kwargs, False) if reduce else False + ) + t, levels = self._tensor, llist(self._levels) + dims = _dims(dim, self._batchtensor.ndim, keepdim, single_dim) + dim_indices = tuple(levels.index(d) for d in dims) + if reduce and not keepdim: + new_levels = [l for i, l in enumerate(levels) if i not in dim_indices] + else: + new_levels = levels + + if len(dim_indices) == 1: + dim_indices = dim_indices[ + 0 + ] # so that dims that really only take a single argument work... + args = list(args) + _patcharg(dim_name, dim_offset, args, kwargs, dim_indices) + + def wrap(t): + if isinstance(t, TensorLike): + return Tensor.from_positional(t, new_levels, self._has_device) + return t + + with _enable_layers(new_levels): + print(f"dim used batch_tensor for {orig}") + r = orig(t, *args, **kwargs) + return tree_map(wrap, r) + + return fn + + +def _def(name, *args, **kwargs): + from . import _Tensor + + orig = getattr(torch.Tensor, name) + setattr(_Tensor, name, _wrap(orig, *args, **kwargs)) + + +no_slice = slice(None) + +_orig_getitem = torch.Tensor.__getitem__ + + +class dim_tracker: + def __init__(self) -> None: + self.dims = llist() + self.count = [] + + def record(self, d): + if d not in self.dims: + self.dims.append(d) + self.count.append(1) + + def __getitem__(self, d): + return self.count[self.dims.index(d)] + + +def t__getitem__(self, input): + from . import _Tensor, Dim, DimensionBindError, DimList, Tensor, TensorLike + + # * bail to original example if we have a single non-Dim tensor, or a non-tensor + # * locate ... or an unbound tensor list, and determine its size, bind dim list + # (remember that None does not count to the total dim count) + # * bind simple dims and dim-packs to their sizes, count the number of uses of each dim, + # produce the re-view if needed + # * for each single-use dim index, replace with no_slice and mark that it will be added + # (keep track of whether we have to call super) + # * call super if needed + # * if we have dims to bind, bind them (it will help if we eliminated ... and None before) + # this handles bool indexing handling, as well as some other simple cases. + + is_simple = ( + not isinstance(input, Dim) + and not isinstance(input, (tuple, list)) + and + # WAR for functorch bug where zero time tensors in getitem are not handled correctly. 
+ not (isinstance(input, TensorLike) and input.ndim == 0) + ) + + if is_simple: + if isinstance(self, _Tensor): + return _Tensor.__torch_function__(_orig_getitem, None, (self, input)) + else: + return _orig_getitem(self, input) + + # can further optimize this case + if not isinstance(input, tuple): + input = [input] + else: + input = list(input) + + dims_indexed = 0 + expanding_object = None + dimlists = [] + for i, s in enumerate(input): + if s is ... or isinstance(s, DimList) and not s.is_bound: + if expanding_object is not None: + msg = ( + "at most one ... or unbound dimension list can exist in indexing list but" + f" found 2 at offsets {i} and {expanding_object}" + ) + raise DimensionBindError(msg) + expanding_object = i + + if isinstance(s, DimList): + dims_indexed += len(s) if s.is_bound else 0 + dimlists.append(i) + elif s is not None and s is not ...: + dims_indexed += 1 + + ndim = self.ndim + if dims_indexed > ndim: + raise IndexError( + f"at least {dims_indexed} indices were supplied but the tensor only has {ndim} dimensions." + ) + if expanding_object is not None: + expanding_ndims = ndim - dims_indexed + obj = input[expanding_object] + if obj is ...: + input[expanding_object : expanding_object + 1] = [ + no_slice + ] * expanding_ndims + else: + obj.bind_len(expanding_ndims) + # flatten the dimslists into the indexing + for i in reversed(dimlists): + input[i : i + 1] = input[i] + dims_indexed = 0 + requires_view = False + size = self.size() + view_sizes = [] + dims_seen = dim_tracker() + + def add_dims(t): + if not isinstance(t, _Tensor): + return + for d in t.dims: + dims_seen.record(d) + + add_dims(self) + dim_packs = [] + for i, idx in enumerate(input): + if idx is None: + input[i] = no_slice + view_sizes.append(1) + requires_view = True + else: + sz = size[dims_indexed] + if isinstance(idx, Dim): + idx.size = sz + dims_seen.record(idx) + view_sizes.append(sz) + elif isinstance(idx, (tuple, list)) and idx and isinstance(idx[0], Dim): + for d in idx: + dims_seen.record(idx) + _bind_dims_to_size(sz, idx, f"offset {i}") + view_sizes.extend(d.size for d in idx) + requires_view = True + dim_packs.append(i) + else: + add_dims(idx) + view_sizes.append(sz) + dims_indexed += 1 + if requires_view: + self = self.view(*view_sizes) + for i in reversed(dim_packs): + input[i : i + 1] = input[i] + + # currenty: + # input is flat, containing either Dim, or Tensor, or something valid for standard indexing + # self may have first-class dims as well. + + # to index: + # drop the first class dims from self, they just become direct indices of their positions + + # figure out the dimensions of the indexing tensors: union of all the dims in the tensors in the index. 
+ # these dimensions will appear and need to be bound at the first place tensor occures + + if isinstance(self, _Tensor): + ptensor_self, levels = self._tensor, list(self._levels) + # indices to ptensor rather than self which has first-class dimensions + input_it = iter(input) + flat_inputs = [next(input_it) if isinstance(l, int) else l for l in levels] + has_device = self._has_device + to_pad = 0 + else: + ptensor_self, flat_inputs = self, input + to_pad = ptensor_self.ndim - len(flat_inputs) + has_device = True + + result_levels = [] + index_levels = [] + tensor_insert_point = None + to_expand = {} + requires_getindex = False + for i, inp in enumerate(flat_inputs): + if isinstance(inp, Dim) and dims_seen[inp] == 1: + flat_inputs[i] = no_slice + result_levels.append(inp) + elif isinstance(inp, TensorLike): + requires_getindex = True + if tensor_insert_point is None: + tensor_insert_point = len(result_levels) + ptensor, levels, _ = _tensor_levels(inp) + to_expand[i] = levels + flat_inputs[i] = ptensor + for l in levels: + if l not in index_levels: + index_levels.append(l) + else: + requires_getindex = True + result_levels.append(0) + + if tensor_insert_point is not None: + result_levels[tensor_insert_point:tensor_insert_point] = index_levels + + for i, levels in to_expand.items(): + flat_inputs[i] = _match_levels(flat_inputs[i], levels, index_levels) + + if requires_getindex: + result = _orig_getitem(ptensor_self, flat_inputs) + else: + result = ptensor_self + + next_positional = -1 + if to_pad > 0: + result_levels.extend([0] * to_pad) + for i, r in enumerate(reversed(result_levels)): + if isinstance(r, int): + result_levels[-1 - i] = next_positional + next_positional -= 1 + + return Tensor.from_positional(result, result_levels, has_device) + + +# XXX - dim is optional and can be the outer-most dimension... +def stack(tensors, new_dim, dim=0, out=None): + if isinstance(dim, int): + return torch.stack(tensors, dim, out).index(dim, new_dim) + index = None + if out is not None: + out, index = _positional_no_permute(out, dim, expand_dim=True) + ptensors = [] + for t in tensors: + pt, pi = _positional_no_permute(t, dim, expand_dim=True) + if index is not None and pi != index: + pt = pt.move_dim(pi, index) + else: + index = pi + ptensors.append(pt) + pr = torch.stack(ptensors, index, out=out) + return pr.index((index, index + 1), (new_dim, dim)) + + +_orig_split = torch.Tensor.split + + +def split(self, split_size_or_sections, dim=0): + from . import _Tensor, Dim + + if isinstance(split_size_or_sections, int) or any( + isinstance(t, int) for t in split_size_or_sections + ): + if isinstance(dim, Dim): + raise ValueError( + "when dim is specified as a Dim object, split sizes must also be dimensions." 
+ ) + return _orig_split(self, split_size_or_sections, dim=dim) + + if isinstance(dim, Dim): + assert isinstance(self, _Tensor), f"Tensor does not have dimension {dim}" + self, dim = _positional_no_permute(self, dim) + + size = self.size(dim) + total_bound_size = 0 + unbound = [] + sizes = [] + for i, d in enumerate(split_size_or_sections): + if d.is_bound: + sizes.append(d.size) + total_bound_size += d.size + else: + sizes.append(0) + unbound.append(i) + + if unbound: + assert ( + total_bound_size <= size + ), f"result dimensions are larger than original: {total_bound_size} vs {size} ({split_size_or_sections})" + remaining_size = size - total_bound_size + chunk_size = -(-remaining_size // len(unbound)) + for u in unbound: + sz = min(chunk_size, remaining_size) + split_size_or_sections[u].size = sz + sizes[u] = sz + remaining_size -= sz + else: + assert ( + total_bound_size == size + ), f"result dimensions do not match original: {total_bound_size} vs {size} ({split_size_or_sections})" + return tuple( + t.index(dim, d) + for d, t in zip(split_size_or_sections, _orig_split(self, sizes, dim=dim)) + ) diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/tree_map.py b/.venv/lib/python3.11/site-packages/functorch/dim/tree_map.py new file mode 100644 index 0000000000000000000000000000000000000000..3d2eae0582c85619e0d28e648af58cbd847eee97 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/dim/tree_map.py @@ -0,0 +1,15 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from functorch._C import dim + + +tree_flatten = dim.tree_flatten + + +def tree_map(fn, tree): + vs, unflatten = tree_flatten(tree) + return unflatten(fn(v) for v in vs) diff --git a/.venv/lib/python3.11/site-packages/functorch/dim/wrap_type.py b/.venv/lib/python3.11/site-packages/functorch/dim/wrap_type.py new file mode 100644 index 0000000000000000000000000000000000000000..aae543b91a896e2d5cef3b03d725c3d91ce1305d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/dim/wrap_type.py @@ -0,0 +1,72 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
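The `tree_map` shown just above delegates flattening to the C extension (`functorch._C.dim.tree_flatten`). As a hedged sketch of the same flatten/apply/unflatten pattern, an equivalent can be written against `torch.utils._pytree`; this is illustrative only, not the library's actual implementation.

```python
# Rough analogue of functorch.dim.tree_map, sketched with torch.utils._pytree
# instead of the C extension's dim.tree_flatten (illustrative, not the real code).
import torch
import torch.utils._pytree as pytree


def tree_map_sketch(fn, tree):
    # Flatten the nested container, apply fn to every leaf, then rebuild it.
    leaves, spec = pytree.tree_flatten(tree)
    return pytree.tree_unflatten([fn(leaf) for leaf in leaves], spec)


nested = {"a": torch.ones(2), "b": (torch.zeros(3), torch.arange(4))}
doubled = tree_map_sketch(lambda t: t * 2, nested)
print(doubled["b"][1])  # tensor([0, 2, 4, 6])
```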
+ +from types import ( + BuiltinMethodType, + FunctionType, + GetSetDescriptorType, + MethodDescriptorType, + WrapperDescriptorType, +) + +from functorch._C import dim as _C + + +_wrap_method = _C._wrap_method + +FUNC_TYPES = ( + FunctionType, + MethodDescriptorType, + BuiltinMethodType, + WrapperDescriptorType, +) +PROPERTY_TYPES = (GetSetDescriptorType, property) + + +def _py_wrap_method(orig, __torch_function__): + def impl(*args, **kwargs): + return __torch_function__(orig, None, args, kwargs) + + return impl + + +def wrap_type(use_c, to_patch, pattern, __torch_function__): + if use_c: + wrap_method = _wrap_method + else: + wrap_method = _py_wrap_method + + all = {} + for t in reversed(pattern.mro()[:-1]): # skip object + all.update(t.__dict__) + + def wrap_attr(orig): + return property(wrap_method(orig.__get__, __torch_function__)) + + for name, obj in all.items(): + if name in ( + "__dict__", + "__new__", + "__init__", + "__repr__", + "__weakref__", + "__doc__", + "__module__", + "__dir__", + ): + continue + + # skip things that have been overloaded + # things that come from object like `__eq__` still need to be patched, however. + if hasattr(to_patch, name) and getattr(to_patch, name) is not getattr( + object, name, None + ): + continue + + if isinstance(obj, FUNC_TYPES): + setattr(to_patch, name, wrap_method(obj, __torch_function__)) + elif isinstance(obj, PROPERTY_TYPES): + setattr(to_patch, name, wrap_attr(obj)) diff --git a/.venv/lib/python3.11/site-packages/functorch/einops/__init__.py b/.venv/lib/python3.11/site-packages/functorch/einops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d7ac34f7a3722010fc0fde97fd1cd72e76fa88b7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/einops/__init__.py @@ -0,0 +1,4 @@ +from .rearrange import rearrange + + +__all__ = ["rearrange"] diff --git a/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a3a968777d1a2d6f3552c65162d971cce7da523 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/_parsing.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/_parsing.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e92b64e76742f2789e8553f0087807a34e6c19d2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/_parsing.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/rearrange.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/rearrange.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f02da3d167ce9e99a11ada4c8770715dfeca44d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/rearrange.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/einops/_parsing.py b/.venv/lib/python3.11/site-packages/functorch/einops/_parsing.py new file mode 100644 index 0000000000000000000000000000000000000000..ffb1fc00a20ee9923f63590f522c6dad206f942b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/einops/_parsing.py @@ -0,0 +1,303 @@ +"""Adapted from 
https://github.com/arogozhnikov/einops/blob/36c7bb16e57d6e57f8f3050f9e07abdf3f00469f/einops/parsing.py. + +MIT License + +Copyright (c) 2018 Alex Rogozhnikov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +from __future__ import annotations + +import keyword +import warnings +from typing import Collection, List, Mapping, Optional, Set, Tuple, Union + + +_ellipsis: str = "\u2026" # NB, this is a single unicode symbol. String is used as it is not a list, but can be iterated + + +class AnonymousAxis: + """Used by `ParsedExpression` to represent an axis with a size (> 1), but no associated identifier. + + Note: Different instances of this class are not equal to each other, even if they have the same value. + """ + + def __init__(self, value: str) -> None: + self.value = int(value) + if self.value < 1: + raise ValueError( + f"Anonymous axis should have positive length, not {self.value}" + ) + + def __repr__(self) -> str: + return f"{self.value}-axis" + + +class ParsedExpression: + """Structure containing information about one side of an `einops`-style pattern (e.g. 'b c (h w)').""" + + def __init__( + self, + expression: str, + *, + allow_underscore: bool = False, + allow_duplicates: bool = False, + ) -> None: + """Parse the expression and store relevant metadata. + + Args: + expression (str): the `einops`-pattern to parse + allow_underscore (bool): whether to allow axis identifier names to begin with an underscore + allow_duplicates (bool): whether to allow an identifier to appear more than once in the expression + """ + self.has_ellipsis: bool = False + self.has_ellipsis_parenthesized: Optional[bool] = None + self.identifiers: Set[Union[str, AnonymousAxis]] = set() + # that's axes like 2, 3, 4 or 5. Axes with size 1 are exceptional and replaced with empty composition + self.has_non_unitary_anonymous_axes: bool = False + # composition keeps structure of composite axes, see how different corner cases are handled in tests + self.composition: List[Union[List[Union[str, AnonymousAxis]], str]] = [] + if "." in expression: + if "..." 
not in expression: + raise ValueError( + "Expression may contain dots only inside ellipsis (...)" + ) + if str.count(expression, "...") != 1 or str.count(expression, ".") != 3: + raise ValueError( + "Expression may contain dots only inside ellipsis (...); only one ellipsis for tensor " + ) + expression = expression.replace("...", _ellipsis) + self.has_ellipsis = True + + bracket_group: Optional[List[Union[str, AnonymousAxis]]] = None + + def add_axis_name(x: str) -> None: + if x in self.identifiers: + if not (allow_underscore and x == "_") and not allow_duplicates: + raise ValueError( + f"Indexing expression contains duplicate dimension '{x}'" + ) + if x == _ellipsis: + self.identifiers.add(_ellipsis) + if bracket_group is None: + self.composition.append(_ellipsis) + self.has_ellipsis_parenthesized = False + else: + bracket_group.append(_ellipsis) + self.has_ellipsis_parenthesized = True + else: + is_number = str.isdecimal(x) + if is_number and int(x) == 1: + # handling the case of anonymous axis of length 1 + if bracket_group is None: + self.composition.append([]) + else: + pass # no need to think about 1s inside parenthesis + return + is_axis_name, reason = self.check_axis_name_return_reason( + x, allow_underscore=allow_underscore + ) + if not (is_number or is_axis_name): + raise ValueError(f"Invalid axis identifier: {x}\n{reason}") + axis_name: Union[str, AnonymousAxis] = ( + AnonymousAxis(x) if is_number else x + ) + self.identifiers.add(axis_name) + if is_number: + self.has_non_unitary_anonymous_axes = True + if bracket_group is None: + self.composition.append([axis_name]) + else: + bracket_group.append(axis_name) + + current_identifier = None + for char in expression: + if char in "() ": + if current_identifier is not None: + add_axis_name(current_identifier) + current_identifier = None + if char == "(": + if bracket_group is not None: + raise ValueError( + "Axis composition is one-level (brackets inside brackets not allowed)" + ) + bracket_group = [] + elif char == ")": + if bracket_group is None: + raise ValueError("Brackets are not balanced") + self.composition.append(bracket_group) + bracket_group = None + elif str.isalnum(char) or char in ["_", _ellipsis]: + if current_identifier is None: + current_identifier = char + else: + current_identifier += char + else: + raise ValueError(f"Unknown character '{char}'") + + if bracket_group is not None: + raise ValueError(f"Imbalanced parentheses in expression: '{expression}'") + if current_identifier is not None: + add_axis_name(current_identifier) + + @staticmethod + def check_axis_name_return_reason( + name: str, allow_underscore: bool = False + ) -> Tuple[bool, str]: + """Check if the given axis name is valid, and a message explaining why if not. + + Valid axes names are python identifiers except keywords, and should not start or end with an underscore. 
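To make the parsing rules above concrete (per-axis composition, size-1 anonymous axes collapsing to empty groups, ellipsis handling), here is what `ParsedExpression` yields for two small patterns. This is inferred from the code shown, not a documented contract.

```python
# Illustrative only: inspecting ParsedExpression output, based on the parsing
# code above rather than any documented API guarantee.
from functorch.einops._parsing import ParsedExpression

expr = ParsedExpression("b c (h w)")
print(expr.identifiers)  # {'b', 'c', 'h', 'w'}
print(expr.composition)  # [['b'], ['c'], ['h', 'w']]

expr2 = ParsedExpression("b 1 ...")
print(expr2.composition)                 # [['b'], [], '…']  (size-1 axis becomes an empty group)
print(expr2.has_ellipsis)                # True
print(expr2.has_ellipsis_parenthesized)  # False
```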
+ + Args: + name (str): the axis name to check + allow_underscore (bool): whether axis names are allowed to start with an underscore + + Returns: + Tuple[bool, str]: whether the axis name is valid, a message explaining why if not + """ + if not str.isidentifier(name): + return False, "not a valid python identifier" + elif name[0] == "_" or name[-1] == "_": + if name == "_" and allow_underscore: + return True, "" + return False, "axis name should should not start or end with underscore" + else: + if keyword.iskeyword(name): + warnings.warn( + f"It is discouraged to use axes names that are keywords: {name}", + RuntimeWarning, + ) + if name in ["axis"]: + warnings.warn( + "It is discouraged to use 'axis' as an axis name and will raise an error in future", + FutureWarning, + ) + return True, "" + + @staticmethod + def check_axis_name(name: str) -> bool: + """Check if the name is a valid axis name. + + Args: + name (str): the axis name to check + + Returns: + bool: whether the axis name is valid + """ + is_valid, _ = ParsedExpression.check_axis_name_return_reason(name) + return is_valid + + +def parse_pattern( + pattern: str, axes_lengths: Mapping[str, int] +) -> Tuple[ParsedExpression, ParsedExpression]: + """Parse an `einops`-style pattern into a left-hand side and right-hand side `ParsedExpression` object. + + Args: + pattern (str): the `einops`-style rearrangement pattern + axes_lengths (Mapping[str, int]): any additional length specifications for dimensions + + Returns: + Tuple[ParsedExpression, ParsedExpression]: a tuple containing the left-hand side and right-hand side expressions + """ + # adapted from einops.einops._prepare_transformation_recipe + # https://github.com/arogozhnikov/einops/blob/230ac1526c1f42c9e1f7373912c7f8047496df11/einops/einops.py + try: + left_str, right_str = pattern.split("->") + except ValueError: + raise ValueError("Pattern must contain a single '->' separator") from None + + if _ellipsis in axes_lengths: + raise ValueError(f"'{_ellipsis}' is not an allowed axis identifier") + + left = ParsedExpression(left_str) + right = ParsedExpression(right_str) + + if not left.has_ellipsis and right.has_ellipsis: + raise ValueError( + f"Ellipsis found in right side, but not left side of a pattern {pattern}" + ) + if left.has_ellipsis and left.has_ellipsis_parenthesized: + raise ValueError( + f"Ellipsis is parenthesis in the left side is not allowed: {pattern}" + ) + + return left, right + + +def validate_rearrange_expressions( + left: ParsedExpression, right: ParsedExpression, axes_lengths: Mapping[str, int] +) -> None: + """Perform expression validations that are specific to the `rearrange` operation. 
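The axis-name rules implemented above (a valid Python identifier, no leading or trailing underscore, keywords merely discouraged) can be exercised directly; a small illustrative check:

```python
# Illustrative checks of the axis-name validation shown above.
from functorch.einops._parsing import ParsedExpression

print(ParsedExpression.check_axis_name("height"))   # True
print(ParsedExpression.check_axis_name("_height"))  # False: leading underscore
print(ParsedExpression.check_axis_name("2d"))       # False: not a Python identifier
```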
+ + Args: + left (ParsedExpression): left-hand side expression + right (ParsedExpression): right-hand side expression + axes_lengths (Mapping[str, int]): any additional length specifications for dimensions + """ + for length in axes_lengths.values(): + if (length_type := type(length)) is not int: + raise TypeError( + f"rearrange axis lengths must be integers, got: {length_type}" + ) + + if left.has_non_unitary_anonymous_axes or right.has_non_unitary_anonymous_axes: + raise ValueError("rearrange only supports unnamed axes of size 1") + + difference = set.symmetric_difference(left.identifiers, right.identifiers) + if len(difference) > 0: + raise ValueError( + f"Identifiers only on one side of rearrange expression (should be on both): {difference}" + ) + + unmatched_axes = axes_lengths.keys() - left.identifiers + if len(unmatched_axes) > 0: + raise ValueError( + f"Identifiers not found in rearrange expression: {unmatched_axes}" + ) + + +def comma_separate(collection: Collection[Union[str, Collection[str]]]) -> str: + """Convert a collection of strings representing first class dims into a comma-separated string. + + Args: + collection (Collection[Union[str, Collection[str]]]): the collection of strings to convert + + Returns: + str: the comma-separated string + + Examples: + >>> comma_separate(('d0',)) + 'd0' + + >>> comma_separate(('d0', 'd1', 'd2', 'd3')) + 'd0, d1, d2, d3' + + >>> comma_separate([('d1', 'd4')]) + '(d1, d4)' + + >>> comma_separate([('d0',), (), ('d1',), ('d2',), ('d3', 'd4')]) + '(d0,), (), (d1,), (d2,), (d3, d4)' + """ + return ", ".join( + item + if isinstance(item, str) + else f"({comma_separate(item)}{',' if len(item) == 1 else ''})" + for item in collection + ) diff --git a/.venv/lib/python3.11/site-packages/functorch/einops/rearrange.py b/.venv/lib/python3.11/site-packages/functorch/einops/rearrange.py new file mode 100644 index 0000000000000000000000000000000000000000..1cd3cd8b3cf64f10b2003ebea441ad6a859ee9f4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/einops/rearrange.py @@ -0,0 +1,208 @@ +from __future__ import annotations + +import functools +from typing import Callable, Dict, List, Sequence, Tuple, Union + +import torch +from functorch._C import dim as _C + +from ._parsing import ( + _ellipsis, + AnonymousAxis, + comma_separate, + parse_pattern, + validate_rearrange_expressions, +) + + +__all__ = ["rearrange"] + +dims = _C.dims + + +@functools.lru_cache(256) +def _create_rearrange_callable( + tensor_ndim: int, pattern: str, **axes_lengths: int +) -> Callable[[torch.Tensor], torch.Tensor]: + r"""Translate an `einops`-style pattern into a callable that performs the rearrange using first-class dimensions. + + Since the an equivalent result is computed for tensors with the same number of dimensions, with the same pattern and + specified axes lengths, this function can be memoized. 
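A hedged sketch of the memoization point made here: because the generated callable depends only on the tensor rank, the pattern, and the keyword axis lengths, `functools.lru_cache` returns the same compiled callable for repeated identical requests. `_create_rearrange_callable` is an internal helper, so this is illustrative rather than supported usage.

```python
# Illustrative: identical (ndim, pattern, lengths) requests hit the lru_cache
# above and reuse the same generated callable. Internal helper, not public API.
import torch
from functorch.einops.rearrange import _create_rearrange_callable

f1 = _create_rearrange_callable(4, "b h w c -> b c h w")
f2 = _create_rearrange_callable(4, "b h w c -> b c h w")
assert f1 is f2  # cache hit: the same callable object is returned

x = torch.randn(2, 30, 40, 3)
print(f1(x).shape)  # torch.Size([2, 3, 30, 40])
```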
+ + Args: + tensor_ndim (int): the number of dimensions in the tensor to rearrange + pattern (str): the `einops`-style rearrangement pattern + axes_lengths (int): any additional length specifications for dimensions + + Returns: + Callable[[torch.Tensor], torch.Tensor]: a callable that performs the rearrangement + """ + left, right = parse_pattern(pattern, axes_lengths) + validate_rearrange_expressions(left, right, axes_lengths) + + n_anon_dims = sum(not dim for dim in left.composition) + if left.has_ellipsis: + n_ellipsis_dims = tensor_ndim - (len(left.composition) - 1) + n_named_dims = len(left.identifiers) - 1 + + if (pattern_ndim := n_anon_dims + n_named_dims) > tensor_ndim: + raise ValueError( + f"Number of dimensions in pattern ({pattern_ndim}) must be less than or equal to the number of " + f"dimensions in the tensor ({tensor_ndim})" + ) + else: + n_ellipsis_dims = 0 + n_named_dims = len(left.identifiers) + + if (pattern_ndim := len(left.composition)) != tensor_ndim: + raise ValueError( + f"Number of dimensions in pattern ({pattern_ndim}) must be equal to the number of dimensions in " + f"the tensor ({tensor_ndim})" + ) + n_dims = n_named_dims + n_ellipsis_dims + n_anon_dims + + if n_dims == 0: + # an identity rearrangement on a 0-dimension tensor + return lambda tensor: tensor + + first_class_dims: Tuple[str, ...] = tuple(f"d{i}" for i in range(n_dims)) + identifier_dim_map: Dict[Union[str, AnonymousAxis], Tuple[str, ...]] = {} + anon_axes: List[AnonymousAxis] = [] + + # map the left-hand side identifiers to strings representing first class dims + dims_i = 0 + for dimension in left.composition: + if isinstance(dimension, list): + for identifier in dimension: + # non-unitary anon axes are not allowed in rearrange & unitary anon axes are represented as empty lists + assert isinstance(identifier, str) + identifier_dim_map[identifier] = (first_class_dims[dims_i],) + dims_i += 1 + if not dimension: + # unitary anonymous axis + anon_axis = AnonymousAxis("1") + identifier_dim_map[anon_axis] = (first_class_dims[dims_i],) + anon_axes.append(anon_axis) + dimension.append(anon_axis) + dims_i += 1 + elif dimension == _ellipsis: + identifier = _ellipsis + identifier_dim_map[identifier] = tuple( + first_class_dims[dims_i + j] for j in range(n_ellipsis_dims) + ) + dims_i += n_ellipsis_dims + else: + raise ValueError(f"Unexpected dimension: {dimension}") + + def composition_to_dims( + composition: Sequence[Union[List[Union[str, AnonymousAxis]], str]] + ) -> List[Union[str, Tuple[str, ...]]]: + """Convert a `ParsedExpression.composition` into a `Tensor.__getitem__` index of strings representing first + class dims.""" + dim_composition: List[Union[str, Tuple[str, ...]]] = [] + for dimension in composition: + if isinstance(dimension, list): + dim_composition.append( + tuple( + dim + for identifier in dimension + for dim in identifier_dim_map[identifier] + ) + ) + elif dimension == _ellipsis: + dim_composition.extend(identifier_dim_map[_ellipsis]) + else: + raise ValueError(f"Unexpected dimension: {dimension}") + return dim_composition + + left_dims = composition_to_dims(left.composition) + right_dims = composition_to_dims(right.composition) + anon_dims = tuple(identifier_dim_map[axis][0] for axis in anon_axes) + specified_lengths = tuple( + (identifier_dim_map[axis][0], length) for axis, length in axes_lengths.items() + ) + + custom_rearrange_callable_name = "do_rearrange" + custom_rearrange_callable_code = ( + ( + f"def {custom_rearrange_callable_name}(tensor):\n" + f" 
{comma_separate(first_class_dims)} = dims({n_dims})\n" + ) + + ( + "".join( + f" {dim}.size = {length}\n" for (dim, length) in specified_lengths + ) + if specified_lengths + else "" + ) + + f" tensor = tensor[{comma_separate(left_dims)}].order({comma_separate(right_dims)})\n" + + ( + f" return tensor.sum({comma_separate([anon_dims])}, keepdim=False)\n" + if anon_dims + else " return tensor\n" + ) + ) + + exec(custom_rearrange_callable_code) + return locals()[custom_rearrange_callable_name] + + +def rearrange( + tensor: Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor, ...]], + pattern: str, + **axes_lengths: int, +) -> torch.Tensor: + r"""A native implementation of `einops.rearrange`, a reader-friendly smart element reordering for multidimensional + tensors. This operation includes functionality of transpose (axes permutation), reshape (view), squeeze, unsqueeze, + stack, concatenate and other operations. + + See: https://einops.rocks/api/rearrange/ + + Args: + tensor (Tensor or sequence of Tensor): the tensor(s) to rearrange + pattern (str): the rearrangement pattern + axes_lengths (int): any additional length specifications for dimensions + + Returns: + Tensor: the rearranged tensor + + Examples: + >>> # suppose we have a set of 32 images in "h w c" format (height-width-channel) + >>> images = torch.randn((32, 30, 40, 3)) + + >>> # stack along first (batch) axis, output is a single array + >>> rearrange(images, 'b h w c -> b h w c').shape + torch.Size([32, 30, 40, 3]) + + >>> # concatenate images along height (vertical axis), 960 = 32 * 30 + >>> rearrange(images, 'b h w c -> (b h) w c').shape + torch.Size([960, 40, 3]) + + >>> # concatenated images along horizontal axis, 1280 = 32 * 40 + >>> rearrange(images, 'b h w c -> h (b w) c').shape + torch.Size([30, 1280, 3]) + + >>> # reordered axes to "b c h w" format for deep learning + >>> rearrange(images, 'b h w c -> b c h w').shape + torch.Size([32, 3, 30, 40]) + + >>> # flattened each image into a vector, 3600 = 30 * 40 * 3 + >>> rearrange(images, 'b h w c -> b (c h w)').shape + torch.Size([32, 3600]) + + >>> # split each image into 4 smaller (top-left, top-right, bottom-left, bottom-right), 128 = 32 * 2 * 2 + >>> rearrange(images, 'b (h1 h) (w1 w) c -> (b h1 w1) h w c', h1=2, w1=2).shape + torch.Size([128, 15, 20, 3]) + + >>> # space-to-depth operation + >>> rearrange(images, 'b (h h1) (w w1) c -> b h w (c h1 w1)', h1=2, w1=2).shape + torch.Size([32, 15, 20, 12]) + """ + if not isinstance(tensor, torch.Tensor): + tensor = torch.stack(tensor) + + rearrange_callable = _create_rearrange_callable( + tensor.ndim, pattern, **axes_lengths + ) + + return rearrange_callable(tensor) diff --git a/.venv/lib/python3.11/site-packages/functorch/experimental/__init__.py b/.venv/lib/python3.11/site-packages/functorch/experimental/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3941f6d96e1f6df532966d06de4f72a952a58465 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/experimental/__init__.py @@ -0,0 +1,5 @@ +# PyTorch forward-mode is not mature yet +from functorch import functionalize +from torch._functorch.apis import chunk_vmap +from torch._functorch.batch_norm_replacement import replace_all_batch_norm_modules_ +from torch._functorch.eager_transforms import hessian, jacfwd, jvp diff --git a/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/__init__.cpython-311.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..7717c29297ab83877ac7bc79f3c17c35b66e7eb5 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/control_flow.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/control_flow.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7679cd16f50b3b97364e75b0212796213bf12ba Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/control_flow.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/ops.cpython-311.pyc b/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/ops.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57fbea5122ae242722c00e31d4361698aa5ad207 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/ops.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/functorch/experimental/control_flow.py b/.venv/lib/python3.11/site-packages/functorch/experimental/control_flow.py new file mode 100644 index 0000000000000000000000000000000000000000..cbfd76d184cc224042a1ee19454d6957c2221b3b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/experimental/control_flow.py @@ -0,0 +1,7 @@ +from torch import cond # noqa: F401 +from torch._higher_order_ops.cond import UnsupportedAliasMutationException # noqa: F401 +from torch._higher_order_ops.map import ( # noqa: F401 + _stack_pytree, + _unstack_pytree, + map, +) diff --git a/.venv/lib/python3.11/site-packages/functorch/experimental/ops.py b/.venv/lib/python3.11/site-packages/functorch/experimental/ops.py new file mode 100644 index 0000000000000000000000000000000000000000..7a502ef2b002cd824e7b67d08fccac872b313110 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/functorch/experimental/ops.py @@ -0,0 +1 @@ +from torch._ops import HigherOrderOperator # noqa: F401
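For context on the `control_flow` re-exports above, a minimal sketch of `cond` usage follows, assuming a PyTorch build where `torch.cond` can be called eagerly; both branches must take the same operands and return results of matching structure.

```python
# Minimal sketch of the re-exported cond (torch.cond); assumes eager torch.cond
# is available in this build. Both branches receive the same operands tuple.
import torch
from functorch.experimental.control_flow import cond


def true_fn(x):
    return x.sin()


def false_fn(x):
    return x.cos()


x = torch.randn(4)
out = cond(x.sum() > 0, true_fn, false_fn, (x,))
print(out.shape)  # torch.Size([4])
```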