koichi12 commited on
Commit
3914b7f
·
verified ·
1 Parent(s): 2311118

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/fsspec/__init__.py +69 -0
  2. .venv/lib/python3.11/site-packages/fsspec/_version.py +16 -0
  3. .venv/lib/python3.11/site-packages/fsspec/archive.py +75 -0
  4. .venv/lib/python3.11/site-packages/fsspec/asyn.py +1098 -0
  5. .venv/lib/python3.11/site-packages/fsspec/caching.py +966 -0
  6. .venv/lib/python3.11/site-packages/fsspec/callbacks.py +324 -0
  7. .venv/lib/python3.11/site-packages/fsspec/compression.py +175 -0
  8. .venv/lib/python3.11/site-packages/fsspec/config.py +131 -0
  9. .venv/lib/python3.11/site-packages/fsspec/conftest.py +55 -0
  10. .venv/lib/python3.11/site-packages/fsspec/core.py +743 -0
  11. .venv/lib/python3.11/site-packages/fsspec/dircache.py +98 -0
  12. .venv/lib/python3.11/site-packages/fsspec/exceptions.py +18 -0
  13. .venv/lib/python3.11/site-packages/fsspec/fuse.py +324 -0
  14. .venv/lib/python3.11/site-packages/fsspec/generic.py +411 -0
  15. .venv/lib/python3.11/site-packages/fsspec/gui.py +416 -0
  16. .venv/lib/python3.11/site-packages/fsspec/implementations/arrow.py +304 -0
  17. .venv/lib/python3.11/site-packages/fsspec/implementations/dask.py +152 -0
  18. .venv/lib/python3.11/site-packages/fsspec/implementations/dbfs.py +467 -0
  19. .venv/lib/python3.11/site-packages/fsspec/implementations/dirfs.py +384 -0
  20. .venv/lib/python3.11/site-packages/fsspec/implementations/jupyter.py +124 -0
  21. .venv/lib/python3.11/site-packages/fsspec/implementations/local.py +476 -0
  22. .venv/lib/python3.11/site-packages/fsspec/implementations/reference.py +1306 -0
  23. .venv/lib/python3.11/site-packages/fsspec/implementations/sftp.py +180 -0
  24. .venv/lib/python3.11/site-packages/fsspec/implementations/tar.py +124 -0
  25. .venv/lib/python3.11/site-packages/fsspec/implementations/webhdfs.py +485 -0
  26. .venv/lib/python3.11/site-packages/fsspec/json.py +121 -0
  27. .venv/lib/python3.11/site-packages/fsspec/mapping.py +251 -0
  28. .venv/lib/python3.11/site-packages/fsspec/parquet.py +541 -0
  29. .venv/lib/python3.11/site-packages/fsspec/registry.py +315 -0
  30. .venv/lib/python3.11/site-packages/fsspec/spec.py +2242 -0
  31. .venv/lib/python3.11/site-packages/fsspec/transaction.py +90 -0
  32. .venv/lib/python3.11/site-packages/fsspec/utils.py +739 -0
  33. .venv/lib/python3.11/site-packages/functorch/__pycache__/__init__.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/functorch/_src/__init__.py +0 -0
  35. .venv/lib/python3.11/site-packages/functorch/_src/__pycache__/__init__.cpython-311.pyc +0 -0
  36. .venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__init__.py +8 -0
  37. .venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__pycache__/__init__.cpython-311.pyc +0 -0
  38. .venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__init__.py +7 -0
  39. .venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__pycache__/__init__.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/functorch/_src/make_functional/__init__.py +4 -0
  41. .venv/lib/python3.11/site-packages/functorch/_src/make_functional/__pycache__/__init__.cpython-311.pyc +0 -0
  42. .venv/lib/python3.11/site-packages/functorch/_src/vmap/__init__.py +16 -0
  43. .venv/lib/python3.11/site-packages/functorch/_src/vmap/__pycache__/__init__.cpython-311.pyc +0 -0
  44. .venv/lib/python3.11/site-packages/functorch/compile/__init__.py +30 -0
  45. .venv/lib/python3.11/site-packages/functorch/compile/__pycache__/__init__.cpython-311.pyc +0 -0
  46. .venv/lib/python3.11/site-packages/functorch/dim/__init__.py +181 -0
  47. .venv/lib/python3.11/site-packages/functorch/dim/__pycache__/__init__.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/functorch/dim/__pycache__/batch_tensor.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/functorch/dim/__pycache__/delayed_mul_tensor.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/functorch/dim/__pycache__/dim.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/fsspec/__init__.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from importlib.metadata import entry_points
2
+
3
+ from . import caching
4
+ from ._version import __version__ # noqa: F401
5
+ from .callbacks import Callback
6
+ from .compression import available_compressions
7
+ from .core import get_fs_token_paths, open, open_files, open_local, url_to_fs
8
+ from .exceptions import FSTimeoutError
9
+ from .mapping import FSMap, get_mapper
10
+ from .registry import (
11
+ available_protocols,
12
+ filesystem,
13
+ get_filesystem_class,
14
+ register_implementation,
15
+ registry,
16
+ )
17
+ from .spec import AbstractFileSystem
18
+
19
+ __all__ = [
20
+ "AbstractFileSystem",
21
+ "FSTimeoutError",
22
+ "FSMap",
23
+ "filesystem",
24
+ "register_implementation",
25
+ "get_filesystem_class",
26
+ "get_fs_token_paths",
27
+ "get_mapper",
28
+ "open",
29
+ "open_files",
30
+ "open_local",
31
+ "registry",
32
+ "caching",
33
+ "Callback",
34
+ "available_protocols",
35
+ "available_compressions",
36
+ "url_to_fs",
37
+ ]
38
+
39
+
40
def process_entries():
    """Register filesystem implementations advertised via the
    ``fsspec.specs`` entry-point group.

    The first entry seen for a given name wins; later duplicates are
    skipped. Registration passes clobber=True so these implementations
    override any same-named ones already in the registry.
    """
    if entry_points is None:
        return
    try:
        eps = entry_points()
    except TypeError:
        return  # importlib-metadata < 0.8
    if hasattr(eps, "select"):  # Python 3.10+ / importlib_metadata >= 3.9.0
        specs = eps.select(group="fsspec.specs")
    else:
        specs = eps.get("fsspec.specs", [])
    seen = set()
    for spec in specs:
        name = spec.name
        if name in seen:
            continue
        seen.add(name)
        register_implementation(
            name,
            spec.value.replace(":", "."),
            errtxt=f"Unable to load filesystem from {spec}",
            # We take our implementations as the ones to overload with if
            # for some reason we encounter some, may be the same, already
            # registered
            clobber=True,
        )


process_entries()
.venv/lib/python3.11/site-packages/fsspec/_version.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# file generated by setuptools_scm
# don't change, don't track in version control

# TYPE_CHECKING is hard-coded False so the typing import never runs at
# runtime; static type checkers still see VERSION_TUPLE as a real alias.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple, Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
else:
    VERSION_TUPLE = object

# Annotations only; the actual values are assigned below.
version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE

__version__ = version = '2025.2.0'
__version_tuple__ = version_tuple = (2025, 2, 0)
.venv/lib/python3.11/site-packages/fsspec/archive.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import operator
2
+
3
+ from fsspec import AbstractFileSystem
4
+ from fsspec.utils import tokenize
5
+
6
+
7
class AbstractArchiveFileSystem(AbstractFileSystem):
    """
    A generic superclass for implementing Archive-based filesystems.

    Currently, it is shared amongst
    :class:`~fsspec.implementations.zip.ZipFileSystem`,
    :class:`~fsspec.implementations.libarchive.LibArchiveFileSystem` and
    :class:`~fsspec.implementations.tar.TarFileSystem`.

    Subclasses are expected to provide ``self.fo``, ``self.dir_cache`` and
    ``self._get_dirs()`` (not defined here) — TODO confirm against the
    concrete implementations.
    """

    def __str__(self):
        # Identity-based representation; archive FSs are not meaningfully
        # comparable by path alone.
        return f"<Archive-like object {type(self).__name__} at {id(self)}>"

    __repr__ = __str__

    def ukey(self, path):
        # Unique key combines the path with the archive handle and protocol,
        # so the same member in two archives hashes differently.
        return tokenize(path, self.fo, self.protocol)

    def _all_dirnames(self, paths):
        """Returns *all* directory names for each path in paths, including intermediate
        ones.

        Parameters
        ----------
        paths: Iterable of path strings
        """
        if len(paths) == 0:
            return set()

        # Parents of the given paths, excluding the archive root, then
        # recurse to collect every ancestor level.
        dirnames = {self._parent(path) for path in paths} - {self.root_marker}
        return dirnames | self._all_dirnames(dirnames)

    def info(self, path, **kwargs):
        """Return the cached entry for ``path``.

        Raises FileNotFoundError when the path is in neither plain nor
        trailing-slash form in the directory cache.
        """
        self._get_dirs()
        path = self._strip_protocol(path)
        if path in {"", "/"} and self.dir_cache:
            # Synthetic root entry; the archive itself has no record for it.
            return {"name": "", "type": "directory", "size": 0}
        if path in self.dir_cache:
            return self.dir_cache[path]
        elif path + "/" in self.dir_cache:
            # Some archive formats store directories with a trailing slash.
            return self.dir_cache[path + "/"]
        else:
            raise FileNotFoundError(path)

    def ls(self, path, detail=True, **kwargs):
        """List the direct children of ``path`` from the directory cache.

        Returns detail dicts sorted by name when ``detail`` is true,
        otherwise a sorted list of names.
        """
        self._get_dirs()
        paths = {}
        for p, f in self.dir_cache.items():
            p = p.rstrip("/")
            if "/" in p:
                root = p.rsplit("/", 1)[0]
            else:
                root = ""
            if root == path.rstrip("/"):
                # Direct child: keep its full info record.
                paths[p] = f
            elif all(
                (a == b)
                for a, b in zip(path.split("/"), [""] + p.strip("/").split("/"))
            ):
                # root directory entry: p lies under path but deeper; emit a
                # synthetic directory entry for the first path component.
                ppath = p.rstrip("/").split("/", 1)[0]
                if ppath not in paths:
                    out = {"name": ppath, "size": 0, "type": "directory"}
                    paths[ppath] = out
        if detail:
            out = sorted(paths.values(), key=operator.itemgetter("name"))
            return out
        else:
            return sorted(paths)
.venv/lib/python3.11/site-packages/fsspec/asyn.py ADDED
@@ -0,0 +1,1098 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import asyncio.events
3
+ import functools
4
+ import inspect
5
+ import io
6
+ import numbers
7
+ import os
8
+ import re
9
+ import threading
10
+ from contextlib import contextmanager
11
+ from glob import has_magic
12
+ from typing import TYPE_CHECKING, Iterable
13
+
14
+ from .callbacks import DEFAULT_CALLBACK
15
+ from .exceptions import FSTimeoutError
16
+ from .implementations.local import LocalFileSystem, make_path_posix, trailing_sep
17
+ from .spec import AbstractBufferedFile, AbstractFileSystem
18
+ from .utils import glob_translate, is_exception, other_paths
19
+
20
# Matches single-underscore (non-dunder) attribute names.
private = re.compile("_[^_]")
iothread = [None]  # dedicated fsspec IO thread (single-element mutable cell)
loop = [None]  # global event loop for any non-async instance
_lock = None  # global lock placeholder; allocated lazily by get_lock()
get_running_loop = asyncio.get_running_loop  # module-level alias
25
+
26
+
27
def get_lock():
    """Allocate or return a threading lock.

    The lock is allocated on first use to allow setting one lock per
    forked process.
    """
    global _lock
    if _lock is None:
        _lock = threading.Lock()
    return _lock
36
+
37
+
38
def reset_lock():
    """Reset the global lock.

    This should be called only on the init of a forked process to reset the
    lock to None, enabling the new forked process to get a new lock.
    Also clears the cached IO loop and its thread for the same reason.
    """
    global _lock
    _lock = None
    loop[0] = None
    iothread[0] = None
49
+
50
+
51
+ async def _runner(event, coro, result, timeout=None):
52
+ timeout = timeout if timeout else None # convert 0 or 0.0 to None
53
+ if timeout is not None:
54
+ coro = asyncio.wait_for(coro, timeout=timeout)
55
+ try:
56
+ result[0] = await coro
57
+ except Exception as ex:
58
+ result[0] = ex
59
+ finally:
60
+ event.set()
61
+
62
+
63
def sync(loop, func, *args, timeout=None, **kwargs):
    """
    Make loop run coroutine until it returns. Runs in other thread

    Examples
    --------
    >>> fsspec.asyn.sync(fsspec.asyn.get_loop(), func, *args,
    timeout=timeout, **kwargs)
    """
    timeout = timeout if timeout else None  # convert 0 or 0.0 to None
    # NB: if the loop is not running *yet*, it is OK to submit work
    # and we will wait for it
    if loop is None or loop.is_closed():
        raise RuntimeError("Loop is not running")
    try:
        # Guard against deadlock: submitting to the loop we are currently
        # running on and then blocking would never complete.
        loop0 = asyncio.events.get_running_loop()
        if loop0 is loop:
            raise NotImplementedError("Calling sync() from within a running loop")
    except NotImplementedError:
        raise
    except RuntimeError:
        # No running loop in this thread — the normal, safe case.
        pass
    coro = func(*args, **kwargs)
    result = [None]
    event = threading.Event()
    asyncio.run_coroutine_threadsafe(_runner(event, coro, result, timeout), loop)
    while True:
        # this loops allows thread to get interrupted
        if event.wait(1):
            break
        if timeout is not None:
            # Coarse 1-second countdown mirroring the event.wait(1) above.
            timeout -= 1
            if timeout < 0:
                raise FSTimeoutError

    return_result = result[0]
    if isinstance(return_result, asyncio.TimeoutError):
        # suppress asyncio.TimeoutError, raise FSTimeoutError
        raise FSTimeoutError from return_result
    elif isinstance(return_result, BaseException):
        raise return_result
    else:
        return return_result
106
+
107
+
108
def sync_wrapper(func, obj=None):
    """Given a function, make so can be called in blocking contexts

    Leave obj=None if defining within a class. Pass the instance if attaching
    as an attribute of the instance.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Either the explicitly bound instance, or the implicit `self`
        # when used as a method.
        target = obj or args[0]
        return sync(target.loop, func, *args, **kwargs)

    return wrapper
121
+
122
+
123
+ @contextmanager
124
+ def _selector_policy():
125
+ original_policy = asyncio.get_event_loop_policy()
126
+ try:
127
+ if os.name == "nt" and hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
128
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
129
+
130
+ yield
131
+ finally:
132
+ asyncio.set_event_loop_policy(original_policy)
133
+
134
+
135
def get_loop():
    """Create or return the default fsspec IO loop

    The loop will be running on a separate thread.
    """
    # Double-checked locking: cheap unlocked read first, then re-check
    # under the lock before creating the loop.
    if loop[0] is None:
        with get_lock():
            # repeat the check just in case the loop got filled between the
            # previous two calls from another thread
            if loop[0] is None:
                with _selector_policy():
                    loop[0] = asyncio.new_event_loop()
                th = threading.Thread(target=loop[0].run_forever, name="fsspecIO")
                # Daemon thread so the interpreter can exit without joining it.
                th.daemon = True
                th.start()
                iothread[0] = th
    return loop[0]
152
+
153
+
154
# `resource` is POSIX-only; fall back gracefully elsewhere so the
# batch-size heuristic below still works.
if TYPE_CHECKING:
    import resource

    ResourceError = resource.error
else:
    try:
        import resource
    except ImportError:
        resource = None
        ResourceError = OSError
    else:
        # Older Pythons expose resource.error; OSError is its modern alias.
        ResourceError = getattr(resource, "error", OSError)

# Default concurrency limits for _run_coros_in_chunks.
_DEFAULT_BATCH_SIZE = 128
_NOFILES_DEFAULT_BATCH_SIZE = 1280
169
+
170
+
171
def _get_batch_size(nofiles=False):
    """Infer a default batch size for concurrent coroutine execution.

    Config keys take precedence; operations that open no local files get a
    larger fixed default, otherwise the size is derived from the process's
    open-file soft limit (-1 meaning unlimited).
    """
    from fsspec.config import conf

    config_key = "nofiles_gather_batch_size" if nofiles else "gather_batch_size"
    if config_key in conf:
        return conf[config_key]
    if nofiles:
        return _NOFILES_DEFAULT_BATCH_SIZE
    if resource is None:
        return _DEFAULT_BATCH_SIZE

    try:
        soft_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
    except (ImportError, ValueError, ResourceError):
        return _DEFAULT_BATCH_SIZE

    # Keep well below the descriptor limit; unlimited means no throttling.
    return -1 if soft_limit == resource.RLIM_INFINITY else soft_limit // 8
194
+
195
+
196
def running_async() -> bool:
    """Being executed by an event loop?"""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread.
        return False
    return True
203
+
204
+
205
async def _run_coros_in_chunks(
    coros,
    batch_size=None,
    callback=DEFAULT_CALLBACK,
    timeout=None,
    return_exceptions=False,
    nofiles=False,
):
    """Run the given coroutines in chunks.

    Parameters
    ----------
    coros: list of coroutines to run
    batch_size: int or None
        Number of coroutines to submit/wait on simultaneously.
        If -1, then it will not be any throttling. If
        None, it will be inferred from _get_batch_size()
    callback: fsspec.callbacks.Callback instance
        Gets a relative_update when each coroutine completes
    timeout: number or None
        If given, each coroutine times out after this time. Note that, since
        there are multiple batches, the total run time of this function will in
        general be longer
    return_exceptions: bool
        Same meaning as in asyncio.gather
    nofiles: bool
        If inferring the batch_size, does this operation involve local files?
        If yes, you normally expect smaller batches.
    """

    if batch_size is None:
        batch_size = _get_batch_size(nofiles=nofiles)

    if batch_size == -1:
        # No throttling: submit everything at once.
        batch_size = len(coros)

    assert batch_size > 0

    async def _run_coro(coro, i):
        # Wrap each coroutine to carry its original index, so results can be
        # placed back in input order even though completion order differs.
        try:
            return await asyncio.wait_for(coro, timeout=timeout), i
        except Exception as e:
            if not return_exceptions:
                raise
            return e, i
        finally:
            callback.relative_update(1)

    i = 0
    n = len(coros)
    results = [None] * n
    pending = set()

    # Sliding-window scheduler: keep at most batch_size tasks in flight,
    # topping the window up as each one completes.
    while pending or i < n:
        while len(pending) < batch_size and i < n:
            pending.add(asyncio.ensure_future(_run_coro(coros[i], i)))
            i += 1

        if not pending:
            break

        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        while done:
            # Awaiting a finished future just retrieves its (result, index).
            result, k = await done.pop()
            results[k] = result

    return results
272
+
273
+
274
# these methods should be implemented as async by any async-able backend
# (the sync mirrors without the leading underscore are generated from them).
async_methods = [
    "_ls",
    "_cat_file",
    "_get_file",
    "_put_file",
    "_rm_file",
    "_cp_file",
    "_pipe_file",
    "_expand_path",
    "_info",
    "_isfile",
    "_isdir",
    "_exists",
    "_walk",
    "_glob",
    "_find",
    "_du",
    "_size",
    "_mkdir",
    "_makedirs",
]
296
+
297
+
298
class AsyncFileSystem(AbstractFileSystem):
    """Async file operations, default implementations

    Passes bulk operations to asyncio.gather for concurrent operation.

    Implementations that have concurrent batch operations and/or async methods
    should inherit from this class instead of AbstractFileSystem. Docstrings are
    copied from the un-underscored method in AbstractFileSystem, if not given.
    """

    # note that methods do not have docstring here; they will be copied
    # for _* methods and inferred for overridden methods.

    async_impl = True  # marker: this backend's _-methods are coroutines
    mirror_sync_methods = True  # generate blocking wrappers for async methods
    disable_throttling = False
314
+
315
    def __init__(self, *args, asynchronous=False, loop=None, batch_size=None, **kwargs):
        # asynchronous=True means the caller is already inside an event loop
        # and will use the coroutine API directly; no dedicated loop is kept.
        self.asynchronous = asynchronous
        # Remember the creating process so the `loop` property can refuse
        # use after fork.
        self._pid = os.getpid()
        if not asynchronous:
            self._loop = loop or get_loop()
        else:
            self._loop = None
        self.batch_size = batch_size
        super().__init__(*args, **kwargs)
324
+
325
+ @property
326
+ def loop(self):
327
+ if self._pid != os.getpid():
328
+ raise RuntimeError("This class is not fork-safe")
329
+ return self._loop
330
+
331
    async def _rm_file(self, path, **kwargs):
        # Abstract hook: delete a single file; concrete backends override.
        raise NotImplementedError
333
+
334
    async def _rm(self, path, recursive=False, batch_size=None, **kwargs):
        # TODO: implement on_error
        batch_size = batch_size or self.batch_size
        path = await self._expand_path(path, recursive=recursive)
        # Reversed so deeper entries are removed before their parents.
        return await _run_coros_in_chunks(
            [self._rm_file(p, **kwargs) for p in reversed(path)],
            batch_size=batch_size,
            nofiles=True,
        )
343
+
344
    async def _cp_file(self, path1, path2, **kwargs):
        # Abstract hook: copy one remote file to another remote path;
        # concrete backends override.
        raise NotImplementedError
346
+
347
    async def _mv_file(self, path1, path2):
        # Move implemented as copy-then-delete; not atomic.
        await self._cp_file(path1, path2)
        await self._rm_file(path1)
350
+
351
    async def _copy(
        self,
        path1,
        path2,
        recursive=False,
        on_error=None,
        maxdepth=None,
        batch_size=None,
        **kwargs,
    ):
        """Copy file(s) within this filesystem, concurrently in batches.

        ``on_error`` defaults to "ignore" for recursive copies (missing
        sources are skipped) and "raise" otherwise.
        """
        if on_error is None and recursive:
            on_error = "ignore"
        elif on_error is None:
            on_error = "raise"

        if isinstance(path1, list) and isinstance(path2, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            paths1 = path1
            paths2 = path2
        else:
            source_is_str = isinstance(path1, str)
            paths1 = await self._expand_path(
                path1, maxdepth=maxdepth, recursive=recursive
            )
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                paths1 = [
                    p for p in paths1 if not (trailing_sep(p) or await self._isdir(p))
                ]
                if not paths1:
                    return

            source_is_file = len(paths1) == 1
            dest_is_dir = isinstance(path2, str) and (
                trailing_sep(path2) or await self._isdir(path2)
            )

            # Whether destination paths should be treated as already-existing
            # directories when computing target names.
            exists = source_is_str and (
                (has_magic(path1) and source_is_file)
                or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
            )
            paths2 = other_paths(
                paths1,
                path2,
                exists=exists,
                flatten=not source_is_str,
            )

        batch_size = batch_size or self.batch_size
        coros = [self._cp_file(p1, p2, **kwargs) for p1, p2 in zip(paths1, paths2)]
        result = await _run_coros_in_chunks(
            coros, batch_size=batch_size, return_exceptions=True, nofiles=True
        )

        for ex in filter(is_exception, result):
            # Only FileNotFoundError is suppressible; anything else raises.
            if on_error == "ignore" and isinstance(ex, FileNotFoundError):
                continue
            raise ex
410
+
411
    async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
        # Abstract hook: write bytes ``value`` to ``path``; backends override.
        raise NotImplementedError
413
+
414
    async def _pipe(self, path, value=None, batch_size=None, **kwargs):
        # Accept either a single path+value or a mapping {path: bytes}.
        if isinstance(path, str):
            path = {path: value}
        batch_size = batch_size or self.batch_size
        return await _run_coros_in_chunks(
            [self._pipe_file(k, v, **kwargs) for k, v in path.items()],
            batch_size=batch_size,
            nofiles=True,
        )
423
+
424
    async def _process_limits(self, url, start, end):
        """Helper for "Range"-based _cat_file

        Converts (start, end) byte offsets — possibly negative or None —
        into an HTTP Range header value such as ``bytes=0-99`` or the
        suffix form ``bytes=-100``.
        """
        size = None
        suff = False
        if start is not None and start < 0:
            # if start is negative and end None, end is the "suffix length"
            if end is None:
                end = -start
                start = ""  # empty start yields the "bytes=-N" suffix form
                suff = True
            else:
                # Negative start with explicit end: resolve against file size.
                size = size or (await self._info(url))["size"]
                start = size + start
        elif start is None:
            start = 0
        if not suff:
            if end is not None and end < 0:
                if start is not None:
                    size = size or (await self._info(url))["size"]
                    end = size + end
            elif end is None:
                end = ""  # open-ended range "bytes=N-"
            if isinstance(end, numbers.Integral):
                end -= 1  # bytes range is inclusive
        return f"bytes={start}-{end}"
449
+
450
    async def _cat_file(self, path, start=None, end=None, **kwargs):
        # Abstract hook: return the (ranged) contents of one file as bytes;
        # concrete backends override.
        raise NotImplementedError
452
+
453
    async def _cat(
        self, path, recursive=False, on_error="raise", batch_size=None, **kwargs
    ):
        """Fetch contents of one or many paths concurrently.

        Returns bytes for a single literal path, otherwise a dict of
        ``{path: bytes-or-exception}`` depending on ``on_error``
        ("raise" / "omit" / return the exception object).
        """
        paths = await self._expand_path(path, recursive=recursive)
        coros = [self._cat_file(path, **kwargs) for path in paths]
        batch_size = batch_size or self.batch_size
        out = await _run_coros_in_chunks(
            coros, batch_size=batch_size, nofiles=True, return_exceptions=True
        )
        if on_error == "raise":
            # Re-raise the first failure, if any.
            ex = next(filter(is_exception, out), False)
            if ex:
                raise ex
        if (
            len(paths) > 1
            or isinstance(path, list)
            or paths[0] != self._strip_protocol(path)
        ):
            # Input expanded to multiple entries (list/glob/dir): dict result.
            return {
                k: v
                for k, v in zip(paths, out)
                if on_error != "omit" or not is_exception(v)
            }
        else:
            return out[0]
478
+
479
    async def _cat_ranges(
        self,
        paths,
        starts,
        ends,
        max_gap=None,
        batch_size=None,
        on_error="return",
        **kwargs,
    ):
        """Get the contents of byte ranges from one or more files

        Parameters
        ----------
        paths: list
            A list of of filepaths on this filesystems
        starts, ends: int or list
            Bytes limits of the read. If using a single int, the same value will be
            used to read all the specified files.
        """
        # TODO: on_error
        if max_gap is not None:
            # use utils.merge_offset_ranges
            raise NotImplementedError
        if not isinstance(paths, list):
            raise TypeError
        # Broadcast scalar start/end values across all paths.
        if not isinstance(starts, Iterable):
            starts = [starts] * len(paths)
        if not isinstance(ends, Iterable):
            ends = [ends] * len(paths)
        if len(starts) != len(paths) or len(ends) != len(paths):
            raise ValueError
        coros = [
            self._cat_file(p, start=s, end=e, **kwargs)
            for p, s, e in zip(paths, starts, ends)
        ]
        batch_size = batch_size or self.batch_size
        # return_exceptions=True: failed ranges come back as exception objects.
        return await _run_coros_in_chunks(
            coros, batch_size=batch_size, nofiles=True, return_exceptions=True
        )
519
+
520
    async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs):
        # Abstract hook: upload one local file to the remote store;
        # concrete backends override.
        raise NotImplementedError
522
+
523
    async def _put(
        self,
        lpath,
        rpath,
        recursive=False,
        callback=DEFAULT_CALLBACK,
        batch_size=None,
        maxdepth=None,
        **kwargs,
    ):
        """Copy file(s) from local.

        Copies a specific file or tree of files (if recursive=True). If rpath
        ends with a "/", it will be assumed to be a directory, and target files
        will go within.

        The put_file method will be called concurrently on a batch of files. The
        batch_size option can configure the amount of futures that can be executed
        at the same time. If it is -1, then all the files will be uploaded concurrently.
        The default can be set for this instance by passing "batch_size" in the
        constructor, or for all instances by setting the "gather_batch_size" key
        in ``fsspec.config.conf``, falling back to 1/8th of the system limit .
        """
        if isinstance(lpath, list) and isinstance(rpath, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            rpaths = rpath
            lpaths = lpath
        else:
            source_is_str = isinstance(lpath, str)
            if source_is_str:
                lpath = make_path_posix(lpath)
            fs = LocalFileSystem()
            lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
                if not lpaths:
                    return

            source_is_file = len(lpaths) == 1
            dest_is_dir = isinstance(rpath, str) and (
                trailing_sep(rpath) or await self._isdir(rpath)
            )

            rpath = self._strip_protocol(rpath)
            # Whether rpath should be treated as an existing directory when
            # computing per-file destination names.
            exists = source_is_str and (
                (has_magic(lpath) and source_is_file)
                or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
            )
            rpaths = other_paths(
                lpaths,
                rpath,
                exists=exists,
                flatten=not source_is_str,
            )

        # Split sources into directories (created remotely first) and files.
        is_dir = {l: os.path.isdir(l) for l in lpaths}
        rdirs = [r for l, r in zip(lpaths, rpaths) if is_dir[l]]
        file_pairs = [(l, r) for l, r in zip(lpaths, rpaths) if not is_dir[l]]

        await asyncio.gather(*[self._makedirs(d, exist_ok=True) for d in rdirs])
        batch_size = batch_size or self.batch_size

        coros = []
        callback.set_size(len(file_pairs))
        for lfile, rfile in file_pairs:
            # branch_coro wires per-file progress into the parent callback.
            put_file = callback.branch_coro(self._put_file)
            coros.append(put_file(lfile, rfile, **kwargs))

        return await _run_coros_in_chunks(
            coros, batch_size=batch_size, callback=callback
        )
596
+
597
    async def _get_file(self, rpath, lpath, **kwargs):
        # Abstract hook: download one remote file to a local path;
        # concrete backends override.
        raise NotImplementedError
599
+
600
    async def _get(
        self,
        rpath,
        lpath,
        recursive=False,
        callback=DEFAULT_CALLBACK,
        maxdepth=None,
        **kwargs,
    ):
        """Copy file(s) to local.

        Copies a specific file or tree of files (if recursive=True). If lpath
        ends with a "/", it will be assumed to be a directory, and target files
        will go within. Can submit a list of paths, which may be glob-patterns
        and will be expanded.

        The get_file method will be called concurrently on a batch of files. The
        batch_size option can configure the amount of futures that can be executed
        at the same time. If it is -1, then all the files will be uploaded concurrently.
        The default can be set for this instance by passing "batch_size" in the
        constructor, or for all instances by setting the "gather_batch_size" key
        in ``fsspec.config.conf``, falling back to 1/8th of the system limit .
        """
        if isinstance(lpath, list) and isinstance(rpath, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            rpaths = rpath
            lpaths = lpath
        else:
            source_is_str = isinstance(rpath, str)
            # First check for rpath trailing slash as _strip_protocol removes it.
            source_not_trailing_sep = source_is_str and not trailing_sep(rpath)
            rpath = self._strip_protocol(rpath)
            rpaths = await self._expand_path(
                rpath, recursive=recursive, maxdepth=maxdepth
            )
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                rpaths = [
                    p for p in rpaths if not (trailing_sep(p) or await self._isdir(p))
                ]
                if not rpaths:
                    return

            lpath = make_path_posix(lpath)
            source_is_file = len(rpaths) == 1
            dest_is_dir = isinstance(lpath, str) and (
                trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
            )

            # Whether lpath should be treated as an existing directory when
            # computing per-file destination names.
            exists = source_is_str and (
                (has_magic(rpath) and source_is_file)
                or (not has_magic(rpath) and dest_is_dir and source_not_trailing_sep)
            )
            lpaths = other_paths(
                rpaths,
                lpath,
                exists=exists,
                flatten=not source_is_str,
            )

        # Side-effect comprehension: ensure each local parent directory exists.
        [os.makedirs(os.path.dirname(lp), exist_ok=True) for lp in lpaths]
        # NOTE(review): batch_size is taken from kwargs here, unlike _put
        # which accepts it as a named parameter.
        batch_size = kwargs.pop("batch_size", self.batch_size)

        coros = []
        callback.set_size(len(lpaths))
        for lpath, rpath in zip(lpaths, rpaths):
            # branch_coro wires per-file progress into the parent callback.
            get_file = callback.branch_coro(self._get_file)
            coros.append(get_file(rpath, lpath, **kwargs))
        return await _run_coros_in_chunks(
            coros, batch_size=batch_size, callback=callback
        )
672
+
673
+ async def _isfile(self, path):
674
+ try:
675
+ return (await self._info(path))["type"] == "file"
676
+ except: # noqa: E722
677
+ return False
678
+
679
+ async def _isdir(self, path):
680
+ try:
681
+ return (await self._info(path))["type"] == "directory"
682
+ except OSError:
683
+ return False
684
+
685
+ async def _size(self, path):
686
+ return (await self._info(path)).get("size", None)
687
+
688
+ async def _sizes(self, paths, batch_size=None):
689
+ batch_size = batch_size or self.batch_size
690
+ return await _run_coros_in_chunks(
691
+ [self._size(p) for p in paths], batch_size=batch_size
692
+ )
693
+
694
+ async def _exists(self, path, **kwargs):
695
+ try:
696
+ await self._info(path, **kwargs)
697
+ return True
698
+ except FileNotFoundError:
699
+ return False
700
+
701
    async def _info(self, path, **kwargs):
        """Give details of entry at path; must be overridden by subclasses.

        Expected to return a dict including at least "name", "size" and
        "type" keys, as consumed by ``_isfile``/``_isdir``/``_size``/``_du``.
        """
        raise NotImplementedError
703
+
704
    async def _ls(self, path, detail=True, **kwargs):
        """List objects at path; must be overridden by subclasses.

        With ``detail=True``, expected to return a list of info dicts
        (see ``_info``), as consumed by ``_walk``.
        """
        raise NotImplementedError
706
+
707
    async def _walk(self, path, maxdepth=None, on_error="omit", **kwargs):
        """Descend the directory tree, yielding one level at a time.

        Async generator yielding, for each directory visited, a tuple of
        (current path, dirs, files).  With ``detail=True`` in ``kwargs`` the
        last two elements are dicts mapping name -> info; otherwise they are
        plain lists of names.

        Parameters
        ----------
        path: str
            Root to descend from.
        maxdepth: int or None
            Maximum recursion depth; must be at least 1 if given.
        on_error: "omit" | "raise" | callable
            What to do when listing a directory fails: skip it silently,
            re-raise, or call the handler with the exception (then skip).
        kwargs: passed on to ``_ls``.
        """
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        path = self._strip_protocol(path)
        full_dirs = {}  # name -> full path; used for the recursion below
        dirs = {}  # name -> info of subdirectories
        files = {}  # name -> info of files

        detail = kwargs.pop("detail", False)
        try:
            listing = await self._ls(path, detail=True, **kwargs)
        except (FileNotFoundError, OSError) as e:
            if on_error == "raise":
                raise
            elif callable(on_error):
                on_error(e)
            # listing failed but was tolerated: yield an empty level and stop
            if detail:
                yield path, {}, {}
            else:
                yield path, [], []
            return

        for info in listing:
            # each info name must be at least [path]/part , but here
            # we check also for names like [path]/part/
            pathname = info["name"].rstrip("/")
            name = pathname.rsplit("/", 1)[-1]
            if info["type"] == "directory" and pathname != path:
                # do not include "self" path
                full_dirs[name] = pathname
                dirs[name] = info
            elif pathname == path:
                # file-like with same name as give path
                files[""] = info
            else:
                files[name] = info

        if detail:
            yield path, dirs, files
        else:
            yield path, list(dirs), list(files)

        if maxdepth is not None:
            maxdepth -= 1
            if maxdepth < 1:
                # depth limit reached: do not recurse further
                return

        for d in dirs:
            async for _ in self._walk(
                full_dirs[d], maxdepth=maxdepth, detail=detail, **kwargs
            ):
                yield _
760
+
761
    async def _glob(self, path, maxdepth=None, **kwargs):
        """Find files matching a glob-style pattern.

        Supports ``*``, ``?``, ``[...]`` and recursive ``**`` (the latter
        bounded only by ``maxdepth``).  Returns a sorted list of matching
        paths, or a dict of path -> info when ``detail=True`` is passed
        in ``kwargs``.
        """
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        import re

        seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
        ends_with_sep = path.endswith(seps)  # _strip_protocol strips trailing slash
        path = self._strip_protocol(path)
        # directories get a trailing slash appended for matching when the
        # pattern itself demanded one (trailing sep or ".../**")
        append_slash_to_dirname = ends_with_sep or path.endswith(
            tuple(sep + "**" for sep in seps)
        )
        # position of the first magic character (end of string if absent)
        idx_star = path.find("*") if path.find("*") >= 0 else len(path)
        idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
        idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

        min_idx = min(idx_star, idx_qmark, idx_brace)

        detail = kwargs.pop("detail", False)

        if not has_magic(path):
            # plain path: an existence check is all that is needed
            if await self._exists(path, **kwargs):
                if not detail:
                    return [path]
                else:
                    return {path: await self._info(path, **kwargs)}
            else:
                if not detail:
                    return []  # glob of non-existent returns empty
                else:
                    return {}
        elif "/" in path[:min_idx]:
            # split into a literal root to list from, plus pattern depth
            min_idx = path[:min_idx].rindex("/")
            root = path[: min_idx + 1]
            depth = path[min_idx + 1 :].count("/") + 1
        else:
            root = ""
            depth = path[min_idx + 1 :].count("/") + 1

        if "**" in path:
            if maxdepth is not None:
                idx_double_stars = path.find("**")
                depth_double_stars = path[idx_double_stars:].count("/") + 1
                depth = depth - depth_double_stars + maxdepth
            else:
                # unbounded recursive glob
                depth = None

        allpaths = await self._find(
            root, maxdepth=depth, withdirs=True, detail=True, **kwargs
        )

        pattern = glob_translate(path + ("/" if ends_with_sep else ""))
        pattern = re.compile(pattern)

        out = {
            p: info
            for p, info in sorted(allpaths.items())
            if pattern.match(
                p + "/"
                if append_slash_to_dirname and info["type"] == "directory"
                else p
            )
        }

        if detail:
            return out
        else:
            return list(out)
829
+
830
+ async def _du(self, path, total=True, maxdepth=None, **kwargs):
831
+ sizes = {}
832
+ # async for?
833
+ for f in await self._find(path, maxdepth=maxdepth, **kwargs):
834
+ info = await self._info(f)
835
+ sizes[info["name"]] = info["size"]
836
+ if total:
837
+ return sum(sizes.values())
838
+ else:
839
+ return sizes
840
+
841
+ async def _find(self, path, maxdepth=None, withdirs=False, **kwargs):
842
+ path = self._strip_protocol(path)
843
+ out = {}
844
+ detail = kwargs.pop("detail", False)
845
+
846
+ # Add the root directory if withdirs is requested
847
+ # This is needed for posix glob compliance
848
+ if withdirs and path != "" and await self._isdir(path):
849
+ out[path] = await self._info(path)
850
+
851
+ # async for?
852
+ async for _, dirs, files in self._walk(path, maxdepth, detail=True, **kwargs):
853
+ if withdirs:
854
+ files.update(dirs)
855
+ out.update({info["name"]: info for name, info in files.items()})
856
+ if not out and (await self._isfile(path)):
857
+ # walk works on directories, but find should also return [path]
858
+ # when path happens to be a file
859
+ out[path] = {}
860
+ names = sorted(out)
861
+ if not detail:
862
+ return names
863
+ else:
864
+ return {name: out[name] for name in names}
865
+
866
    async def _expand_path(self, path, recursive=False, maxdepth=None):
        """Turn one or more paths/glob-patterns into a sorted list of concrete paths.

        Strings containing glob characters are expanded via ``_glob``; with
        ``recursive=True`` directories are additionally expanded via
        ``_find``.  Raises FileNotFoundError if nothing matched at all.
        """
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        if isinstance(path, str):
            # normalise to the list case
            out = await self._expand_path([path], recursive, maxdepth)
        else:
            out = set()
            path = [self._strip_protocol(p) for p in path]
            for p in path:  # can gather here
                if has_magic(p):
                    bit = set(await self._glob(p, maxdepth=maxdepth))
                    out |= bit
                    if recursive:
                        # glob call above expanded one depth so if maxdepth is defined
                        # then decrement it in expand_path call below. If it is zero
                        # after decrementing then avoid expand_path call.
                        if maxdepth is not None and maxdepth <= 1:
                            continue
                        out |= set(
                            await self._expand_path(
                                list(bit),
                                recursive=recursive,
                                maxdepth=maxdepth - 1 if maxdepth is not None else None,
                            )
                        )
                    continue
                elif recursive:
                    rec = set(await self._find(p, maxdepth=maxdepth, withdirs=True))
                    out |= rec
                if p not in out and (recursive is False or (await self._exists(p))):
                    # should only check once, for the root
                    out.add(p)
        if not out:
            raise FileNotFoundError(path)
        return sorted(out)
902
+
903
    async def _mkdir(self, path, create_parents=True, **kwargs):
        """Create a directory entry at path; default implementation is a no-op."""
        pass  # not necessary to implement, may not have directories
905
+
906
    async def _makedirs(self, path, exist_ok=False):
        """Recursively create directories; default implementation is a no-op."""
        pass  # not necessary to implement, may not have directories
908
+
909
+ async def open_async(self, path, mode="rb", **kwargs):
910
+ if "b" not in mode or kwargs.get("compression"):
911
+ raise ValueError
912
+ raise NotImplementedError
913
+
914
+
915
def mirror_sync_methods(obj):
    """Populate sync and async methods for obj

    For each method will create a sync version if the name refers to an async method
    (coroutine) and there is no override in the child class; will create an async
    method for the corresponding sync method if there is no implementation.

    Uses the methods specified in
    - async_methods: the set that an implementation is expected to provide
    - default_async_methods: that can be derived from their sync version in
      AbstractFileSystem
    - AsyncFileSystem: async-specific default coroutines
    """
    from fsspec import AbstractFileSystem

    for method in async_methods + dir(AsyncFileSystem):
        if not method.startswith("_"):
            # only underscore-prefixed (coroutine) names participate
            continue
        smethod = method[1:]
        if private.match(method):
            # is the async name actually a coroutine on this object?
            isco = inspect.iscoroutinefunction(getattr(obj, method, None))
            # underlying function of the sync name, if defined on the class
            unsync = getattr(getattr(obj, smethod, False), "__func__", None)
            # True when the sync name is still the plain AbstractFileSystem default
            is_default = unsync is getattr(AbstractFileSystem, smethod, "")
            if isco and is_default:
                # wrap the coroutine so it can be called synchronously,
                # and carry over the docstring if the wrapper lacks one
                mth = sync_wrapper(getattr(obj, method), obj=obj)
                setattr(obj, smethod, mth)
                if not mth.__doc__:
                    mth.__doc__ = getattr(
                        getattr(AbstractFileSystem, smethod, None), "__doc__", ""
                    )
945
+
946
+
947
class FSSpecCoroutineCancel(Exception):
    """Raised into running tasks by ``_dump_running_tasks`` to force-cancel them."""

    pass
949
+
950
+
951
def _dump_running_tasks(
    printout=True, cancel=True, exc=FSSpecCoroutineCancel, with_task=False
):
    """Debugging helper: inspect (and optionally cancel) tasks on the fsspec loop.

    Returns a list of dicts describing each still-running task: its frame
    locals, source file, first line, current line, formatted stack and
    (optionally) the task object itself.  With ``cancel=True`` each task is
    forcibly terminated by raising ``exc`` into its coroutine.

    NOTE(review): relies on private asyncio internals (``task._coro``,
    ``task._callbacks``, calling ``Future.set_exception``/``Future.cancel``
    unbound on Task objects) and the module-level ``loop`` singleton —
    intended for interactive debugging only.
    """
    import traceback

    tasks = [t for t in asyncio.tasks.all_tasks(loop[0]) if not t.done()]
    if printout:
        [task.print_stack() for task in tasks]
    out = [
        {
            "locals": task._coro.cr_frame.f_locals,
            "file": task._coro.cr_frame.f_code.co_filename,
            "firstline": task._coro.cr_frame.f_code.co_firstlineno,
            "linelo": task._coro.cr_frame.f_lineno,
            "stack": traceback.format_stack(task._coro.cr_frame),
            "task": task if with_task else None,
        }
        for task in tasks
    ]
    if cancel:
        for t in tasks:
            cbs = t._callbacks
            t.cancel()
            asyncio.futures.Future.set_exception(t, exc)
            asyncio.futures.Future.cancel(t)
            [cb[0](t) for cb in cbs]  # cancels any dependent concurrent.futures
            try:
                t._coro.throw(exc)  # exits coro, unless explicitly handled
            except exc:
                pass
    return out
982
+
983
+
984
class AbstractAsyncStreamedFile(AbstractBufferedFile):
    """Async variant of ``AbstractBufferedFile``: coroutine read/write/flush/close.

    Subclasses implement ``_fetch_range`` (reads) and ``_upload_chunk``
    (writes), optionally ``_initiate_upload``.
    """

    # no read buffering, and always auto-commit
    # TODO: readahead might still be useful here, but needs async version

    async def read(self, length=-1):
        """
        Return data from cache, or fetch pieces as necessary

        Parameters
        ----------
        length: int (-1)
            Number of bytes to read; if <0, all remaining bytes.
        """
        length = -1 if length is None else int(length)
        if self.mode != "rb":
            raise ValueError("File not in read mode")
        if length < 0:
            # read everything from the current position to EOF
            length = self.size - self.loc
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if length == 0:
            # don't even bother calling fetch
            return b""
        out = await self._fetch_range(self.loc, self.loc + length)
        # advance by what was actually returned, which may be shorter
        self.loc += len(out)
        return out

    async def write(self, data):
        """
        Write data to buffer.

        Buffer only sent on flush() or if buffer is greater than
        or equal to blocksize.

        Parameters
        ----------
        data: bytes
            Set of bytes to be written.
        """
        if self.mode not in {"wb", "ab"}:
            raise ValueError("File not in write mode")
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if self.forced:
            raise ValueError("This file has been force-flushed, can only close")
        out = self.buffer.write(data)
        self.loc += out
        if self.buffer.tell() >= self.blocksize:
            # buffer reached a full block: push it to the backend
            await self.flush()
        return out

    async def close(self):
        """Close file

        Finalizes writes, discards cache
        """
        if getattr(self, "_unclosable", False):
            return
        if self.closed:
            return
        if self.mode == "rb":
            self.cache = None
        else:
            if not self.forced:
                # push any remaining buffered data as the final chunk
                await self.flush(force=True)

            if self.fs is not None:
                # listings for this file and its parent are now stale
                self.fs.invalidate_cache(self.path)
                self.fs.invalidate_cache(self.fs._parent(self.path))

        self.closed = True

    async def flush(self, force=False):
        """Write buffered data to the backend store.

        Uploads the current buffer if it has reached ``blocksize``, or
        unconditionally when ``force=True`` (i.e., on close).

        Parameters
        ----------
        force: bool
            Write the last (possibly undersized) block and mark the file
            as force-flushed; may only happen once.
        """
        if self.closed:
            raise ValueError("Flush on closed file")
        if force and self.forced:
            raise ValueError("Force flush cannot be called more than once")
        if force:
            self.forced = True

        if self.mode not in {"wb", "ab"}:
            # no-op to flush on read-mode
            return

        if not force and self.buffer.tell() < self.blocksize:
            # Defer write on small block
            return

        if self.offset is None:
            # Initialize a multipart upload
            self.offset = 0
            try:
                await self._initiate_upload()
            except:  # noqa: E722 -- any failure here poisons the file
                self.closed = True
                raise

        # a False return from _upload_chunk means "not written; keep buffer"
        if await self._upload_chunk(final=force) is not False:
            self.offset += self.buffer.seek(0, 2)
            self.buffer = io.BytesIO()

    async def __aenter__(self):
        """Support ``async with``; returns self."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the file on leaving an ``async with`` block."""
        await self.close()

    async def _fetch_range(self, start, end):
        """Get bytes in [start, end) from the backend; subclasses must implement."""
        raise NotImplementedError

    async def _initiate_upload(self):
        """Hook called once before the first chunk is uploaded; default no-op."""
        pass

    async def _upload_chunk(self, final=False):
        """Upload the current buffer; subclasses must implement.

        ``flush`` only advances the offset and clears the buffer when this
        does not return False.
        """
        raise NotImplementedError
.venv/lib/python3.11/site-packages/fsspec/caching.py ADDED
@@ -0,0 +1,966 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import collections
4
+ import functools
5
+ import logging
6
+ import math
7
+ import os
8
+ import threading
9
+ import warnings
10
+ from concurrent.futures import Future, ThreadPoolExecutor
11
+ from itertools import groupby
12
+ from operator import itemgetter
13
+ from typing import (
14
+ TYPE_CHECKING,
15
+ Any,
16
+ Callable,
17
+ ClassVar,
18
+ Generic,
19
+ NamedTuple,
20
+ Optional,
21
+ OrderedDict,
22
+ TypeVar,
23
+ )
24
+
25
+ if TYPE_CHECKING:
26
+ import mmap
27
+
28
+ from typing_extensions import ParamSpec
29
+
30
+ P = ParamSpec("P")
31
+ else:
32
+ P = TypeVar("P")
33
+
34
+ T = TypeVar("T")
35
+
36
+
37
# Module-level logger shared by the cache implementations below.
logger = logging.getLogger("fsspec")

# Signature of the callable each cache uses to pull raw bytes from a backend.
Fetcher = Callable[[int, int], bytes]  # Maps (start, end) to bytes
40
+
41
+
42
class BaseCache:
    """Pass-though cache: doesn't keep anything, calls every time

    Acts as base class for other cachers

    Parameters
    ----------
    blocksize: int
        How far to read ahead in numbers of bytes
    fetcher: func
        Function of the form f(start, end) which gets bytes from remote as
        specified
    size: int
        How big this file is
    """

    name: ClassVar[str] = "none"

    def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
        self.blocksize = blocksize
        self.nblocks = 0
        self.fetcher = fetcher
        self.size = size
        # running statistics; see _reset_stats / _log_stats
        self.hit_count = 0
        self.miss_count = 0
        # the bytes that we actually requested
        self.total_requested_bytes = 0

    def _fetch(self, start: int | None, stop: int | None) -> bytes:
        """Forward the byte-range request straight to the fetcher."""
        start = 0 if start is None else start
        stop = self.size if stop is None else stop
        # out-of-range or empty request: nothing to fetch
        if start >= self.size or start >= stop:
            return b""
        return self.fetcher(start, stop)

    def _reset_stats(self) -> None:
        """Reset hit and miss counts for a more granular report e.g. by file."""
        self.hit_count = 0
        self.miss_count = 0
        self.total_requested_bytes = 0

    def _log_stats(self) -> str:
        """Return a formatted string of the cache statistics."""
        if self.hit_count == 0 and self.miss_count == 0:
            # a cache that does nothing, this is for logs only
            return ""
        return f" , {self.name}: {self.hit_count} hits, {self.miss_count} misses, {self.total_requested_bytes} total requested bytes"

    def __repr__(self) -> str:
        # TODO: use rich for better formatting
        return f"""
        <{self.__class__.__name__}:
            block size  :   {self.blocksize}
            block count :   {self.nblocks}
            file size   :   {self.size}
            cache hits  :   {self.hit_count}
            cache misses:   {self.miss_count}
            total requested bytes: {self.total_requested_bytes}>
        """
103
+
104
+
105
class MMapCache(BaseCache):
    """memory-mapped sparse file cache

    Opens temporary file, which is filled blocks-wise when data is requested.
    Ensure there is enough disc space in the temporary location.

    This cache method might only work on posix
    """

    name = "mmap"

    def __init__(
        self,
        blocksize: int,
        fetcher: Fetcher,
        size: int,
        location: str | None = None,
        blocks: set[int] | None = None,
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        # indices of blocks already present in the backing file
        self.blocks = set() if blocks is None else blocks
        # optional persistent path; None means an anonymous temp file
        self.location = location
        self.cache = self._makefile()

    def _makefile(self) -> mmap.mmap | bytearray:
        """Create or reopen the backing file and return a memory-map of it."""
        import mmap
        import tempfile

        if self.size == 0:
            # mmap cannot map zero bytes; an empty buffer behaves the same
            return bytearray()

        # posix version
        if self.location is None or not os.path.exists(self.location):
            if self.location is None:
                fd = tempfile.TemporaryFile()
                self.blocks = set()
            else:
                fd = open(self.location, "wb+")
            # write one byte at the last offset to size the sparse file
            fd.seek(self.size - 1)
            fd.write(b"1")
            fd.flush()
        else:
            # pre-existing cache file: reopen; ``self.blocks`` says what is valid
            fd = open(self.location, "r+b")

        return mmap.mmap(fd.fileno(), self.size)

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Serve [start, end), fetching any missing blocks into the mmap first."""
        logger.debug(f"MMap cache fetching {start}-{end}")
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""
        start_block = start // self.blocksize
        end_block = end // self.blocksize
        block_range = range(start_block, end_block + 1)
        # Determine which blocks need to be fetched. This sequence is sorted by construction.
        need = (i for i in block_range if i not in self.blocks)
        # Count the number of blocks already cached
        self.hit_count += sum(1 for i in block_range if i in self.blocks)

        # Consolidate needed blocks.
        # Algorithm adapted from Python 2.x itertools documentation.
        # We are grouping an enumerated sequence of blocks. By comparing when the difference
        # between an ascending range (provided by enumerate) and the needed block numbers
        # we can detect when the block number skips values. The key computes this difference.
        # Whenever the difference changes, we know that we have previously cached block(s),
        # and a new group is started. In other words, this algorithm neatly groups
        # runs of consecutive block numbers so they can be fetched together.
        for _, _blocks in groupby(enumerate(need), key=lambda x: x[0] - x[1]):
            # Extract the blocks from the enumerated sequence
            _blocks = tuple(map(itemgetter(1), _blocks))
            # Compute start of first block
            sstart = _blocks[0] * self.blocksize
            # Compute the end of the last block. Last block may not be full size.
            send = min(_blocks[-1] * self.blocksize + self.blocksize, self.size)

            # Fetch bytes (could be multiple consecutive blocks)
            self.total_requested_bytes += send - sstart
            logger.debug(
                f"MMap get blocks {_blocks[0]}-{_blocks[-1]} ({sstart}-{send})"
            )
            self.cache[sstart:send] = self.fetcher(sstart, send)

            # Update set of cached blocks
            self.blocks.update(_blocks)
            # Update cache statistics with number of blocks we had to cache
            self.miss_count += len(_blocks)

        return self.cache[start:end]

    def __getstate__(self) -> dict[str, Any]:
        state = self.__dict__.copy()
        # Remove the unpicklable entries.
        del state["cache"]
        return state

    def __setstate__(self, state: dict[str, Any]) -> None:
        # Restore instance attributes; the mmap is re-created from location/blocks
        self.__dict__.update(state)
        self.cache = self._makefile()
207
+
208
+
209
class ReadAheadCache(BaseCache):
    """Cache which reads only when we get beyond a block of data

    This is a much simpler version of BytesCache, and does not attempt to
    fill holes in the cache or keep fragments alive. It is best suited to
    many small reads in a sequential order (e.g., reading lines from a file).
    """

    name = "readahead"

    def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
        super().__init__(blocksize, fetcher, size)
        # single contiguous buffered window: cache holds bytes [start, end)
        self.cache = b""
        self.start = 0
        self.end = 0

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        start = 0 if start is None else start
        if end is None or end > self.size:
            end = self.size
        if start >= self.size or start >= end:
            return b""

        remaining = end - start
        if start >= self.start and end <= self.end:
            # full cache hit: serve straight from the buffered window
            self.hit_count += 1
            return self.cache[start - self.start : end - self.start]

        if self.start <= start < self.end:
            # partial hit: reuse the tail of the buffer, fetch the rest
            self.miss_count += 1
            head = self.cache[start - self.start :]
            remaining -= len(head)
            start = self.end
        else:
            # complete miss: discard the old window
            self.miss_count += 1
            head = b""

        # read ahead by one blocksize beyond what was asked for
        end = min(self.size, end + self.blocksize)
        self.total_requested_bytes += end - start
        self.cache = self.fetcher(start, end)  # new block replaces old
        self.start = start
        self.end = self.start + len(self.cache)
        return head + self.cache[:remaining]
253
+
254
+
255
class FirstChunkCache(BaseCache):
    """Caches the first block of a file only

    This may be useful for file types where the metadata is stored in the header,
    but is randomly accessed.
    """

    name = "first"

    def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
        if blocksize > size:
            # this will buffer the whole thing
            blocksize = size
        super().__init__(blocksize, fetcher, size)
        # lazily-filled copy of bytes [0, blocksize)
        self.cache: bytes | None = None

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Serve [start, end); requests touching the first chunk use the cache.

        ``start``/``end`` of None mean start/end of file, matching the other
        cache implementations.
        """
        start = start or 0
        if start > self.size:
            logger.debug("FirstChunkCache: requested start > file size")
            return b""

        # BUG FIX: an open-ended read passes end=None; ``min(None, size)``
        # raises TypeError, so resolve None to the file size before clamping.
        end = self.size if end is None else min(end, self.size)

        if start < self.blocksize:
            # request overlaps the cached first chunk
            if self.cache is None:
                self.miss_count += 1
                if end > self.blocksize:
                    # one fetch covers both the chunk and the tail
                    self.total_requested_bytes += end
                    data = self.fetcher(0, end)
                    self.cache = data[: self.blocksize]
                    return data[start:]
                self.cache = self.fetcher(0, self.blocksize)
                self.total_requested_bytes += self.blocksize
            part = self.cache[start:end]
            if end > self.blocksize:
                # anything beyond the first chunk is always re-fetched
                self.total_requested_bytes += end - self.blocksize
                part += self.fetcher(self.blocksize, end)
            self.hit_count += 1
            return part
        else:
            # entirely outside the cached chunk: plain passthrough
            self.miss_count += 1
            self.total_requested_bytes += end - start
            return self.fetcher(start, end)
299
+
300
+
301
class BlockCache(BaseCache):
    """
    Cache holding memory as a set of blocks.

    Requests are only ever made ``blocksize`` at a time, and are
    stored in an LRU cache. The least recently accessed block is
    discarded when more than ``maxblocks`` are stored.

    Parameters
    ----------
    blocksize : int
        The number of bytes to store in each block.
        Requests are only ever made for ``blocksize``, so this
        should balance the overhead of making a request against
        the granularity of the blocks.
    fetcher : Callable
    size : int
        The total size of the file being cached.
    maxblocks : int
        The maximum number of blocks to cache for. The maximum memory
        use for this cache is then ``blocksize * maxblocks``.
    """

    name = "blockcache"

    def __init__(
        self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        self.nblocks = math.ceil(size / blocksize)
        self.maxblocks = maxblocks
        self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block)

    def cache_info(self):
        """
        The statistics on the block cache.

        Returns
        -------
        NamedTuple
            Returned directly from the LRU Cache used internally.
        """
        return self._fetch_block_cached.cache_info()

    def __getstate__(self) -> dict[str, Any]:
        # BUG FIX: operate on a copy. Deleting from ``self.__dict__`` directly
        # (as before) destroyed the live instance's LRU cache as a side effect
        # of pickling. MMapCache.__getstate__ already does it this way.
        state = self.__dict__.copy()
        del state["_fetch_block_cached"]
        return state

    def __setstate__(self, state: dict[str, Any]) -> None:
        self.__dict__.update(state)
        # the lru_cache wrapper is not picklable; rebuild it on restore
        self._fetch_block_cached = functools.lru_cache(state["maxblocks"])(
            self._fetch_block
        )

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Serve [start, end) by fetching each covering block (LRU-cached)."""
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""

        # byte position -> block numbers
        start_block_number = start // self.blocksize
        end_block_number = end // self.blocksize

        # these are cached, so safe to do multiple calls for the same start and end.
        for block_number in range(start_block_number, end_block_number + 1):
            self._fetch_block_cached(block_number)

        return self._read_cache(
            start,
            end,
            start_block_number=start_block_number,
            end_block_number=end_block_number,
        )

    def _fetch_block(self, block_number: int) -> bytes:
        """
        Fetch the block of data for `block_number`.
        """
        # NOTE(review): ``>`` (not ``>=``) is deliberate — a read ending
        # exactly at a block boundary maps to block ``nblocks``, for which
        # the base _fetch returns b"".
        if block_number > self.nblocks:
            raise ValueError(
                f"'block_number={block_number}' is greater than "
                f"the number of blocks ({self.nblocks})"
            )

        start = block_number * self.blocksize
        end = start + self.blocksize
        self.total_requested_bytes += end - start
        self.miss_count += 1
        logger.info("BlockCache fetching block %d", block_number)
        block_contents = super()._fetch(start, end)
        return block_contents

    def _read_cache(
        self, start: int, end: int, start_block_number: int, end_block_number: int
    ) -> bytes:
        """
        Read from our block cache.

        Parameters
        ----------
        start, end : int
            The start and end byte positions.
        start_block_number, end_block_number : int
            The start and end block numbers.
        """
        start_pos = start % self.blocksize
        end_pos = end % self.blocksize

        self.hit_count += 1
        if start_block_number == end_block_number:
            block: bytes = self._fetch_block_cached(start_block_number)
            return block[start_pos:end_pos]

        else:
            # read from the initial
            out = [self._fetch_block_cached(start_block_number)[start_pos:]]

            # intermediate blocks
            # Note: it'd be nice to combine these into one big request. However
            # that doesn't play nicely with our LRU cache.
            out.extend(
                map(
                    self._fetch_block_cached,
                    range(start_block_number + 1, end_block_number),
                )
            )

            # final block
            out.append(self._fetch_block_cached(end_block_number)[:end_pos])

            return b"".join(out)
436
+
437
+
438
+ class BytesCache(BaseCache):
439
+ """Cache which holds data in a in-memory bytes object
440
+
441
+ Implements read-ahead by the block size, for semi-random reads progressing
442
+ through the file.
443
+
444
+ Parameters
445
+ ----------
446
+ trim: bool
447
+ As we read more data, whether to discard the start of the buffer when
448
+ we are more than a blocksize ahead of it.
449
+ """
450
+
451
+ name: ClassVar[str] = "bytes"
452
+
453
+ def __init__(
454
+ self, blocksize: int, fetcher: Fetcher, size: int, trim: bool = True
455
+ ) -> None:
456
+ super().__init__(blocksize, fetcher, size)
457
+ self.cache = b""
458
+ self.start: int | None = None
459
+ self.end: int | None = None
460
+ self.trim = trim
461
+
462
    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Return bytes ``[start, end)``, extending the internal buffer as needed.

        Reads ahead by up to ``blocksize`` past ``end`` and stitches newly
        fetched data onto the existing buffer when the request adjoins it;
        otherwise the buffer is replaced wholesale.
        """
        # TODO: only set start/end after fetch, in case it fails?
        # is this where retry logic might go?
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""
        if (
            self.start is not None
            and start >= self.start
            and self.end is not None
            and end < self.end
        ):
            # cache hit: we have all the required data
            offset = start - self.start
            self.hit_count += 1
            return self.cache[offset : offset + end - start]

        # bend: the actual end of the range we will request (with read-ahead)
        if self.blocksize:
            bend = min(self.size, end + self.blocksize)
        else:
            bend = end

        if bend == start or start > self.size:
            return b""

        if (self.start is None or start < self.start) and (
            self.end is None or end > self.end
        ):
            # First read, or extending both before and after
            self.total_requested_bytes += bend - start
            self.miss_count += 1
            self.cache = self.fetcher(start, bend)
            self.start = start
        else:
            assert self.start is not None
            assert self.end is not None
            self.miss_count += 1

            if start < self.start:
                # Request begins before the buffer.
                if self.end is None or self.end - end > self.blocksize:
                    # Gap too large to be worth keeping: refetch whole range.
                    self.total_requested_bytes += bend - start
                    self.cache = self.fetcher(start, bend)
                    self.start = start
                else:
                    # Prepend only the missing head.
                    self.total_requested_bytes += self.start - start
                    new = self.fetcher(start, self.start)
                    self.start = start
                    self.cache = new + self.cache
            elif self.end is not None and bend > self.end:
                # Request extends past the buffer.
                if self.end > self.size:
                    # Buffer already reaches EOF; nothing more to fetch.
                    pass
                elif end - self.end > self.blocksize:
                    # Gap too large: refetch whole range.
                    self.total_requested_bytes += bend - start
                    self.cache = self.fetcher(start, bend)
                    self.start = start
                else:
                    # Append only the missing tail.
                    self.total_requested_bytes += bend - self.end
                    new = self.fetcher(self.end, bend)
                    self.cache = self.cache + new

        self.end = self.start + len(self.cache)
        offset = start - self.start
        out = self.cache[offset : offset + end - start]
        if self.trim:
            # Discard whole blocks from the head once more than one block behind.
            num = (self.end - self.start) // (self.blocksize + 1)
            if num > 1:
                self.start += self.blocksize * num
                self.cache = self.cache[self.blocksize * num :]
        return out
534
+
535
    def __len__(self) -> int:
        # Number of bytes currently held in the in-memory buffer.
        return len(self.cache)
537
+
538
+
539
class AllBytes(BaseCache):
    """Cache that holds the complete file contents in memory.

    If ``data`` is not supplied, the whole file is downloaded once at
    construction time; every subsequent read is served from memory and
    counted as a hit.
    """

    name: ClassVar[str] = "all"

    def __init__(
        self,
        blocksize: int | None = None,
        fetcher: Fetcher | None = None,
        size: int | None = None,
        data: bytes | None = None,
    ) -> None:
        super().__init__(blocksize, fetcher, size)  # type: ignore[arg-type]
        if data is None:
            # One-shot download of the entire file: the only miss ever recorded.
            self.miss_count += 1
            self.total_requested_bytes += self.size
            data = self.fetcher(0, self.size)
        self.data = data

    def _fetch(self, start: int | None, stop: int | None) -> bytes:
        # Everything is already resident, so every read is a hit; ``None``
        # bounds behave like an open-ended slice.
        self.hit_count += 1
        return self.data[start:stop]
561
+
562
+
563
class KnownPartsOfAFile(BaseCache):
    """
    Cache holding known file parts.

    Parameters
    ----------
    blocksize: int
        How far to read ahead in numbers of bytes
    fetcher: func
        Function of the form f(start, end) which gets bytes from remote as
        specified
    size: int
        How big this file is
    data: dict
        A dictionary mapping explicit `(start, stop)` file-offset tuples
        with known bytes.
    strict: bool, default True
        Whether to fetch reads that go beyond a known byte-range boundary.
        If `False`, any read that ends outside a known part will be zero
        padded. Note that zero padding will not be used for reads that
        begin outside a known byte-range.
    """

    name: ClassVar[str] = "parts"

    def __init__(
        self,
        blocksize: int,
        fetcher: Fetcher,
        size: int,
        data: Optional[dict[tuple[int, int], bytes]] = None,
        strict: bool = True,
        **_: Any,
    ):
        super().__init__(blocksize, fetcher, size)
        self.strict = strict

        # simple consolidation of contiguous blocks
        # NOTE: the caller's ``data`` dict is emptied here (entries are
        # ``pop``-ed while merging adjacent ranges into single buffers).
        if data:
            old_offsets = sorted(data.keys())
            offsets = [old_offsets[0]]
            blocks = [data.pop(old_offsets[0])]
            for start, stop in old_offsets[1:]:
                start0, stop0 = offsets[-1]
                if start == stop0:
                    # This range starts exactly where the previous one ended:
                    # merge into a single (start0, stop) entry.
                    offsets[-1] = (start0, stop)
                    blocks[-1] += data.pop((start, stop))
                else:
                    offsets.append((start, stop))
                    blocks.append(data.pop((start, stop)))

            self.data = dict(zip(offsets, blocks))
        else:
            self.data = {}

    def _fetch(self, start: int | None, stop: int | None) -> bytes:
        """Serve ``[start, stop)`` from known parts, fetching/padding the rest."""
        if start is None:
            start = 0
        if stop is None:
            stop = self.size

        out = b""
        for (loc0, loc1), data in self.data.items():
            # If self.strict=False, use zero-padded data
            # for reads beyond the end of a "known" buffer
            if loc0 <= start < loc1:
                off = start - loc0
                out = data[off : off + stop - start]
                if not self.strict or loc0 <= stop <= loc1:
                    # The request is within a known range, or
                    # it begins within a known range, and we
                    # are allowed to pad reads beyond the
                    # buffer with zero
                    out += b"\x00" * (stop - start - len(out))
                    self.hit_count += 1
                    return out
                else:
                    # The request ends outside a known range,
                    # and we are being "strict" about reads
                    # beyond the buffer
                    start = loc1
                    break

        # We only get here if there is a request outside the
        # known parts of the file. In an ideal world, this
        # should never happen
        if self.fetcher is None:
            # We cannot fetch the data, so raise an error
            raise ValueError(f"Read is outside the known file parts: {(start, stop)}. ")
        # We can fetch the data, but should warn the user
        # that this may be slow
        warnings.warn(
            f"Read is outside the known file parts: {(start, stop)}. "
            f"IO/caching performance may be poor!"
        )
        logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}")
        self.total_requested_bytes += stop - start
        self.miss_count += 1
        return out + super()._fetch(start, stop)
662
+
663
+
664
class UpdatableLRU(Generic[P, T]):
    """
    Thread-safe LRU cache around a callable, with explicit key insertion.

    Unlike ``functools.lru_cache``, entries can also be injected directly
    via ``add_key``; used by BackgroundBlockCache to store the results of
    background fetches.
    """

    class CacheInfo(NamedTuple):
        hits: int
        misses: int
        maxsize: int
        currsize: int

    def __init__(self, func: Callable[P, T], max_size: int = 128) -> None:
        self._cache: OrderedDict[Any, T] = collections.OrderedDict()
        self._func = func
        self._max_size = max_size
        self._hits = 0
        self._misses = 0
        self._lock = threading.Lock()

    def __call__(self, *args: P.args, **kwargs: P.kwargs) -> T:
        if kwargs:
            raise TypeError(f"Got unexpected keyword argument {kwargs.keys()}")
        with self._lock:
            try:
                hit = self._cache[args]
            except KeyError:
                pass
            else:
                # Refresh recency and record the hit.
                self._cache.move_to_end(args)
                self._hits += 1
                return hit

        # Compute outside the lock so concurrent callers are not serialised
        # on a potentially slow fetch.
        result = self._func(*args, **kwargs)

        with self._lock:
            self._cache[args] = result
            self._misses += 1
            if len(self._cache) > self._max_size:
                # Evict the least recently used entry.
                self._cache.popitem(last=False)

        return result

    def is_key_cached(self, *args: Any) -> bool:
        with self._lock:
            return args in self._cache

    def add_key(self, result: T, *args: Any) -> None:
        # Directly store a precomputed result under ``args``.
        with self._lock:
            self._cache[args] = result
            if len(self._cache) > self._max_size:
                self._cache.popitem(last=False)

    def cache_info(self) -> UpdatableLRU.CacheInfo:
        with self._lock:
            return self.CacheInfo(
                maxsize=self._max_size,
                currsize=len(self._cache),
                hits=self._hits,
                misses=self._misses,
            )
722
+
723
+
724
class BackgroundBlockCache(BaseCache):
    """
    Cache holding memory as a set of blocks with pre-loading of
    the next block in the background.

    Requests are only ever made ``blocksize`` at a time, and are
    stored in an LRU cache. The least recently accessed block is
    discarded when more than ``maxblocks`` are stored. If the
    next block is not in cache, it is loaded in a separate thread
    in non-blocking way.

    Parameters
    ----------
    blocksize : int
        The number of bytes to store in each block.
        Requests are only ever made for ``blocksize``, so this
        should balance the overhead of making a request against
        the granularity of the blocks.
    fetcher : Callable
    size : int
        The total size of the file being cached.
    maxblocks : int
        The maximum number of blocks to cache for. The maximum memory
        use for this cache is then ``blocksize * maxblocks``.
    """

    name: ClassVar[str] = "background"

    def __init__(
        self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        self.nblocks = math.ceil(size / blocksize)
        self.maxblocks = maxblocks
        self._fetch_block_cached = UpdatableLRU(self._fetch_block, maxblocks)

        # A single worker so at most one background prefetch runs at a time.
        self._thread_executor = ThreadPoolExecutor(max_workers=1)
        self._fetch_future_block_number: int | None = None
        self._fetch_future: Future[bytes] | None = None
        self._fetch_future_lock = threading.Lock()

    def cache_info(self) -> UpdatableLRU.CacheInfo:
        """
        The statistics on the block cache.

        Returns
        -------
        NamedTuple
            Returned directly from the LRU Cache used internally.
        """
        return self._fetch_block_cached.cache_info()

    def __getstate__(self) -> dict[str, Any]:
        # BUGFIX: copy the instance dict before deleting the unpicklable
        # entries. Previously ``state = self.__dict__`` aliased the live
        # dict, so pickling stripped the executor/cache/lock attributes off
        # the instance itself as a side effect.
        state = self.__dict__.copy()
        del state["_fetch_block_cached"]
        del state["_thread_executor"]
        del state["_fetch_future_block_number"]
        del state["_fetch_future"]
        del state["_fetch_future_lock"]
        return state

    def __setstate__(self, state) -> None:
        # Rebuild the transient members removed in __getstate__.
        self.__dict__.update(state)
        self._fetch_block_cached = UpdatableLRU(self._fetch_block, state["maxblocks"])
        self._thread_executor = ThreadPoolExecutor(max_workers=1)
        self._fetch_future_block_number = None
        self._fetch_future = None
        self._fetch_future_lock = threading.Lock()

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Return bytes ``[start, end)``, joining/launching background fetches."""
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""

        # byte position -> block numbers
        start_block_number = start // self.blocksize
        end_block_number = end // self.blocksize

        fetch_future_block_number = None
        fetch_future = None
        with self._fetch_future_lock:
            # Background thread is running. Check whether we can or must join it.
            if self._fetch_future is not None:
                assert self._fetch_future_block_number is not None
                if self._fetch_future.done():
                    logger.info("BlockCache joined background fetch without waiting.")
                    self._fetch_block_cached.add_key(
                        self._fetch_future.result(), self._fetch_future_block_number
                    )
                    # Cleanup the fetch variables. Done with fetching the block.
                    self._fetch_future_block_number = None
                    self._fetch_future = None
                else:
                    # Must join if we need the block for the current fetch
                    must_join = bool(
                        start_block_number
                        <= self._fetch_future_block_number
                        <= end_block_number
                    )
                    if must_join:
                        # Copy to the local variables to release lock
                        # before waiting for result
                        fetch_future_block_number = self._fetch_future_block_number
                        fetch_future = self._fetch_future

                        # Cleanup the fetch variables. Have a local copy.
                        self._fetch_future_block_number = None
                        self._fetch_future = None

        # Need to wait for the future for the current read
        if fetch_future is not None:
            logger.info("BlockCache waiting for background fetch.")
            # Wait until result and put it in cache
            self._fetch_block_cached.add_key(
                fetch_future.result(), fetch_future_block_number
            )

        # these are cached, so safe to do multiple calls for the same start and end.
        for block_number in range(start_block_number, end_block_number + 1):
            self._fetch_block_cached(block_number)

        # fetch next block in the background if nothing is running in the background,
        # the block is within file and it is not already cached
        end_block_plus_1 = end_block_number + 1
        with self._fetch_future_lock:
            if (
                self._fetch_future is None
                and end_block_plus_1 <= self.nblocks
                and not self._fetch_block_cached.is_key_cached(end_block_plus_1)
            ):
                self._fetch_future_block_number = end_block_plus_1
                self._fetch_future = self._thread_executor.submit(
                    self._fetch_block, end_block_plus_1, "async"
                )

        return self._read_cache(
            start,
            end,
            start_block_number=start_block_number,
            end_block_number=end_block_number,
        )

    def _fetch_block(self, block_number: int, log_info: str = "sync") -> bytes:
        """
        Fetch the block of data for `block_number`.
        """
        if block_number > self.nblocks:
            raise ValueError(
                f"'block_number={block_number}' is greater than "
                f"the number of blocks ({self.nblocks})"
            )

        start = block_number * self.blocksize
        end = start + self.blocksize
        logger.info("BlockCache fetching block (%s) %d", log_info, block_number)
        self.total_requested_bytes += end - start
        self.miss_count += 1
        block_contents = super()._fetch(start, end)
        return block_contents

    def _read_cache(
        self, start: int, end: int, start_block_number: int, end_block_number: int
    ) -> bytes:
        """
        Read from our block cache.

        Parameters
        ----------
        start, end : int
            The start and end byte positions.
        start_block_number, end_block_number : int
            The start and end block numbers.
        """
        start_pos = start % self.blocksize
        end_pos = end % self.blocksize

        # kind of pointless to count this as a hit, but it is
        self.hit_count += 1

        if start_block_number == end_block_number:
            block = self._fetch_block_cached(start_block_number)
            return block[start_pos:end_pos]

        else:
            # read from the initial
            out = [self._fetch_block_cached(start_block_number)[start_pos:]]

            # intermediate blocks
            # Note: it'd be nice to combine these into one big request. However
            # that doesn't play nicely with our LRU cache.
            out.extend(
                map(
                    self._fetch_block_cached,
                    range(start_block_number + 1, end_block_number),
                )
            )

            # final block
            out.append(self._fetch_block_cached(end_block_number)[:end_pos])

            return b"".join(out)
928
+
929
+
930
# Registry of cache implementations keyed by their ``name`` class attribute;
# populated below via ``register_cache``.
caches: dict[str | None, type[BaseCache]] = {
    # one custom case
    None: BaseCache,
}
934
+
935
+
936
def register_cache(cls: type[BaseCache], clobber: bool = False) -> None:
    """Add a cache implementation to the global ``caches`` registry.

    The class is stored under its ``name`` class attribute.

    Parameters
    ----------
    cls: type
        The BaseCache subclass to register.
    clobber: bool, optional
        If set to True (default is False) - allow to overwrite existing
        entry.

    Raises
    ------
    ValueError
        If an entry with the same name already exists and ``clobber`` is
        False.
    """
    key = cls.name
    if key in caches and not clobber:
        raise ValueError(f"Cache with name {key!r} is already known: {caches[key]}")
    caches[key] = cls
953
+
954
+
955
# Register every built-in cache implementation under its ``name`` key.
for c in (
    BaseCache,
    MMapCache,
    BytesCache,
    ReadAheadCache,
    BlockCache,
    FirstChunkCache,
    AllBytes,
    KnownPartsOfAFile,
    BackgroundBlockCache,
):
    register_cache(c)
.venv/lib/python3.11/site-packages/fsspec/callbacks.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import wraps
2
+
3
+
4
class Callback:
    """
    Base class and interface for callback mechanism

    This class can be used directly for monitoring file transfers by
    providing ``callback=Callback(hooks=...)`` (see the ``hooks`` argument,
    below), or subclassed for more specialised behaviour.

    Parameters
    ----------
    size: int (optional)
        Nominal quantity for the value that corresponds to a complete
        transfer, e.g., total number of tiles or total number of
        bytes
    value: int (0)
        Starting internal counter value
    hooks: dict or None
        A dict of named functions to be called on each update. The signature
        of these must be ``f(size, value, **kwargs)``
    """

    def __init__(self, size=None, value=0, hooks=None, **kwargs):
        self.size = size
        self.value = value
        self.hooks = hooks or {}
        # Extra keyword arguments forwarded to every hook invocation.
        self.kw = kwargs

    def __enter__(self):
        return self

    def __exit__(self, *exc_args):
        self.close()

    def close(self):
        """Close callback."""

    def branched(self, path_1, path_2, **kwargs):
        """
        Return callback for child transfers

        If this callback is operating at a higher level, e.g., put, which may
        trigger transfers that can also be monitored. The function returns a callback
        that has to be passed to the child method, e.g., put_file,
        as `callback=` argument.

        The implementation uses `callback.branch` for compatibility.
        When implementing callbacks, it is recommended to override this function instead
        of `branch` and avoid calling `super().branched(...)`.

        Prefer using this function over `branch`.

        Parameters
        ----------
        path_1: str
            Child's source path
        path_2: str
            Child's destination path
        **kwargs:
            Arbitrary keyword arguments

        Returns
        -------
        callback: Callback
            A callback instance to be passed to the child method
        """
        self.branch(path_1, path_2, kwargs)
        # mutate kwargs so that we can force the caller to pass "callback=" explicitly
        return kwargs.pop("callback", DEFAULT_CALLBACK)

    def branch_coro(self, fn):
        """
        Wraps a coroutine, and pass a new child callback to it.
        """

        @wraps(fn)
        async def func(path1, path2: str, **kwargs):
            # The child callback is used as a context manager so it is
            # closed when the wrapped coroutine finishes.
            with self.branched(path1, path2, **kwargs) as child:
                return await fn(path1, path2, callback=child, **kwargs)

        return func

    def set_size(self, size):
        """
        Set the internal maximum size attribute

        Usually called if not initially set at instantiation. Note that this
        triggers a ``call()``.

        Parameters
        ----------
        size: int
        """
        self.size = size
        self.call()

    def absolute_update(self, value):
        """
        Set the internal value state

        Triggers ``call()``

        Parameters
        ----------
        value: int
        """
        self.value = value
        self.call()

    def relative_update(self, inc=1):
        """
        Delta increment the internal counter

        Triggers ``call()``

        Parameters
        ----------
        inc: int
        """
        self.value += inc
        self.call()

    def call(self, hook_name=None, **kwargs):
        """
        Execute hook(s) with current state

        Each function is passed the internal size and current value

        Parameters
        ----------
        hook_name: str or None
            If given, execute on this hook
        kwargs: passed on to (all) hook(s)
        """
        if not self.hooks:
            return
        kw = self.kw.copy()
        kw.update(kwargs)
        if hook_name:
            if hook_name not in self.hooks:
                return
            return self.hooks[hook_name](self.size, self.value, **kw)
        for hook in self.hooks.values() or []:
            hook(self.size, self.value, **kw)

    def wrap(self, iterable):
        """
        Wrap an iterable to call ``relative_update`` on each iterations

        Parameters
        ----------
        iterable: Iterable
            The iterable that is being wrapped
        """
        for item in iterable:
            self.relative_update()
            yield item

    def branch(self, path_1, path_2, kwargs):
        """
        Set callbacks for child transfers

        If this callback is operating at a higher level, e.g., put, which may
        trigger transfers that can also be monitored. The passed kwargs are
        to be *mutated* to add ``callback=``, if this class supports branching
        to children.

        Parameters
        ----------
        path_1: str
            Child's source path
        path_2: str
            Child's destination path
        kwargs: dict
            arguments passed to child method, e.g., put_file.

        Returns
        -------

        """
        return None

    def no_op(self, *_, **__):
        pass

    def __getattr__(self, item):
        """
        If undefined methods are called on this class, nothing happens
        """
        # Any attribute not found normally resolves to the do-nothing
        # ``no_op`` method, so subclasses may implement only what they need.
        return self.no_op

    @classmethod
    def as_callback(cls, maybe_callback=None):
        """Transform callback=... into Callback instance

        For the special value of ``None``, return the global instance of
        ``NoOpCallback``. This is an alternative to including
        ``callback=DEFAULT_CALLBACK`` directly in a method signature.
        """
        if maybe_callback is None:
            return DEFAULT_CALLBACK
        return maybe_callback
205
+
206
+
207
class NoOpCallback(Callback):
    """
    Callback implementation that ignores every update.

    Used (via ``DEFAULT_CALLBACK``) whenever no callback is supplied.
    """

    def call(self, *args, **kwargs):
        # Deliberately discard all progress information.
        return None
214
+
215
+
216
class DotPrinterCallback(Callback):
    """
    Minimal demonstration Callback that prints one character per event.

    The outer (parent) instance prints "#" while branched child transfers
    print "." — showing how ``branch`` can install a distinct callback for
    nested operations.
    """

    def __init__(self, chr_to_print="#", **kwargs):
        super().__init__(**kwargs)
        self.chr = chr_to_print

    def branch(self, path_1, path_2, kwargs):
        """Mutate kwargs so child transfers report with a different char."""
        kwargs["callback"] = DotPrinterCallback(".")

    def call(self, **kwargs):
        """Emit a single character, without a trailing newline."""
        print(self.chr, end="")
235
+
236
+
237
class TqdmCallback(Callback):
    """
    A callback to display a progress bar using tqdm

    Parameters
    ----------
    tqdm_kwargs : dict, (optional)
        Any argument accepted by the tqdm constructor.
        See the `tqdm doc <https://tqdm.github.io/docs/tqdm/#__init__>`_.
        Will be forwarded to `tqdm_cls`.
    tqdm_cls: (optional)
        subclass of `tqdm.tqdm`. If not passed, it will default to `tqdm.tqdm`.

    Examples
    --------
    >>> import fsspec
    >>> from fsspec.callbacks import TqdmCallback
    >>> fs = fsspec.filesystem("memory")
    >>> path2distant_data = "/your-path"
    >>> fs.upload(
            ".",
            path2distant_data,
            recursive=True,
            callback=TqdmCallback(),
        )

    You can forward args to tqdm using the ``tqdm_kwargs`` parameter.

    >>> fs.upload(
            ".",
            path2distant_data,
            recursive=True,
            callback=TqdmCallback(tqdm_kwargs={"desc": "Your tqdm description"}),
        )

    You can also customize the progress bar by passing a subclass of `tqdm`.

    .. code-block:: python

        class TqdmFormat(tqdm):
            '''Provides a `total_time` format parameter'''
            @property
            def format_dict(self):
                d = super().format_dict
                total_time = d["elapsed"] * (d["total"] or 0) / max(d["n"], 1)
                d.update(total_time=self.format_interval(total_time) + " in total")
                return d

    >>> with TqdmCallback(
            tqdm_kwargs={
                "desc": "desc",
                "bar_format": "{total_time}: {percentage:.0f}%|{bar}{r_bar}",
            },
            tqdm_cls=TqdmFormat,
        ) as callback:
            fs.upload(".", path2distant_data, recursive=True, callback=callback)
    """

    def __init__(self, tqdm_kwargs=None, *args, **kwargs):
        try:
            from tqdm import tqdm

        except ImportError as exce:
            raise ImportError(
                "Using TqdmCallback requires tqdm to be installed"
            ) from exce

        self._tqdm_cls = kwargs.pop("tqdm_cls", tqdm)
        self._tqdm_kwargs = tqdm_kwargs or {}
        # The progress bar is created lazily on the first ``call``.
        self.tqdm = None
        super().__init__(*args, **kwargs)

    def call(self, *args, **kwargs):
        if self.tqdm is None:
            self.tqdm = self._tqdm_cls(total=self.size, **self._tqdm_kwargs)
        # Keep the bar's total in sync (size may be set after creation)
        # and advance it by however much ``value`` moved since last call.
        self.tqdm.total = self.size
        self.tqdm.update(self.value - self.tqdm.n)

    def close(self):
        # BUGFIX: use getattr so that __del__ on a partially-initialised
        # instance (e.g. when the tqdm import or super().__init__ raised
        # before ``self.tqdm`` was bound) does not raise AttributeError.
        if getattr(self, "tqdm", None) is not None:
            self.tqdm.close()
            self.tqdm = None

    def __del__(self):
        return self.close()
322
+
323
+
324
# Shared do-nothing singleton used when callback=None is passed; the second
# name is an alias bound to the same object (presumably kept for older code
# that imports ``_DEFAULT_CALLBACK`` — verify before removing).
DEFAULT_CALLBACK = _DEFAULT_CALLBACK = NoOpCallback()
.venv/lib/python3.11/site-packages/fsspec/compression.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Helper functions for a standard streaming compression API"""
2
+
3
+ from zipfile import ZipFile
4
+
5
+ import fsspec.utils
6
+ from fsspec.spec import AbstractBufferedFile
7
+
8
+
9
def noop_file(file, mode, **kwargs):
    """Identity "compression": hand back the input file object unchanged."""
    return file
11
+
12
+
13
# TODO: files should also be available as contexts
# should be functions of the form func(infile, mode=, **kwargs) -> file-like
# Registry: compression name -> wrapper callable; ``None`` means no
# compression (identity via ``noop_file``).
compr = {None: noop_file}
16
+
17
+
18
def register_compression(name, callback, extensions, force=False):
    """Register an "inferable" file compression type.

    Registers transparent file compression type for use with fsspec.open.
    Compression can be specified by name in open, or "infer"-ed for any files
    ending with the given extensions.

    Args:
        name: (str) The compression type name. Eg. "gzip".
        callback: A callable of form (infile, mode, **kwargs) -> file-like.
            Accepts an input file-like object, the target mode and kwargs.
            Returns a wrapped file-like object.
        extensions: (str, Iterable[str]) A file extension, or list of file
            extensions for which to infer this compression scheme. Eg. "gz".
        force: (bool) Force re-registration of compression type or extensions.

    Raises:
        ValueError: If name or extensions already registered, and not force.

    """
    # Accept a single extension as shorthand for a one-element list.
    if isinstance(extensions, str):
        extensions = [extensions]

    # Validate everything up front so nothing is mutated on failure.
    if not force:
        if name in compr:
            raise ValueError(f"Duplicate compression registration: {name}")
        for ext in extensions:
            if ext in fsspec.utils.compressions:
                raise ValueError(
                    f"Duplicate compression file extension: {ext} ({name})"
                )

    compr[name] = callback
    for ext in extensions:
        fsspec.utils.compressions[ext] = name
53
+
54
+
55
def unzip(infile, mode="rb", filename=None, **kwargs):
    """Treat ``infile`` as a ZIP archive and open one member of it.

    For reading, the member named ``filename`` (or the first member when
    ``filename`` is None) is opened; ``kwargs`` go to ``ZipFile.open``.
    For writing, a fresh archive is created around ``infile`` (``kwargs``
    go to the ``ZipFile`` constructor) and a single member is opened for
    writing; closing that member also finalises the archive.
    """
    if "r" in mode:
        archive = ZipFile(infile)
        member = filename if filename is not None else archive.namelist()[0]
        return archive.open(member, mode="r", **kwargs)
    # Write path: one member per archive.
    member = filename or "file"
    archive = ZipFile(infile, mode="w", **kwargs)
    handle = archive.open(member, mode="w")
    # Closing the member must also close the archive so the central
    # directory gets written.
    handle.close = lambda closer=handle.close: closer() or archive.close()
    return handle
66
+
67
+
68
register_compression("zip", unzip, "zip")

# Optional codecs: each is registered only if its module imports cleanly.
try:
    from bz2 import BZ2File
except ImportError:
    pass
else:
    register_compression("bz2", BZ2File, "bz2")

try:  # pragma: no cover
    # Prefer the ISA-L accelerated gzip implementation when available.
    from isal import igzip

    def isal(infile, mode="rb", **kwargs):
        return igzip.IGzipFile(fileobj=infile, mode=mode, **kwargs)

    register_compression("gzip", isal, "gz")
except ImportError:
    # Fall back to the stdlib gzip implementation.
    from gzip import GzipFile

    register_compression(
        "gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz"
    )

try:
    from lzma import LZMAFile

    register_compression("lzma", LZMAFile, "lzma")
    register_compression("xz", LZMAFile, "xz")
except ImportError:
    pass

try:
    # lzmaffi, when installed, replaces the stdlib lzma/xz handlers
    # registered just above (hence force=True).
    import lzmaffi

    register_compression("lzma", lzmaffi.LZMAFile, "lzma", force=True)
    register_compression("xz", lzmaffi.LZMAFile, "xz", force=True)
except ImportError:
    pass
106
+
107
+
108
class SnappyFile(AbstractBufferedFile):
    """Streaming snappy (de)compression wrapper around a file-like object.

    Read mode decompresses data pulled from ``infile``; write mode
    compresses buffered data and writes it to ``infile``. The stream is
    not seekable.
    """

    def __init__(self, infile, mode, **kwargs):
        import snappy

        super().__init__(
            fs=None, path="snappy", mode=mode.strip("b") + "b", size=999999999, **kwargs
        )
        self.infile = infile
        # Pick the codec direction based on the requested mode.
        if "r" in mode:
            self.codec = snappy.StreamDecompressor()
        else:
            self.codec = snappy.StreamCompressor()

    def _upload_chunk(self, final=False):
        # Compress everything currently buffered and push it downstream.
        self.buffer.seek(0)
        out = self.codec.add_chunk(self.buffer.read())
        self.infile.write(out)
        return True

    def seek(self, loc, whence=0):
        raise NotImplementedError("SnappyFile is not seekable")

    def seekable(self):
        return False

    def _fetch_range(self, start, end):
        """Get the specified set of bytes from remote"""
        # Sequential read only: pull (end - start) compressed bytes and
        # decompress them in order.
        data = self.infile.read(end - start)
        return self.codec.decompress(data)
137
+
138
+
139
try:
    import snappy

    # Probe that the bindings actually work, not merely import.
    snappy.compress(b"")
    # Snappy may use the .sz file extension, but this is not part of the
    # standard implementation.
    register_compression("snappy", SnappyFile, [])

except (ImportError, NameError, AttributeError):
    pass

try:
    import lz4.frame

    register_compression("lz4", lz4.frame.open, "lz4")
except ImportError:
    pass

try:
    import zstandard as zstd

    def zstandard_file(infile, mode="rb"):
        # Direction-specific wrapper: stream reader for reads, writer
        # (fixed level 10) for writes.
        if "r" in mode:
            cctx = zstd.ZstdDecompressor()
            return cctx.stream_reader(infile)
        else:
            cctx = zstd.ZstdCompressor(level=10)
            return cctx.stream_writer(infile)

    register_compression("zstd", zstandard_file, "zst")
except ImportError:
    pass
171
+
172
+
173
def available_compressions():
    """Return the registered compression names (``None`` = no compression)."""
    return [*compr]
.venv/lib/python3.11/site-packages/fsspec/config.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import configparser
4
+ import json
5
+ import os
6
+ import warnings
7
+ from typing import Any
8
+
9
# Global configuration: protocol name -> default kwargs for that filesystem.
conf: dict[str, dict[str, Any]] = {}
# Directory scanned for .ini/.json config files; overridable via the
# FSSPEC_CONFIG_DIR environment variable.
default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/fsspec")
conf_dir = os.environ.get("FSSPEC_CONFIG_DIR", default_conf_dir)
12
+
13
+
14
def set_conf_env(conf_dict, envdict=os.environ):
    """Set config values from environment variables

    Looks for variables of the form ``FSSPEC_<protocol>`` and
    ``FSSPEC_<protocol>_<kwarg>``. For ``FSSPEC_<protocol>`` the value is parsed
    as a json dictionary and used to ``update`` the config of the
    corresponding protocol. For ``FSSPEC_<protocol>_<kwarg>`` there is no
    attempt to convert the string value, but the kwarg keys will be lower-cased.

    The ``FSSPEC_<protocol>_<kwarg>`` variables are applied after the
    ``FSSPEC_<protocol>`` ones.

    Parameters
    ----------
    conf_dict : dict(str, dict)
        This dict will be mutated
    envdict : dict-like(str, str)
        Source for the values - usually the real environment
    """
    deferred_kwarg_vars = []
    for name in envdict:
        if not name.startswith("FSSPEC"):
            continue
        if not (name.startswith("FSSPEC_") and len(name) > 7 and name[7] != "_"):
            # Starts with FSSPEC but is not FSSPEC_<something-sane>.
            warnings.warn(
                f"Ignoring environment variable {name} due to having an unexpected name"
            )
            continue
        if name.count("_") > 1:
            # FSSPEC_<proto>_<kwarg>: applied last, after the dict-style vars.
            deferred_kwarg_vars.append(name)
            continue
        try:
            parsed = json.loads(envdict[name])
        except json.decoder.JSONDecodeError as ex:
            warnings.warn(
                f"Ignoring environment variable {name} due to a parse failure: {ex}"
            )
            continue
        if not isinstance(parsed, dict):
            warnings.warn(
                f"Ignoring environment variable {name} due to not being a dict:"
                f" {type(parsed)}"
            )
            continue
        _, proto = name.split("_", 1)
        conf_dict.setdefault(proto.lower(), {}).update(parsed)

    # Individual kwargs override anything from the dict-style variables.
    for name in deferred_kwarg_vars:
        _, proto, kwarg = name.split("_", 2)
        conf_dict.setdefault(proto.lower(), {})[kwarg.lower()] = envdict[name]
62
+
63
+
64
def set_conf_files(cdir, conf_dict):
    """Set config values from files

    Scans the given directory for INI and JSON files and folds their
    contents into the config, later (alphabetically sorted) files winning
    on repeated values. INI values are kept as strings, unconverted.

    Parameters
    ----------
    cdir : str
        Directory to search
    conf_dict : dict(str, dict)
        This dict will be mutated
    """
    if not os.path.isdir(cdir):
        return
    for fname in sorted(os.listdir(cdir)):
        full = os.path.join(cdir, fname)
        if fname.endswith(".ini"):
            parser = configparser.ConfigParser()
            parser.read(full)
            for section in parser:
                # configparser always exposes a DEFAULT section; skip it.
                if section == "DEFAULT":
                    continue
                conf_dict.setdefault(section, {}).update(dict(parser[section]))
        if fname.endswith(".json"):
            with open(full) as f:
                loaded = json.load(f)
            for section in loaded:
                conf_dict.setdefault(section, {}).update(dict(loaded[section]))
97
+
98
+
99
def apply_config(cls, kwargs, conf_dict=None):
    """Supply default values for kwargs when instantiating class

    Augments the passed kwargs, by finding entries in the config dict
    which match the classes ``.protocol`` attribute (one or more str)

    Parameters
    ----------
    cls : file system implementation
    kwargs : dict
    conf_dict : dict of dict
        Typically this is the global configuration

    Returns
    -------
    dict : the modified set of kwargs
    """
    if conf_dict is None:
        conf_dict = conf
    # ``protocol`` may be a single string or a tuple/list of aliases.
    if isinstance(cls.protocol, (tuple, list)):
        protos = cls.protocol
    else:
        protos = [cls.protocol]
    merged = {}
    for proto in protos:
        # Defaults from the current state of the config.
        merged.update(conf_dict.get(proto, {}))
    # Explicit kwargs always win over configured defaults.
    merged.update(**kwargs)
    return merged
128
+
129
+
130
+ set_conf_files(conf_dir, conf)
131
+ set_conf_env(conf)
.venv/lib/python3.11/site-packages/fsspec/conftest.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+ import sys
5
+ import time
6
+
7
+ import pytest
8
+
9
+ import fsspec
10
+ from fsspec.implementations.cached import CachingFileSystem
11
+
12
+
13
@pytest.fixture()
def m():
    """
    Fixture providing a memory filesystem.

    The shared in-memory store is wiped before the test and again on
    teardown, leaving only the root pseudo-directory.
    """
    fs = fsspec.filesystem("memory")

    def _reset(filesystem):
        # wipe all stored files and restore the single root pseudo-directory
        filesystem.store.clear()
        filesystem.pseudo_dirs.clear()
        filesystem.pseudo_dirs.append("")

    _reset(fs)
    try:
        yield fs
    finally:
        _reset(fs)
28
+
29
+
30
@pytest.fixture
def ftp_writable(tmpdir):
    """
    Fixture providing a writable FTP filesystem.

    Starts a pyftpdlib server as a subprocess serving ``tmpdir`` (seeded
    with one 50000-byte file named "out") and yields the connection tuple
    ``(host, port, username, password)``. The server process is terminated
    and the directory removed on teardown.
    """
    pytest.importorskip("pyftpdlib")
    from fsspec.implementations.ftp import FTPFileSystem

    FTPFileSystem.clear_instance_cache()  # remove lingering connections
    CachingFileSystem.clear_instance_cache()
    d = str(tmpdir)
    # seed the served directory with a known file for read tests
    with open(os.path.join(d, "out"), "wb") as f:
        f.write(b"hello" * 10000)
    # -d: directory to serve; -u/-P: credentials; -w: allow writes
    P = subprocess.Popen(
        [sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"]
    )
    try:
        # give the server a moment to start listening (pyftpdlib default port 2121)
        time.sleep(1)
        yield "localhost", 2121, "user", "pass"
    finally:
        P.terminate()
        P.wait()
        # best-effort cleanup; the server may still hold files briefly
        try:
            shutil.rmtree(tmpdir)
        except Exception:
            pass
.venv/lib/python3.11/site-packages/fsspec/core.py ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import logging
5
+ import os
6
+ import re
7
+ from glob import has_magic
8
+ from pathlib import Path
9
+
10
+ # for backwards compat, we export cache things from here too
11
+ from fsspec.caching import ( # noqa: F401
12
+ BaseCache,
13
+ BlockCache,
14
+ BytesCache,
15
+ MMapCache,
16
+ ReadAheadCache,
17
+ caches,
18
+ )
19
+ from fsspec.compression import compr
20
+ from fsspec.config import conf
21
+ from fsspec.registry import filesystem, get_filesystem_class
22
+ from fsspec.utils import (
23
+ _unstrip_protocol,
24
+ build_name_function,
25
+ infer_compression,
26
+ stringify_path,
27
+ )
28
+
29
+ logger = logging.getLogger("fsspec")
30
+
31
+
32
class OpenFile:
    """
    File-like object to be used in a context

    Can layer (buffered) text-mode and compression over any file-system, which
    are typically binary-only.

    These instances are safe to serialize, as the low-level file object
    is not created until invoked using ``with``.

    Parameters
    ----------
    fs: FileSystem
        The file system to use for opening the file. Should be a subclass or duck-type
        with ``fsspec.spec.AbstractFileSystem``
    path: str
        Location to open
    mode: str like 'rb', optional
        Mode of the opened file
    compression: str or None, optional
        Compression to apply
    encoding: str or None, optional
        The encoding to use if opened in text mode.
    errors: str or None, optional
        How to handle encoding errors if opened in text mode.
    newline: None or str
        Passed to TextIOWrapper in text mode, how to handle line endings.
    """

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        compression=None,
        encoding=None,
        errors=None,
        newline=None,
    ):
        self.fs = fs
        self.path = path
        self.mode = mode
        # "infer" is resolved from the filename suffix here, at construction
        self.compression = get_compression(path, compression)
        self.encoding = encoding
        self.errors = errors
        self.newline = newline
        # stack of layered file objects: raw, then optional decompressor,
        # then optional text wrapper; filled by __enter__
        self.fobjects = []

    def __reduce__(self):
        # pickle only the recipe, never the live handles in ``fobjects``
        return (
            OpenFile,
            (
                self.fs,
                self.path,
                self.mode,
                self.compression,
                self.encoding,
                self.errors,
                self.newline,
            ),
        )

    def __repr__(self):
        return f"<OpenFile '{self.path}'>"

    def __enter__(self):
        # always open the raw file in binary mode; text is layered on top below
        mode = self.mode.replace("t", "").replace("b", "") + "b"

        try:
            f = self.fs.open(self.path, mode=mode)
        except FileNotFoundError as e:
            # a glob-looking path that does not exist usually means the caller
            # forgot to expand it; give a pointed hint
            if has_magic(self.path):
                raise FileNotFoundError(
                    "%s not found. The URL contains glob characters: you maybe needed\n"
                    "to pass expand=True in fsspec.open() or the storage_options of \n"
                    "your library. You can also set the config value 'open_expand'\n"
                    "before import, or fsspec.core.DEFAULT_EXPAND at runtime, to True.",
                    self.path,
                ) from e
            raise

        self.fobjects = [f]

        if self.compression is not None:
            compress = compr[self.compression]
            f = compress(f, mode=mode[0])
            self.fobjects.append(f)

        if "b" not in self.mode:
            # assume, for example, that 'r' is equivalent to 'rt' as in builtin
            f = PickleableTextIOWrapper(
                f, encoding=self.encoding, errors=self.errors, newline=self.newline
            )
            self.fobjects.append(f)

        # the outermost layer is what the caller interacts with
        return self.fobjects[-1]

    def __exit__(self, *args):
        self.close()

    @property
    def full_name(self):
        # path with the filesystem's protocol prefix re-attached
        return _unstrip_protocol(self.path, self.fs)

    def open(self):
        """Materialise this as a real open file without context

        The OpenFile object should be explicitly closed to avoid enclosed file
        instances persisting. You must, therefore, keep a reference to the OpenFile
        during the life of the file-like it generates.
        """
        return self.__enter__()

    def close(self):
        """Close all encapsulated file objects"""
        # close outermost-first so each wrapper can flush into the layer below
        for f in reversed(self.fobjects):
            if "r" not in self.mode and not f.closed:
                f.flush()
            f.close()
        self.fobjects.clear()
156
+
157
+
158
class OpenFiles(list):
    """List of OpenFile instances

    Can be used in a single context, which opens and closes all of the
    contained files. Normal list access to get the elements works as
    normal.

    A special case is made for caching filesystems - the files will
    be down/uploaded together at the start or end of the context, and
    this may happen concurrently, if the target filesystem supports it.
    """

    def __init__(self, *args, mode="rb", fs=None):
        self.mode = mode
        self.fs = fs
        # populated by ``open_many`` in __enter__ for batch-capable filesystems
        self.files = []
        super().__init__(*args)

    def __enter__(self):
        if self.fs is None:
            raise ValueError("Context has already been used")

        fs = self.fs
        # walk down any wrapper filesystems (fs.fs) looking for batch support
        while True:
            if hasattr(fs, "open_many"):
                # check for concurrent cache download; or set up for upload
                self.files = fs.open_many(self)
                return self.files
            if hasattr(fs, "fs") and fs.fs is not None:
                fs = fs.fs
            else:
                break
        # no batch support anywhere in the chain: open each file individually
        return [s.__enter__() for s in self]

    def __exit__(self, *args):
        fs = self.fs
        [s.__exit__(*args) for s in self]
        if "r" not in self.mode:
            # write mode: let a batch-capable filesystem commit the uploads
            while True:
                if hasattr(fs, "open_many"):
                    # check for concurrent cache upload
                    fs.commit_many(self.files)
                    return
                if hasattr(fs, "fs") and fs.fs is not None:
                    fs = fs.fs
                else:
                    break

    def __getitem__(self, item):
        out = super().__getitem__(item)
        if isinstance(item, slice):
            # slicing returns another context-capable OpenFiles
            return OpenFiles(out, mode=self.mode, fs=self.fs)
        return out

    def __repr__(self):
        return f"<List of {len(self)} OpenFile instances>"
214
+
215
+
216
def open_files(
    urlpath,
    mode="rb",
    compression=None,
    encoding="utf8",
    errors=None,
    name_function=None,
    num=1,
    protocol=None,
    newline=None,
    auto_mkdir=True,
    expand=True,
    **kwargs,
):
    """Given a path or paths, return a list of ``OpenFile`` objects.

    For writing, a str path must contain the "*" character, which will be filled
    in by increasing numbers, e.g., "part*" -> "part1", "part2" if num=2.

    For either reading or writing, can instead provide explicit list of paths.

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
        to read from alternative filesystems. To read from multiple files you
        can pass a globstring or a list of paths, with the caveat that they
        must all have the same protocol.
    mode: 'rb', 'wt', etc.
    compression: string or None
        If given, open file using compression codec. Can either be a compression
        name (a key in ``fsspec.compression.compr``) or "infer" to guess the
        compression from the filename suffix.
    encoding: str
        For text mode only
    errors: None or str
        Passed to TextIOWrapper in text mode
    name_function: function or None
        if opening a set of files for writing, those files do not yet exist,
        so we need to generate their names by formatting the urlpath for
        each sequence number
    num: int [1]
        if writing mode, number of files we expect to create (passed to
        name_function)
    protocol: str or None
        If given, overrides the protocol found in the URL.
    newline: bytes or None
        Used for line terminator in text mode. If None, uses system default;
        if blank, uses no translation.
    auto_mkdir: bool (True)
        If in write mode, this will ensure the target directory exists before
        writing, by calling ``fs.mkdirs(exist_ok=True)``.
    expand: bool
    **kwargs: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Examples
    --------
    >>> files = open_files('2015-*-*.csv')  # doctest: +SKIP
    >>> files = open_files(
    ...     's3://bucket/2015-*-*.csv.gz', compression='gzip'
    ... )  # doctest: +SKIP

    Returns
    -------
    An ``OpenFiles`` instance, which is a list of ``OpenFile`` objects that can
    be used as a single context

    Notes
    -----
    For a full list of the available protocols and the implementations that
    they map across to see the latest online documentation:

    - For implementations built into ``fsspec`` see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
    - For implementations in separate packages see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
    """
    fs, fs_token, paths = get_fs_token_paths(
        urlpath,
        mode,
        num=num,
        name_function=name_function,
        storage_options=kwargs,
        protocol=protocol,
        expand=expand,
    )
    if fs.protocol == "file":
        # the local filesystem creates directories itself when asked to
        fs.auto_mkdir = auto_mkdir
    elif "r" not in mode and auto_mkdir:
        # pre-create every distinct parent directory before writing
        for parent in {fs._parent(p) for p in paths}:
            try:
                fs.makedirs(parent, exist_ok=True)
            except PermissionError:
                pass
    openfiles = [
        OpenFile(
            fs,
            p,
            mode=mode,
            compression=compression,
            encoding=encoding,
            errors=errors,
            newline=newline,
        )
        for p in paths
    ]
    return OpenFiles(openfiles, mode=mode, fs=fs)
329
+
330
+
331
def _un_chain(path, kwargs):
    """Split a chained URL ("outer::inner://...") into its component links.

    Returns a list of ``(url, protocol, kwargs)`` tuples in the same
    left-to-right (outermost-first) order as the input string.
    """
    # Avoid a circular import
    from fsspec.implementations.cached import CachingFileSystem

    if "::" in path:
        x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
        bits = []
        for p in path.split("::"):
            if "://" in p or x.match(p):
                bits.append(p)
            else:
                # bare protocol word (e.g. "simplecache"): normalise to a URL
                bits.append(p + "://")
    else:
        bits = [path]
    # [[url, protocol, kwargs], ...]
    out = []
    previous_bit = None
    # copy: the loop destructively pops protocol-specific entries
    kwargs = kwargs.copy()
    for bit in reversed(bits):
        protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file"
        cls = get_filesystem_class(protocol)
        extra_kwargs = cls._get_kwargs_from_urls(bit)
        kws = kwargs.pop(protocol, {})
        if bit is bits[0]:
            # identity check: only the outermost link absorbs any remaining
            # un-namespaced kwargs
            kws.update(kwargs)
        # combine URL-derived kwargs with the explicit per-protocol kwargs
        kw = dict(
            **{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
            **kws,
        )
        bit = cls._strip_protocol(bit)
        if "target_protocol" not in kw and issubclass(cls, CachingFileSystem):
            # caching layers address the path of the link they wrap, not
            # their own; substitute the previously processed (inner) path
            bit = previous_bit
        out.append((bit, protocol, kw))
        previous_bit = bit
    # restore outermost-first order (the loop walked innermost-first)
    out.reverse()
    return out
367
+
368
+
369
def url_to_fs(url, **kwargs):
    """
    Turn fully-qualified and potentially chained URL into filesystem instance

    Parameters
    ----------
    url : str
        The fsspec-compatible URL
    **kwargs: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Returns
    -------
    filesystem : FileSystem
        The new filesystem discovered from ``url`` and created with
        ``**kwargs``.
    urlpath : str
        The file-systems-specific URL for ``url``.
    """
    url = stringify_path(url)
    # discard arguments that belong to fsspec.open() rather than a filesystem
    # (inspect could keep this in sync with open()'s signature)
    known_kwargs = {
        "compression",
        "encoding",
        "errors",
        "expand",
        "mode",
        "name_function",
        "newline",
        "num",
    }
    fs_kwargs = {k: v for k, v in kwargs.items() if k not in known_kwargs}
    chain = _un_chain(url, fs_kwargs)
    # Walk the chain innermost-first, nesting each link's options inside the
    # next via target_protocol/target_options/fo
    nested = {}
    last = len(chain) - 1
    for position, (link_url, link_protocol, link_kw) in enumerate(reversed(chain)):
        if position == last:
            # outermost link: merge its kwargs with the accumulated nesting
            nested = dict(**link_kw, **nested)
        else:
            nested = {
                "target_options": dict(**link_kw, **nested),
                "target_protocol": link_protocol,
                "fo": link_url,
            }
    urlpath, protocol, _ = chain[0]
    fs = filesystem(protocol, **nested)
    return fs, urlpath
417
+
418
+
419
# Module-wide default for the ``expand`` argument of fsspec.open(); seeded
# once at import time from the "open_expand" config key (False when unset).
DEFAULT_EXPAND = conf.get("open_expand", False)
420
+
421
+
422
def open(
    urlpath,
    mode="rb",
    compression=None,
    encoding="utf8",
    errors=None,
    protocol=None,
    newline=None,
    expand=None,
    **kwargs,
):
    """Given a path or paths, return one ``OpenFile`` object.

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath. Prefix with a protocol like ``s3://``
        to read from alternative filesystems. Should not include glob
        character(s).
    mode: 'rb', 'wt', etc.
    compression: string or None
        If given, open file using compression codec. Can either be a compression
        name (a key in ``fsspec.compression.compr``) or "infer" to guess the
        compression from the filename suffix.
    encoding: str
        For text mode only
    errors: None or str
        Passed to TextIOWrapper in text mode
    protocol: str or None
        If given, overrides the protocol found in the URL.
    newline: bytes or None
        Used for line terminator in text mode. If None, uses system default;
        if blank, uses no translation.
    expand: bool or None
        Whether to regard file paths containing special glob characters as needing
        expansion (finding the first match) or absolute. Setting False allows using
        paths which do embed such characters. If None (default), this argument
        takes its value from the DEFAULT_EXPAND module variable, which takes
        its initial value from the "open_expand" config value at startup, which will
        be False if not set.
    **kwargs: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Examples
    --------
    >>> openfile = open('2015-01-01.csv')  # doctest: +SKIP
    >>> openfile = open(
    ...     's3://bucket/2015-01-01.csv.gz', compression='gzip'
    ... )  # doctest: +SKIP
    >>> with openfile as f:
    ...     df = pd.read_csv(f)  # doctest: +SKIP
    ...

    Returns
    -------
    ``OpenFile`` object.

    Notes
    -----
    For a full list of the available protocols and the implementations that
    they map across to see the latest online documentation:

    - For implementations built into ``fsspec`` see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
    - For implementations in separate packages see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
    """
    expand = DEFAULT_EXPAND if expand is None else expand
    # delegate to open_files with a single-element list and take the one result
    out = open_files(
        urlpath=[urlpath],
        mode=mode,
        compression=compression,
        encoding=encoding,
        errors=errors,
        protocol=protocol,
        newline=newline,
        expand=expand,
        **kwargs,
    )
    if not out:
        raise FileNotFoundError(urlpath)
    return out[0]
505
+
506
+
507
def open_local(
    url: str | list[str] | Path | list[Path],
    mode: str = "rb",
    **storage_options: dict,
) -> str | list[str]:
    """Open file(s) which can be resolved to local

    For files which either are local, or get downloaded upon open
    (e.g., by file caching)

    Parameters
    ----------
    url: str or list(str)
    mode: str
        Must be read mode
    storage_options:
        passed on to FS for or used by open_files (e.g., compression)
    """
    if "r" not in mode:
        raise ValueError("Can only ensure local files when reading")
    openfiles = open_files(url, mode=mode, **storage_options)
    fs = openfiles[0].fs
    if not getattr(fs, "local_file", False):
        raise ValueError(
            "open_local can only be used on a filesystem which"
            " has attribute local_file=True"
        )
    # entering the context materialises the files locally
    with openfiles as handles:
        local_paths = [handle.name for handle in handles]
    # scalar input (non-glob string or Path) yields a scalar result
    single = isinstance(url, Path) or (isinstance(url, str) and not has_magic(url))
    return local_paths[0] if single else local_paths
538
+
539
+
540
def get_compression(urlpath, compression):
    """Resolve the compression argument, inferring from the path if requested."""
    resolved = infer_compression(urlpath) if compression == "infer" else compression
    if resolved is not None and resolved not in compr:
        raise ValueError(f"Compression type {resolved} not supported")
    return resolved
546
+
547
+
548
def split_protocol(urlpath):
    """Return (protocol, path) pair for a URL-like path."""
    urlpath = stringify_path(urlpath)
    scheme, sep, remainder = urlpath.partition("://")
    # a single-letter "protocol" is really a Windows drive letter, not a scheme
    if sep and len(scheme) > 1:
        return scheme, remainder
    if urlpath.startswith("data:"):
        return urlpath.split(":", 1)
    return None, urlpath
559
+
560
+
561
def strip_protocol(urlpath):
    """Return only path part of full URL, according to appropriate backend"""
    protocol = split_protocol(urlpath)[0]
    filesystem_class = get_filesystem_class(protocol)
    return filesystem_class._strip_protocol(urlpath)
566
+
567
+
568
def expand_paths_if_needed(paths, mode, num, fs, name_function):
    """Expand paths if they have a ``*`` in them (write mode) or any of ``*?[]``
    in them (read mode).

    :param paths: list of paths
    mode: str
        Mode in which to open files.
    num: int
        If opening in writing mode, number of files we expect to create.
    fs: filesystem object
    name_function: callable
        If opening in writing mode, this callable is used to generate path
        names. Names are generated for each partition by
        ``urlpath.replace('*', name_function(partition_index))``.
    :return: list of paths
    """
    expanded_paths = []
    paths = list(paths)

    # "x" (exclusive create) is a write mode too, matching the handling in
    # get_fs_token_paths; output names must be generated, never globbed
    if "w" in mode or "x" in mode:  # write mode
        # only a single "*" mask makes sense when generating output names
        if sum(1 for p in paths if "*" in p) > 1:
            raise ValueError(
                "When writing data, only one filename mask can be specified."
            )
        num = max(num, len(paths))

        for curr_path in paths:
            if "*" in curr_path:
                # expand using name_function
                expanded_paths.extend(_expand_paths(curr_path, name_function, num))
            else:
                expanded_paths.append(curr_path)
        # if we generated more paths than asked for, trim the list
        if len(expanded_paths) > num:
            expanded_paths = expanded_paths[:num]

    else:  # read mode
        for curr_path in paths:
            if has_magic(curr_path):
                # expand using glob
                expanded_paths.extend(fs.glob(curr_path))
            else:
                expanded_paths.append(curr_path)

    return expanded_paths
613
+
614
+
615
def get_fs_token_paths(
    urlpath,
    mode="rb",
    num=1,
    name_function=None,
    storage_options=None,
    protocol=None,
    expand=True,
):
    """Filesystem, deterministic token, and paths from a urlpath and options.

    Parameters
    ----------
    urlpath: string or iterable
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    mode: str, optional
        Mode in which to open files.
    num: int, optional
        If opening in writing mode, number of files we expect to create.
    name_function: callable, optional
        If opening in writing mode, this callable is used to generate path
        names. Names are generated for each partition by
        ``urlpath.replace('*', name_function(partition_index))``.
    storage_options: dict, optional
        Additional keywords to pass to the filesystem class.
    protocol: str or None
        To override the protocol specifier in the URL
    expand: bool
        Expand string paths for writing, assuming the path is a directory
    """
    if isinstance(urlpath, (list, tuple, set)):
        if not urlpath:
            raise ValueError("empty urlpath sequence")
        # the first element determines protocol and filesystem options
        urlpath0 = stringify_path(next(iter(urlpath)))
    else:
        urlpath0 = stringify_path(urlpath)
    storage_options = storage_options or {}
    if protocol:
        # explicit protocol overrides whatever the URL specifies
        storage_options["protocol"] = protocol
    chain = _un_chain(urlpath0, storage_options or {})
    inkwargs = {}
    # Reverse iterate the chain, creating a nested target_* structure
    for i, ch in enumerate(reversed(chain)):
        urls, nested_protocol, kw = ch
        if i == len(chain) - 1:
            # outermost link: merge its kwargs with the accumulated nesting
            inkwargs = dict(**kw, **inkwargs)
            continue
        inkwargs["target_options"] = dict(**kw, **inkwargs)
        inkwargs["target_protocol"] = nested_protocol
        inkwargs["fo"] = urls
    paths, protocol, _ = chain[0]
    fs = filesystem(protocol, **inkwargs)
    if isinstance(urlpath, (list, tuple, set)):
        # un-chain each element, requiring that all share one protocol
        pchains = [
            _un_chain(stringify_path(u), storage_options or {})[0] for u in urlpath
        ]
        if len({pc[1] for pc in pchains}) > 1:
            raise ValueError("Protocol mismatch getting fs from %s", urlpath)
        paths = [pc[0] for pc in pchains]
    else:
        paths = fs._strip_protocol(paths)
    if isinstance(paths, (list, tuple, set)):
        if expand:
            paths = expand_paths_if_needed(paths, mode, num, fs, name_function)
        elif not isinstance(paths, list):
            paths = list(paths)
    else:
        if ("w" in mode or "x" in mode) and expand:
            # writing: generate output names from the "*" template
            paths = _expand_paths(paths, name_function, num)
        elif "*" in paths:
            # reading: glob, excluding directories
            paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
        else:
            paths = [paths]

    return fs, fs._fs_token, paths
691
+
692
+
693
+ def _expand_paths(path, name_function, num):
694
+ if isinstance(path, str):
695
+ if path.count("*") > 1:
696
+ raise ValueError("Output path spec must contain exactly one '*'.")
697
+ elif "*" not in path:
698
+ path = os.path.join(path, "*.part")
699
+
700
+ if name_function is None:
701
+ name_function = build_name_function(num - 1)
702
+
703
+ paths = [path.replace("*", name_function(i)) for i in range(num)]
704
+ if paths != sorted(paths):
705
+ logger.warning(
706
+ "In order to preserve order between partitions"
707
+ " paths created with ``name_function`` should "
708
+ "sort to partition order"
709
+ )
710
+ elif isinstance(path, (tuple, list)):
711
+ assert len(path) == num
712
+ paths = list(path)
713
+ else:
714
+ raise ValueError(
715
+ "Path should be either\n"
716
+ "1. A list of paths: ['foo.json', 'bar.json', ...]\n"
717
+ "2. A directory: 'foo/\n"
718
+ "3. A path with a '*' in it: 'foo.*.json'"
719
+ )
720
+ return paths
721
+
722
+
723
class PickleableTextIOWrapper(io.TextIOWrapper):
    """A picklable variant of :class:`io.TextIOWrapper`.

    Plain TextIOWrapper instances cannot be pickled; this subclass records
    its constructor arguments and rebuilds itself from them on unpickling.
    Requires that ``buffer`` itself be pickleable, which all instances of
    AbstractBufferedFile are.
    """

    def __init__(
        self,
        buffer,
        encoding=None,
        errors=None,
        newline=None,
        line_buffering=False,
        write_through=False,
    ):
        # remember the exact constructor arguments so __reduce__ can
        # recreate an equivalent wrapper
        self.args = (
            buffer,
            encoding,
            errors,
            newline,
            line_buffering,
            write_through,
        )
        super().__init__(*self.args)

    def __reduce__(self):
        # rebuild by calling the class again with the saved arguments
        return PickleableTextIOWrapper, self.args
.venv/lib/python3.11/site-packages/fsspec/dircache.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from collections.abc import MutableMapping
3
+ from functools import lru_cache
4
+
5
+
6
class DirCache(MutableMapping):
    """
    Caching of directory listings, in a structure like::

        {"path0": [
            {"name": "path0/file0",
             "size": 123,
             "type": "file",
             ...
            },
            {"name": "path0/file1",
            },
            ...
            ],
         "path1": [...]
        }

    Parameters to this class control listing expiry or indeed turn
    caching off
    """

    def __init__(
        self,
        use_listings_cache=True,
        listings_expiry_time=None,
        max_paths=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        use_listings_cache: bool
            If False, this cache never returns items, but always reports KeyError,
            and setting items has no effect
        listings_expiry_time: int or float (optional)
            Time in seconds that a listing is considered valid. If None,
            listings do not expire.
        max_paths: int (optional)
            The number of most recent listings that are considered valid; 'recent'
            refers to when the entry was set.
        """
        self._cache = {}
        # per-key timestamp of when the listing was stored (only maintained
        # when listings_expiry_time is set)
        self._times = {}
        if max_paths:
            # LRU bookkeeping: the first call of ``_q`` for a key pops any
            # stale entry from ``_cache``; further calls are memoised hits.
            # Keys that fall out of the lru memo are dropped from ``_cache``
            # lazily, on their next access.
            self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(key, None))
        self.use_listings_cache = use_listings_cache
        self.listings_expiry_time = listings_expiry_time
        self.max_paths = max_paths

    def __getitem__(self, item):
        if self.listings_expiry_time is not None:
            if self._times.get(item, 0) - time.time() < -self.listings_expiry_time:
                # expired: drop both the listing and its timestamp
                # (the timestamp was previously leaked, growing forever)
                self._times.pop(item, None)
                del self._cache[item]
        if self.max_paths:
            self._q(item)
        return self._cache[item]  # maybe raises KeyError

    def clear(self):
        self._cache.clear()
        self._times.clear()

    def __len__(self):
        return len(self._cache)

    def __contains__(self, item):
        try:
            self[item]
            return True
        except KeyError:
            return False

    def __setitem__(self, key, value):
        if not self.use_listings_cache:
            return
        if self.max_paths:
            self._q(key)
        self._cache[key] = value
        if self.listings_expiry_time is not None:
            self._times[key] = time.time()

    def __delitem__(self, key):
        del self._cache[key]
        # keep the timestamp dict in step with the cache
        self._times.pop(key, None)

    def __iter__(self):
        entries = list(self._cache)

        # re-check each key so expired entries are not yielded
        return (k for k in entries if k in self)

    def __reduce__(self):
        return (
            DirCache,
            (self.use_listings_cache, self.listings_expiry_time, self.max_paths),
        )
.venv/lib/python3.11/site-packages/fsspec/exceptions.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ fsspec user-defined exception classes
3
+ """
4
+
5
+ import asyncio
6
+
7
+
8
class BlocksizeMismatchError(ValueError):
    """
    Raised when a cached file is opened with a different blocksize than it was
    written with.

    Subclasses ``ValueError`` so existing generic handlers still catch it.
    """
13
+
14
+
15
class FSTimeoutError(asyncio.TimeoutError):
    """
    Raised when an fsspec operation times out.

    Subclasses ``asyncio.TimeoutError`` so callers awaiting async fsspec
    operations can catch either exception type.
    """
.venv/lib/python3.11/site-packages/fsspec/fuse.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import stat
5
+ import threading
6
+ import time
7
+ from errno import EIO, ENOENT
8
+
9
+ from fuse import FUSE, FuseOSError, LoggingMixIn, Operations
10
+
11
+ from fsspec import __version__
12
+ from fsspec.core import url_to_fs
13
+
14
+ logger = logging.getLogger("fsspec.fuse")
15
+
16
+
17
class FUSEr(Operations):
    """fusepy ``Operations`` implementation backed by an fsspec filesystem.

    Translates kernel FUSE callbacks into fsspec calls, mapping paths under
    ``root`` on the given filesystem into the mounted directory. Open file
    objects are kept in ``self.cache``, keyed by an ever-increasing integer
    file handle (``self.counter``).
    """

    def __init__(self, fs, path, ready_file=False):
        """
        Parameters
        ----------
        fs: fsspec filesystem instance
            Backend providing the mounted files.
        path: str
            Root location on ``fs`` that is exposed at the mount point.
        ready_file: bool
            If True, a synthetic ``/.fuse_ready`` file is reported, which
            external processes can poll to detect that the mount is live.
        """
        self.fs = fs
        self.cache = {}  # file handle (int) -> open fsspec file object
        self.root = path.rstrip("/") + "/"
        self.counter = 0  # next file handle to hand out
        logger.info("Starting FUSE at %s", path)
        self._ready_file = ready_file

    def getattr(self, path, fh=None):
        """Return a stat-like dict for ``path``; ENOENT if it does not exist."""
        logger.debug("getattr %s", path)
        if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
            # synthetic readiness marker, served without touching the backend
            return {"type": "file", "st_size": 5}

        path = "".join([self.root, path.lstrip("/")]).rstrip("/")
        try:
            info = self.fs.info(path)
        except FileNotFoundError as exc:
            raise FuseOSError(ENOENT) from exc

        # fall back to uid/gid 1000 and permissive mode when the backend
        # does not report ownership/permissions
        data = {"st_uid": info.get("uid", 1000), "st_gid": info.get("gid", 1000)}
        perm = info.get("mode", 0o777)

        if info["type"] != "file":
            data["st_mode"] = stat.S_IFDIR | perm
            data["st_size"] = 0
            data["st_blksize"] = 0
        else:
            data["st_mode"] = stat.S_IFREG | perm
            data["st_size"] = info["size"]
            data["st_blksize"] = 5 * 2**20
            data["st_nlink"] = 1
        # backends that don't track timestamps get "now"
        data["st_atime"] = info["atime"] if "atime" in info else time.time()
        data["st_ctime"] = info["ctime"] if "ctime" in info else time.time()
        data["st_mtime"] = info["mtime"] if "mtime" in info else time.time()
        return data

    def readdir(self, path, fh):
        """List basenames of entries under ``path`` plus "." and ".."."""
        logger.debug("readdir %s", path)
        path = "".join([self.root, path.lstrip("/")])
        files = self.fs.ls(path, False)
        files = [os.path.basename(f.rstrip("/")) for f in files]
        return [".", ".."] + files

    def mkdir(self, path, mode):
        path = "".join([self.root, path.lstrip("/")])
        self.fs.mkdir(path)
        return 0

    def rmdir(self, path):
        path = "".join([self.root, path.lstrip("/")])
        self.fs.rmdir(path)
        return 0

    def read(self, path, size, offset, fh):
        """Read ``size`` bytes at ``offset`` from the open handle ``fh``."""
        logger.debug("read %s", (path, size, offset))
        if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
            # status indicator
            return b"ready"

        f = self.cache[fh]
        f.seek(offset)
        out = f.read(size)
        return out

    def write(self, path, data, offset, fh):
        """Write ``data`` at ``offset`` on handle ``fh``; returns bytes written."""
        logger.debug("write %s", (path, offset))
        f = self.cache[fh]
        f.seek(offset)
        f.write(data)
        return len(data)

    def create(self, path, flags, fi=None):
        """Create a new file and return a fresh file handle."""
        logger.debug("create %s", (path, flags))
        fn = "".join([self.root, path.lstrip("/")])
        self.fs.touch(fn)  # OS will want to get attributes immediately
        f = self.fs.open(fn, "wb")
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def open(self, path, flags):
        """Open a file for reading or writing and return a file handle."""
        logger.debug("open %s", (path, flags))
        fn = "".join([self.root, path.lstrip("/")])
        if flags % 2 == 0:
            # read (O_RDONLY has the low bit clear)
            mode = "rb"
        else:
            # write/create
            mode = "wb"
        self.cache[self.counter] = self.fs.open(fn, mode)
        self.counter += 1
        return self.counter - 1

    def truncate(self, path, length, fh=None):
        """Only truncation to zero length is supported (via touch)."""
        fn = "".join([self.root, path.lstrip("/")])
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.fs.touch(fn)

    def unlink(self, path):
        fn = "".join([self.root, path.lstrip("/")])
        try:
            self.fs.rm(fn, False)
        except (OSError, FileNotFoundError) as exc:
            raise FuseOSError(EIO) from exc

    def release(self, path, fh):
        """Close and forget the file object behind handle ``fh``.

        Must never propagate an exception back into the kernel, so failures
        are logged (previously they were print()ed to stdout).
        """
        try:
            if fh in self.cache:
                f = self.cache[fh]
                f.close()
                self.cache.pop(fh)
        except Exception:
            # log through the module logger instead of printing to stdout
            logger.exception("Failed to release %s (fh=%s)", path, fh)
        return 0

    def chmod(self, path, mode):
        """Delegate to the backend's chmod if it has one; else unsupported."""
        if hasattr(self.fs, "chmod"):
            path = "".join([self.root, path.lstrip("/")])
            return self.fs.chmod(path, mode)
        raise NotImplementedError
140
+
141
+
142
def run(
    fs,
    path,
    mount_point,
    foreground=True,
    threads=False,
    ready_file=False,
    ops_class=FUSEr,
):
    """Mount ``path`` of filesystem ``fs`` at the local directory ``mount_point``.

    Uses fusepy to present the given fsspec location as if it were resident
    in the local file-system. Requires fusepy to be installed and FUSE to be
    available on the system (typically via a package from apt, yum, brew...).

    Parameters
    ----------
    fs: file-system instance
        From one of the compatible implementations
    path: str
        Location on that file-system to regard as the root directory to
        mount. Note that you typically should include the terminating "/"
        character.
    mount_point: str
        An empty directory on the local file-system where the contents of
        the remote path will appear.
    foreground: bool
        Whether or not calling this function will block. Operation will
        typically be more stable if True.
    threads: bool
        Whether or not to create threads when responding to file operations
        within the mounted directory. Operation will typically be more
        stable if False.
    ready_file: bool
        Whether the FUSE process is ready. The ``.fuse_ready`` file will
        exist in the ``mount_point`` directory if True. Debugging purpose.
    ops_class: FUSEr or Subclass of FUSEr
        To override the default behavior of FUSEr. For example, logging
        to file.
    """

    def start_fuse():
        FUSE(
            ops_class(fs, path, ready_file=ready_file),
            mount_point,
            nothreads=not threads,
            foreground=foreground,
        )

    if not foreground:
        # run the mount on a daemon thread and hand it back to the caller
        worker = threading.Thread(target=start_fuse, daemon=True)
        worker.start()
        return worker
    else:  # pragma: no cover
        try:
            start_fuse()
        except KeyboardInterrupt:
            pass
202
+
203
+
204
def main(args):
    """Mount filesystem from chained URL to MOUNT_POINT.

    Examples:

    python3 -m fsspec.fuse memory /usr/share /tmp/mem

    python3 -m fsspec.fuse local /tmp/source /tmp/local \\
            -l /tmp/fsspecfuse.log

    You can also mount chained-URLs and use special settings:

    python3 -m fsspec.fuse 'filecache::zip::file://data.zip' \\
            / /tmp/zip \\
            -o 'filecache-cache_storage=/tmp/simplecache'

    You can specify the type of the setting by using `[int]` or `[bool]`,
    (`true`, `yes`, `1` represents the Boolean value `True`):

    python3 -m fsspec.fuse 'simplecache::ftp://ftp1.at.proftpd.org' \\
            /historic/packages/RPMS /tmp/ftp \\
            -o 'simplecache-cache_storage=/tmp/simplecache' \\
            -o 'simplecache-check_files=false[bool]' \\
            -o 'ftp-listings_expiry_time=60[int]' \\
            -o 'ftp-username=anonymous' \\
            -o 'ftp-password=xieyanbo'
    """

    class RawDescriptionArgumentParser(argparse.ArgumentParser):
        # Replace argparse's re-wrapped description with the raw docstring,
        # so the example commands above keep their formatting in --help.
        def format_help(self):
            usage = super().format_help()
            parts = usage.split("\n\n")
            parts[1] = self.description.rstrip()
            return "\n\n".join(parts)

    parser = RawDescriptionArgumentParser(prog="fsspec.fuse", description=main.__doc__)
    parser.add_argument("--version", action="version", version=__version__)
    parser.add_argument("url", type=str, help="fs url")
    parser.add_argument("source_path", type=str, help="source directory in fs")
    parser.add_argument("mount_point", type=str, help="local directory")
    parser.add_argument(
        "-o",
        "--option",
        action="append",
        help="Any options of protocol included in the chained URL",
    )
    parser.add_argument(
        "-l", "--log-file", type=str, help="Logging FUSE debug info (Default: '')"
    )
    # NOTE(review): the three flags below use action="store_false", so the
    # parsed values default to True and passing the flag sets them to False --
    # the opposite of the "(Default: False)" wording in the help text. Confirm
    # the intended behavior before changing; switching the action would change
    # the CLI's runtime defaults.
    parser.add_argument(
        "-f",
        "--foreground",
        action="store_false",
        help="Running in foreground or not (Default: False)",
    )
    parser.add_argument(
        "-t",
        "--threads",
        action="store_false",
        help="Running with threads support (Default: False)",
    )
    parser.add_argument(
        "-r",
        "--ready-file",
        action="store_false",
        help="The `.fuse_ready` file will exist after FUSE is ready. "
        "(Debugging purpose, Default: False)",
    )
    args = parser.parse_args(args)

    kwargs = {}
    for item in args.option or []:
        key, sep, value = item.partition("=")
        if not sep:
            parser.error(message=f"Wrong option: {item!r}")
        val = value.lower()
        # Optional type suffixes: "...[int]" / "...[bool]" coerce the value;
        # anything else stays a string.
        if val.endswith("[int]"):
            value = int(value[: -len("[int]")])
        elif val.endswith("[bool]"):
            value = val[: -len("[bool]")] in ["1", "yes", "true"]

        # "fsname-setting=value" targets a specific protocol of a chained
        # URL; a bare key applies at the top level.
        if "-" in key:
            fs_name, setting_name = key.split("-", 1)
            if fs_name in kwargs:
                kwargs[fs_name][setting_name] = value
            else:
                kwargs[fs_name] = {setting_name: value}
        else:
            kwargs[key] = value

    if args.log_file:
        logging.basicConfig(
            level=logging.DEBUG,
            filename=args.log_file,
            format="%(asctime)s %(message)s",
        )

        # mix in fusepy's LoggingMixIn so every FUSE operation is logged
        class LoggingFUSEr(FUSEr, LoggingMixIn):
            pass

        fuser = LoggingFUSEr
    else:
        fuser = FUSEr

    fs, url_path = url_to_fs(args.url, **kwargs)
    logger.debug("Mounting %s to %s", url_path, str(args.mount_point))
    run(
        fs,
        args.source_path,
        args.mount_point,
        foreground=args.foreground,
        threads=args.threads,
        ready_file=args.ready_file,
        ops_class=fuser,
    )
319
+
320
+
321
if __name__ == "__main__":
    # CLI entry point: ``python -m fsspec.fuse URL SOURCE_PATH MOUNT_POINT``
    import sys

    main(sys.argv[1:])
.venv/lib/python3.11/site-packages/fsspec/generic.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import inspect
4
+ import logging
5
+ import os
6
+ import shutil
7
+ import uuid
8
+ from typing import Optional
9
+
10
+ from .asyn import AsyncFileSystem, _run_coros_in_chunks, sync_wrapper
11
+ from .callbacks import DEFAULT_CALLBACK
12
+ from .core import filesystem, get_filesystem_class, split_protocol, url_to_fs
13
+
14
+ _generic_fs = {}
15
+ logger = logging.getLogger("fsspec.generic")
16
+
17
+
18
def set_generic_fs(protocol, **storage_options):
    """Pre-instantiate and register the filesystem that the "generic"
    resolution method will use for this protocol (see ``_resolve_fs``).
    """
    _generic_fs[protocol] = filesystem(protocol, **storage_options)
20
+
21
+
22
# Module-wide fallback for how backend filesystems are instantiated; see
# ``_resolve_fs`` for the recognized values.
default_method = "default"
23
+
24
+
25
def _resolve_fs(url, method=None, protocol=None, storage_options=None):
    """Pick the backend filesystem instance appropriate for ``url``.

    ``method`` selects the instantiation strategy: "default" builds a plain
    instance, "generic" looks one up in the pre-registered ``_generic_fs``
    dict, "current" reuses the most recently created instance, and
    "options" applies per-protocol ``storage_options``.
    """
    chosen = method or default_method
    proto = protocol or split_protocol(url)[0]
    opts = storage_options or {}
    if chosen == "default":
        return filesystem(proto)
    if chosen == "generic":
        return _generic_fs[proto]
    if chosen == "current":
        return get_filesystem_class(proto).current()
    if chosen == "options":
        return url_to_fs(url, **opts.get(proto, {}))[0]
    raise ValueError(f"Unknown FS resolution method: {chosen}")
41
+
42
+
43
def rsync(
    source,
    destination,
    delete_missing=False,
    source_field="size",
    dest_field="size",
    update_cond="different",
    inst_kwargs=None,
    fs=None,
    **kwargs,
):
    """Sync files between two directory trees

    (experimental)

    Parameters
    ----------
    source: str
        Root of the directory tree to take files from. This must be a directory, but
        do not include any terminating "/" character
    destination: str
        Root path to copy into. The contents of this location should be
        identical to the contents of ``source`` when done. This will be made a
        directory, and the terminal "/" should not be included.
    delete_missing: bool
        If there are paths in the destination that don't exist in the
        source and this is True, delete them. Otherwise, leave them alone.
    source_field: str | callable
        If ``update_field`` is "different", this is the key in the info
        of source files to consider for difference. Maybe a function of the
        info dict.
    dest_field: str | callable
        If ``update_field`` is "different", this is the key in the info
        of destination files to consider for difference. May be a function of
        the info dict.
    update_cond: "different"|"always"|"never"
        If "always", every file is copied, regardless of whether it exists in
        the destination. If "never", files that exist in the destination are
        not copied again. If "different" (default), only copy if the info
        fields given by ``source_field`` and ``dest_field`` (usually "size")
        are different. Other comparisons may be added in the future.
    inst_kwargs: dict|None
        If ``fs`` is None, use this set of keyword arguments to make a
        GenericFileSystem instance
    fs: GenericFileSystem|None
        Instance to use if explicitly given. The instance defines how to
        to make downstream file system instances from paths.

    Returns
    -------
    dict of the copy operations that were performed, {source: destination}
    """
    fs = fs or GenericFileSystem(**(inst_kwargs or {}))
    source = fs._strip_protocol(source)
    destination = fs._strip_protocol(destination)
    allfiles = fs.find(source, withdirs=True, detail=True)
    if not fs.isdir(source):
        raise ValueError("Can only rsync on a directory")
    otherfiles = fs.find(destination, withdirs=True, detail=True)
    # directories present in source but absent in destination must be created
    dirs = [
        a
        for a, v in allfiles.items()
        if v["type"] == "directory" and a.replace(source, destination) not in otherfiles
    ]
    logger.debug(f"{len(dirs)} directories to create")
    if dirs:
        fs.make_many_dirs(
            [dirn.replace(source, destination) for dirn in dirs], exist_ok=True
        )
    allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"}
    logger.debug(f"{len(allfiles)} files to consider for copy")
    to_delete = [
        o
        for o, v in otherfiles.items()
        if o.replace(destination, source) not in allfiles and v["type"] == "file"
    ]
    # Rewrite allfiles in-place into a {source_path: destination_path} map,
    # dropping entries that should not be copied. Iterate over a copy since
    # we mutate the dict.
    for k, v in allfiles.copy().items():
        otherfile = k.replace(source, destination)
        if otherfile in otherfiles:
            if update_cond == "always":
                allfiles[k] = otherfile
            elif update_cond == "different":
                inf1 = source_field(v) if callable(source_field) else v[source_field]
                v2 = otherfiles[otherfile]
                inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field]
                if inf1 != inf2:
                    # details mismatch, make copy
                    allfiles[k] = otherfile
                else:
                    # details match, don't copy
                    allfiles.pop(k)
            else:
                # "never" (or unrecognized): destination already has the file,
                # so skip it. Previously the entry was left holding its info
                # dict, which would then be passed to fs.cp() as a destination
                # path.
                allfiles.pop(k)
        else:
            # file not in target yet
            allfiles[k] = otherfile
    logger.debug(f"{len(allfiles)} files to copy")
    if allfiles:
        source_files, target_files = zip(*allfiles.items())
        fs.cp(source_files, target_files, **kwargs)
    logger.debug(f"{len(to_delete)} files to delete")
    if delete_missing and to_delete:
        fs.rm(to_delete)
    return allfiles
145
+
146
+
147
class GenericFileSystem(AsyncFileSystem):
    """Wrapper over all other FS types

    <experimental!>

    This implementation is a single unified interface to be able to run FS operations
    over generic URLs, and dispatch to the specific implementations using the URL
    protocol prefix.

    Note: instances of this FS are always async, even if you never use it with any async
    backend.
    """

    protocol = "generic"  # there is no real reason to ever use a protocol with this FS

    def __init__(self, default_method="default", **kwargs):
        """

        Parameters
        ----------
        default_method: str (optional)
            Defines how to configure backend FS instances. Options are:
            - "default": instantiate like FSClass(), with no
              extra arguments; this is the default instance of that FS, and can be
              configured via the config system
            - "generic": takes instances from the `_generic_fs` dict in this module,
              which you must populate before use. Keys are by protocol
            - "current": takes the most recently instantiated version of each FS
        """
        self.method = default_method
        super().__init__(**kwargs)

    def _parent(self, path):
        # resolve against the concrete backend, then restore the protocol
        # prefix so generic paths stay fully qualified
        fs = _resolve_fs(path, self.method)
        return fs.unstrip_protocol(fs._parent(path))

    def _strip_protocol(self, path):
        # normalization only
        fs = _resolve_fs(path, self.method)
        return fs.unstrip_protocol(fs._strip_protocol(path))

    async def _find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
        fs = _resolve_fs(path, self.method)
        if fs.async_impl:
            out = await fs._find(
                path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
            )
        else:
            out = fs.find(
                path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
            )
        result = {}
        for k, v in out.items():
            v = v.copy()  # don't corrupt target FS dircache
            name = fs.unstrip_protocol(k)
            v["name"] = name
            result[name] = v
        if detail:
            return result
        return list(result)

    async def _info(self, url, **kwargs):
        fs = _resolve_fs(url, self.method)
        if fs.async_impl:
            out = await fs._info(url, **kwargs)
        else:
            out = fs.info(url, **kwargs)
        out = out.copy()  # don't edit originals
        out["name"] = fs.unstrip_protocol(out["name"])
        return out

    async def _ls(
        self,
        url,
        detail=True,
        **kwargs,
    ):
        fs = _resolve_fs(url, self.method)
        if fs.async_impl:
            out = await fs._ls(url, detail=True, **kwargs)
        else:
            out = fs.ls(url, detail=True, **kwargs)
        out = [o.copy() for o in out]  # don't edit originals
        for o in out:
            o["name"] = fs.unstrip_protocol(o["name"])
        if detail:
            return out
        else:
            return [o["name"] for o in out]

    async def _cat_file(
        self,
        url,
        **kwargs,
    ):
        fs = _resolve_fs(url, self.method)
        if fs.async_impl:
            return await fs._cat_file(url, **kwargs)
        else:
            return fs.cat_file(url, **kwargs)

    async def _pipe_file(
        self,
        path,
        value,
        **kwargs,
    ):
        fs = _resolve_fs(path, self.method)
        if fs.async_impl:
            return await fs._pipe_file(path, value, **kwargs)
        else:
            return fs.pipe_file(path, value, **kwargs)

    async def _rm(self, url, **kwargs):
        # accept a single URL or a list; all are assumed to share a protocol
        urls = url
        if isinstance(urls, str):
            urls = [urls]
        fs = _resolve_fs(urls[0], self.method)
        if fs.async_impl:
            await fs._rm(urls, **kwargs)
        else:
            fs.rm(url, **kwargs)

    async def _makedirs(self, path, exist_ok=False):
        logger.debug("Make dir %s", path)
        fs = _resolve_fs(path, self.method)
        if fs.async_impl:
            await fs._makedirs(path, exist_ok=exist_ok)
        else:
            fs.makedirs(path, exist_ok=exist_ok)

    def rsync(self, source, destination, **kwargs):
        """Sync files between two directory trees

        Returns the {source: destination} mapping of copies performed
        (previously this wrapper discarded it). See `func:rsync` for more
        details.
        """
        return rsync(source, destination, fs=self, **kwargs)

    async def _cp_file(
        self,
        url,
        url2,
        blocksize=2**20,
        callback=DEFAULT_CALLBACK,
        **kwargs,
    ):
        fs = _resolve_fs(url, self.method)
        fs2 = _resolve_fs(url2, self.method)
        if fs is fs2:
            # pure remote
            if fs.async_impl:
                return await fs._cp_file(url, url2, **kwargs)
            else:
                return fs.cp_file(url, url2, **kwargs)
        # cross-backend copy: stream in blocks from one FS to the other
        kw = {"blocksize": 0, "cache_type": "none"}
        try:
            f1 = (
                await fs.open_async(url, "rb")
                if hasattr(fs, "open_async")
                else fs.open(url, "rb", **kw)
            )
            callback.set_size(await maybe_await(f1.size))
            f2 = (
                await fs2.open_async(url2, "wb")
                if hasattr(fs2, "open_async")
                else fs2.open(url2, "wb", **kw)
            )
            while f1.size is None or f2.tell() < f1.size:
                data = await maybe_await(f1.read(blocksize))
                if f1.size is None and not data:
                    break
                await maybe_await(f2.write(data))
                callback.absolute_update(f2.tell())
        finally:
            try:
                await maybe_await(f2.close())
                await maybe_await(f1.close())
            except NameError:
                # fail while opening f1 or f2
                pass

    async def _make_many_dirs(self, urls, exist_ok=True):
        fs = _resolve_fs(urls[0], self.method)
        if fs.async_impl:
            coros = [fs._makedirs(u, exist_ok=exist_ok) for u in urls]
            await _run_coros_in_chunks(coros)
        else:
            for u in urls:
                fs.makedirs(u, exist_ok=exist_ok)

    make_many_dirs = sync_wrapper(_make_many_dirs)

    async def _copy(
        self,
        path1: list[str],
        path2: list[str],
        recursive: bool = False,
        on_error: str = "ignore",
        maxdepth: Optional[int] = None,
        batch_size: Optional[int] = None,
        tempdir: Optional[str] = None,
        **kwargs,
    ):
        if recursive:
            raise NotImplementedError
        fs = _resolve_fs(path1[0], self.method)
        fs2 = _resolve_fs(path2[0], self.method)
        # not expanding paths atm., assume call is from rsync()
        if fs is fs2:
            # pure remote
            if fs.async_impl:
                return await fs._copy(path1, path2, **kwargs)
            else:
                return fs.copy(path1, path2, **kwargs)
        # cross-backend: stage each file through a local temp directory
        await copy_file_op(
            fs, path1, fs2, path2, tempdir, batch_size, on_error=on_error
        )
364
+
365
+
366
async def copy_file_op(
    fs1, url1, fs2, url2, tempdir=None, batch_size=20, on_error="ignore"
):
    """Copy many files between two filesystems via a local staging directory.

    ``url1`` and ``url2`` are parallel sequences of source and destination
    URLs; each file is downloaded to a unique temp path on ``fs1`` and then
    uploaded to ``fs2``. The staging directory is removed afterwards, even
    on failure.
    """
    import tempfile

    staging = tempdir or tempfile.mkdtemp()
    try:
        jobs = []
        for src, dst in zip(url1, url2):
            local = os.path.join(staging, uuid.uuid4().hex)
            jobs.append(_copy_file_op(fs1, src, fs2, dst, local, on_error=on_error))
        await _run_coros_in_chunks(jobs, batch_size=batch_size)
    finally:
        shutil.rmtree(staging)
387
+
388
+
389
async def _copy_file_op(fs1, url1, fs2, url2, local, on_error="ignore"):
    """Copy one file fs1:url1 -> fs2:url2 through the local path ``local``."""
    # With on_error="raise", ``except ()`` matches nothing and errors
    # propagate; otherwise any Exception is logged and swallowed.
    ex = () if on_error == "raise" else Exception
    logger.debug("Copy %s -> %s", url1, url2)
    try:
        if fs1.async_impl:
            await fs1._get_file(url1, local)
        else:
            fs1.get_file(url1, local)
        if fs2.async_impl:
            await fs2._put_file(local, url2)
        else:
            fs2.put_file(local, url2)
        # remove the staged copy as soon as the upload succeeds
        os.unlink(local)
        logger.debug("Copy %s -> %s; done", url1, url2)
    except ex as e:
        logger.debug("ignoring cp exception for %s: %s", url1, e)
405
+
406
+
407
async def maybe_await(cor):
    """Await ``cor`` if it is a coroutine, otherwise return it unchanged.

    Lets calling code treat results from sync and async file objects
    uniformly.
    """
    if not inspect.iscoroutine(cor):
        return cor
    return await cor
.venv/lib/python3.11/site-packages/fsspec/gui.py ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import contextlib
3
+ import logging
4
+ import os
5
+ import re
6
+ from typing import ClassVar, Sequence
7
+
8
+ import panel as pn
9
+
10
+ from .core import OpenFile, get_filesystem_class, split_protocol
11
+ from .registry import known_implementations
12
+
13
+ pn.extension()
14
+ logger = logging.getLogger("fsspec.gui")
15
+
16
+
17
class SigSlot:
    """Signal-slot mixin, for Panel event passing

    Include this class in a widget manager's superclasses to be able to
    register events and callbacks on Panel widgets managed by that class.

    The method ``_register`` should be called as widgets are added, and external
    code should call ``connect`` to associate callbacks.

    By default, all signals emit a DEBUG logging statement.
    """

    # names of signals that this class may emit each of which must be
    # set by _register for any new instance
    signals: ClassVar[Sequence[str]] = []
    # names of actions that this class may respond to
    slots: ClassVar[Sequence[str]] = []

    # each of which must be a method name

    def __init__(self):
        # when True, _signal() drops widget events (see ignore_events())
        self._ignoring_events = False
        # signal name -> {"widget", "callbacks", "thing", "log"}
        self._sigs = {}
        # "<widget name>-<attribute>" -> signal name, used by _signal()
        self._map = {}
        self._setup()

    def _setup(self):
        """Create GUI elements and register signals"""
        self.panel = pn.pane.PaneBase()
        # no signals to set up in the base class

    def _register(
        self, widget, name, thing="value", log_level=logging.DEBUG, auto=False
    ):
        """Watch the given attribute of a widget and assign it a named event

        This is normally called at the time a widget is instantiated, in the
        class which owns it.

        Parameters
        ----------
        widget : pn.layout.Panel or None
            Widget to watch. If None, an anonymous signal not associated with
            any widget.
        name : str
            Name of this event
        thing : str
            Attribute of the given widget to watch
        log_level : int
            When the signal is triggered, a logging event of the given level
            will be fired in the dfviz logger.
        auto : bool
            If True, automatically connects with a method in this class of the
            same name.
        """
        if name not in self.signals:
            raise ValueError(f"Attempt to assign an undeclared signal: {name}")
        self._sigs[name] = {
            "widget": widget,
            "callbacks": [],
            "thing": thing,
            "log": log_level,
        }
        # key that _signal() will reconstruct from the incoming event
        wn = "-".join(
            [
                getattr(widget, "name", str(widget)) if widget is not None else "none",
                thing,
            ]
        )
        self._map[wn] = name
        if widget is not None:
            widget.param.watch(self._signal, thing, onlychanged=True)
        if auto and hasattr(self, name):
            self.connect(name, getattr(self, name))

    def _repr_mimebundle_(self, *args, **kwargs):
        """Display in a notebook or a server"""
        try:
            return self.panel._repr_mimebundle_(*args, **kwargs)
        except (ValueError, AttributeError) as exc:
            raise NotImplementedError(
                "Panel does not seem to be set up properly"
            ) from exc

    def connect(self, signal, slot):
        """Associate callback with given event

        The callback must be a function which takes the "new" value of the
        watched attribute as the only parameter. If the callback return False,
        this cancels any further processing of the given event.

        Alternatively, the callback can be a string, in which case it means
        emitting the correspondingly-named event (i.e., connect to self)
        """
        self._sigs[signal]["callbacks"].append(slot)

    def _signal(self, event):
        """This is called by an action on a widget

        Within an self.ignore_events context, nothing happens.

        Tests can execute this method by directly changing the values of
        widget components.
        """
        if not self._ignoring_events:
            wn = "-".join([event.obj.name, event.name])
            if wn in self._map and self._map[wn] in self._sigs:
                self._emit(self._map[wn], event.new)

    @contextlib.contextmanager
    def ignore_events(self):
        """Temporarily turn off events processing in this instance

        (does not propagate to children)
        """
        self._ignoring_events = True
        try:
            yield
        finally:
            self._ignoring_events = False

    def _emit(self, sig, value=None):
        """An event happened, call its callbacks

        This method can be used in tests to simulate message passing without
        directly changing visual elements.

        Calling of callbacks will halt whenever one returns False.
        """
        logger.log(self._sigs[sig]["log"], f"{sig}: {value}")
        for callback in self._sigs[sig]["callbacks"]:
            if isinstance(callback, str):
                # a string callback chains to the signal of that name
                self._emit(callback)
            else:
                try:
                    # running callbacks should not break the interface
                    ret = callback(value)
                    if ret is False:
                        break
                except Exception as e:
                    logger.exception(
                        "Exception (%s) while executing callback for signal: %s",
                        e,
                        sig,
                    )

    def show(self, threads=False):
        """Open a new browser tab and display this instance's interface"""
        self.panel.show(threads=threads, verbose=False)
        return self
+
168
+
169
class SingleSelect(SigSlot):
    """A multiselect which only allows you to select one item for an event"""

    signals = ["_selected", "selected"]  # the first is internal
    slots = ["set_options", "set_selection", "add", "clear", "select"]

    def __init__(self, **kwargs):
        # kwargs are forwarded verbatim to the underlying MultiSelect widget
        self.kwargs = kwargs
        super().__init__()

    def _setup(self):
        self.panel = pn.widgets.MultiSelect(**self.kwargs)
        self._register(self.panel, "_selected", "value")
        # anonymous signal, emitted by select_one() after pruning the list
        self._register(None, "selected")
        self.connect("_selected", self.select_one)

    def _signal(self, *args, **kwargs):
        # NOTE(review): pass-through override of the base implementation;
        # confirm whether anything relies on it before removing.
        super()._signal(*args, **kwargs)

    def select_one(self, *_):
        # collapse the multi-selection to its most recent item (without
        # re-triggering _selected), then announce the result
        with self.ignore_events():
            val = [self.panel.value[-1]] if self.panel.value else []
            self.panel.value = val
        self._emit("selected", self.panel.value)

    def set_options(self, options):
        self.panel.options = options

    def clear(self):
        self.panel.options = []

    @property
    def value(self):
        # the widget's current (at most single-element) selection list
        return self.panel.value

    def set_selection(self, selection):
        self.panel.value = [selection]
+
207
+
208
class FileSelector(SigSlot):
    """Panel-based graphical file selector widget

    Instances of this widget are interactive and can be displayed in jupyter by having
    them as the output of a cell, or in a separate browser tab using ``.show()``.
    """

    signals = [
        "protocol_changed",
        "selection_changed",
        "directory_entered",
        "home_clicked",
        "up_clicked",
        "go_clicked",
        "filters_changed",
    ]
    # NOTE(review): "go_home" is listed as a slot but no method of that name
    # is defined here (the handler is ``home_clicked``) — confirm intended.
    slots = ["set_filters", "go_home"]

    def __init__(self, url=None, filters=None, ignore=None, kwargs=None):
        """

        Parameters
        ----------
        url : str (optional)
            Initial value of the URL to populate the dialog; should include protocol
        filters : list(str) (optional)
            File endings to include in the listings. If not included, all files are
            allowed. Does not affect directories.
            If given, the endings will appear as checkboxes in the interface
        ignore : list(str) (optional)
            Regex(s) of file basename patterns to ignore, e.g., "\\." for typical
            hidden files on posix
        kwargs : dict (optional)
            To pass to file system instance
        """
        if url:
            # separate "protocol://" prefix from the path portion
            self.init_protocol, url = split_protocol(url)
        else:
            # no URL given: default to the local filesystem, starting at cwd
            self.init_protocol, url = "file", os.getcwd()
        self.init_url = url
        # kwargs are kept as their string repr for display in a text box;
        # NOTE(review): a None kwargs produces the string "None" (truthy), not
        # "{}" — storage_options later maps it back to {} via ``or {}``.
        self.init_kwargs = (kwargs if isinstance(kwargs, str) else str(kwargs)) or "{}"
        self.filters = filters
        self.ignore = [re.compile(i) for i in ignore or []]
        self._fs = None  # lazily constructed in the ``fs`` property
        super().__init__()

    def _setup(self):
        """Create all widgets, register signals and render the initial listing."""
        self.url = pn.widgets.TextInput(
            name="url",
            value=self.init_url,
            align="end",
            sizing_mode="stretch_width",
            width_policy="max",
        )
        self.protocol = pn.widgets.Select(
            options=sorted(known_implementations),
            value=self.init_protocol,
            name="protocol",
            align="center",
        )
        self.kwargs = pn.widgets.TextInput(
            name="kwargs", value=self.init_kwargs, align="center"
        )
        self.go = pn.widgets.Button(name="⇨", align="end", width=45)
        self.main = SingleSelect(size=10)
        self.home = pn.widgets.Button(name="🏠", width=40, height=30, align="end")
        self.up = pn.widgets.Button(name="‹", width=30, height=30, align="end")

        # auto=True wires each widget event to the same-named method below
        self._register(self.protocol, "protocol_changed", auto=True)
        self._register(self.go, "go_clicked", "clicks", auto=True)
        self._register(self.up, "up_clicked", "clicks", auto=True)
        self._register(self.home, "home_clicked", "clicks", auto=True)
        self._register(None, "selection_changed")
        self.main.connect("selected", self.selection_changed)
        self._register(None, "directory_entered")
        # remember the protocol/kwargs used to build the current fs instance,
        # so go_clicked can tell when the instance must be recreated
        self.prev_protocol = self.protocol.value
        self.prev_kwargs = self.storage_options

        self.filter_sel = pn.widgets.CheckBoxGroup(
            value=[], options=[], inline=False, align="end", width_policy="min"
        )
        self._register(self.filter_sel, "filters_changed", auto=True)

        self.panel = pn.Column(
            pn.Row(self.protocol, self.kwargs),
            pn.Row(self.home, self.up, self.url, self.go, self.filter_sel),
            self.main.panel,
        )
        self.set_filters(self.filters)
        self.go_clicked()  # populate the initial listing

    def set_filters(self, filters=None):
        """Set (or clear, with None) the file-ending filter checkboxes."""
        self.filters = filters
        if filters:
            self.filter_sel.options = filters
            self.filter_sel.value = filters
        else:
            self.filter_sel.options = []
            self.filter_sel.value = []

    @property
    def storage_options(self):
        """Value of the kwargs box as a dictionary"""
        # literal_eval keeps this safe against arbitrary code in the text box
        return ast.literal_eval(self.kwargs.value) or {}

    @property
    def fs(self):
        """Current filesystem instance"""
        # built lazily; invalidated by setting self._fs = None
        if self._fs is None:
            cls = get_filesystem_class(self.protocol.value)
            self._fs = cls(**self.storage_options)
        return self._fs

    @property
    def urlpath(self):
        """URL of currently selected item"""
        return (
            (f"{self.protocol.value}://{self.main.value[0]}")
            if self.main.value
            else None
        )

    def open_file(self, mode="rb", compression=None, encoding=None):
        """Create OpenFile instance for the currently selected item

        For example, in a notebook you might do something like

        .. code-block::

            [ ]: sel = FileSelector(); sel

            # user selects their file

            [ ]: with sel.open_file('rb') as f:
            ...      out = f.read()

        Parameters
        ----------
        mode: str (optional)
            Open mode for the file.
        compression: str (optional)
            The interact with the file as compressed. Set to 'infer' to guess
            compression from the file ending
        encoding: str (optional)
            If using text mode, use this encoding; defaults to UTF8.

        Raises
        ------
        ValueError
            If nothing is currently selected.
        """
        if self.urlpath is None:
            raise ValueError("No file selected")
        return OpenFile(self.fs, self.urlpath, mode, compression, encoding)

    def filters_changed(self, values):
        # checkbox state changed: re-filter and redraw the listing
        self.filters = values
        self.go_clicked()

    def selection_changed(self, *_):
        # if the user selected a directory, descend into it
        if self.urlpath is None:
            return
        if self.fs.isdir(self.urlpath):
            self.url.value = self.fs._strip_protocol(self.urlpath)
            self.go_clicked()

    def go_clicked(self, *_):
        """Refresh the listing for the current URL, rebuilding fs if needed."""
        if (
            self.prev_protocol != self.protocol.value
            or self.prev_kwargs != self.storage_options
        ):
            self._fs = None  # causes fs to be recreated
            self.prev_protocol = self.protocol.value
            self.prev_kwargs = self.storage_options
        listing = sorted(
            self.fs.ls(self.url.value, detail=True), key=lambda x: x["name"]
        )
        # drop entries whose basename matches any of the ignore regexes
        listing = [
            l
            for l in listing
            if not any(i.match(l["name"].rsplit("/", 1)[-1]) for i in self.ignore)
        ]
        # display label (with icon prefix) -> full path
        folders = {
            "📁 " + o["name"].rsplit("/", 1)[-1]: o["name"]
            for o in listing
            if o["type"] == "directory"
        }
        files = {
            "📄 " + o["name"].rsplit("/", 1)[-1]: o["name"]
            for o in listing
            if o["type"] == "file"
        }
        if self.filters:
            # file-ending filters apply to files only, never to directories
            files = {
                k: v
                for k, v in files.items()
                if any(v.endswith(ext) for ext in self.filters)
            }
        self.main.set_options(dict(**folders, **files))

    def protocol_changed(self, *_):
        # switching protocol invalidates the fs and clears the view
        self._fs = None
        self.main.options = []
        self.url.value = ""

    def home_clicked(self, *_):
        # restore the constructor-time protocol/kwargs/url and refresh
        self.protocol.value = self.init_protocol
        self.kwargs.value = self.init_kwargs
        self.url.value = self.init_url
        self.go_clicked()

    def up_clicked(self, *_):
        # navigate to the parent directory and refresh
        self.url.value = self.fs._parent(self.url.value)
        self.go_clicked()
.venv/lib/python3.11/site-packages/fsspec/implementations/arrow.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import errno
2
+ import io
3
+ import os
4
+ import secrets
5
+ import shutil
6
+ from contextlib import suppress
7
+ from functools import cached_property, wraps
8
+ from urllib.parse import parse_qs
9
+
10
+ from fsspec.spec import AbstractFileSystem
11
+ from fsspec.utils import (
12
+ get_package_version_without_import,
13
+ infer_storage_options,
14
+ mirror_from,
15
+ tokenize,
16
+ )
17
+
18
+
19
def wrap_exceptions(func):
    """Decorator translating pyarrow "does not exist" OSErrors.

    pyarrow reports missing paths as generic ``OSError`` with a message
    containing "does not exist"; fsspec callers expect ``FileNotFoundError``.
    Any other exception (or an OSError without a matching string message)
    propagates unchanged.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except OSError as exc:
            exc_args = exc.args
            if exc_args:
                first = exc_args[0]
                if isinstance(first, str) and "does not exist" in first:
                    raise FileNotFoundError(errno.ENOENT, first) from exc
            # empty args, non-string message, or unrelated OSError: re-raise
            raise

    return wrapper
35
+
36
+
37
+ PYARROW_VERSION = None
38
+
39
+
40
class ArrowFSWrapper(AbstractFileSystem):
    """FSSpec-compatible wrapper of pyarrow.fs.FileSystem.

    Parameters
    ----------
    fs : pyarrow.fs.FileSystem
        Instantiated pyarrow filesystem that all operations are delegated to.
    """

    root_marker = "/"

    def __init__(self, fs, **kwargs):
        # Record the installed pyarrow version (as a string, or None) without
        # importing the package; used to pick call signatures in _open.
        global PYARROW_VERSION
        PYARROW_VERSION = get_package_version_without_import("pyarrow")
        self.fs = fs
        super().__init__(**kwargs)

    @property
    def protocol(self):
        # e.g. "hdfs", "s3", "local", depending on the wrapped filesystem
        return self.fs.type_name

    @cached_property
    def fsid(self):
        return "hdfs_" + tokenize(self.fs.host, self.fs.port)

    @classmethod
    def _strip_protocol(cls, path):
        """Remove any "protocol://" prefix, returning the bare path."""
        ops = infer_storage_options(path)
        path = ops["path"]
        if path.startswith("//"):
            # special case for "hdfs://path" (without the triple slash)
            path = path[1:]
        return path

    def ls(self, path, detail=False, **kwargs):
        """List a directory; with ``detail``, return info dicts per entry."""
        path = self._strip_protocol(path)
        from pyarrow.fs import FileSelector

        entries = [
            self._make_entry(entry)
            for entry in self.fs.get_file_info(FileSelector(path))
        ]
        if detail:
            return entries
        return [entry["name"] for entry in entries]

    def info(self, path, **kwargs):
        """Return the info dict for a single path.

        Raises
        ------
        FileNotFoundError
            If the path does not exist (via ``_make_entry``).
        """
        path = self._strip_protocol(path)
        [info] = self.fs.get_file_info([path])
        return self._make_entry(info)

    def exists(self, path):
        """Whether the path exists, per ``info``."""
        path = self._strip_protocol(path)
        try:
            self.info(path)
        except FileNotFoundError:
            return False
        return True

    def _make_entry(self, info):
        """Convert a pyarrow ``FileInfo`` into an fsspec-style info dict."""
        from pyarrow.fs import FileType

        if info.type is FileType.Directory:
            kind = "directory"
        elif info.type is FileType.File:
            kind = "file"
        elif info.type is FileType.NotFound:
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
        else:
            kind = "other"

        return {
            "name": info.path,
            "size": info.size,
            "type": kind,
            "mtime": info.mtime,
        }

    @wrap_exceptions
    def cp_file(self, path1, path2, **kwargs):
        """Copy path1 to path2 atomically.

        Writes to a random temporary name first and renames into place, so
        a failed copy never leaves a partially written destination; the
        temporary file is removed on any failure.
        """
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")

        with self._open(path1, "rb") as lstream:
            tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
            try:
                with self.open(tmp_fname, "wb") as rstream:
                    shutil.copyfileobj(lstream, rstream)
                self.fs.move(tmp_fname, path2)
            except BaseException:
                # clean up the temporary file, then re-raise the original error
                with suppress(FileNotFoundError):
                    self.fs.delete_file(tmp_fname)
                raise

    @wrap_exceptions
    def mv(self, path1, path2, **kwargs):
        """Move/rename path1 to path2."""
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")
        self.fs.move(path1, path2)

    @wrap_exceptions
    def rm_file(self, path):
        """Delete a single file."""
        path = self._strip_protocol(path)
        self.fs.delete_file(path)

    @wrap_exceptions
    def rm(self, path, recursive=False, maxdepth=None):
        """Delete a file, or a directory tree when ``recursive=True``."""
        path = self._strip_protocol(path).rstrip("/")
        if self.isdir(path):
            if recursive:
                self.fs.delete_dir(path)
            else:
                # FIX: the message previously said "without recursive=False",
                # which contradicts itself; recursive=True is what is required.
                raise ValueError("Can't delete directories without recursive=True")
        else:
            self.fs.delete_file(path)

    @wrap_exceptions
    def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
        """Open a pyarrow stream and wrap it as an ``ArrowFile``.

        ``seekable=False`` requests a plain input stream, which can be
        cheaper than a random-access file for straight-through reads.
        """
        if mode == "rb":
            if seekable:
                method = self.fs.open_input_file
            else:
                method = self.fs.open_input_stream
        elif mode == "wb":
            method = self.fs.open_output_stream
        elif mode == "ab":
            method = self.fs.open_append_stream
        else:
            raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")

        _kwargs = {}
        if mode != "rb" or not seekable:
            # pyarrow >= 4 would otherwise auto-detect compression from the
            # file ending on stream-style opens; disable that.
            if int(PYARROW_VERSION.split(".")[0]) >= 4:
                _kwargs["compression"] = None
        stream = method(path, **_kwargs)

        return ArrowFile(self, stream, path, mode, block_size, **kwargs)

    @wrap_exceptions
    def mkdir(self, path, create_parents=True, **kwargs):
        """Create a directory, optionally with all missing parents."""
        path = self._strip_protocol(path)
        if create_parents:
            self.makedirs(path, exist_ok=True)
        else:
            self.fs.create_dir(path, recursive=False)

    @wrap_exceptions
    def makedirs(self, path, exist_ok=False):
        """Create a directory and any missing parents."""
        path = self._strip_protocol(path)
        self.fs.create_dir(path, recursive=True)

    @wrap_exceptions
    def rmdir(self, path):
        """Delete a directory (and its contents, per pyarrow semantics)."""
        path = self._strip_protocol(path)
        self.fs.delete_dir(path)

    @wrap_exceptions
    def modified(self, path):
        """Return the last-modified time of the path."""
        path = self._strip_protocol(path)
        return self.fs.get_file_info(path).mtime

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Read bytes from a file, optionally a [start, end) range."""
        # a read from the beginning does not need random access
        kwargs["seekable"] = start not in [None, 0]
        # FIX: forward the requested range; previously start/end were replaced
        # with None, so ranged reads silently returned the whole file.
        return super().cat_file(path, start=start, end=end, **kwargs)

    def get_file(self, rpath, lpath, **kwargs):
        """Download a remote file to a local path (streaming read)."""
        kwargs["seekable"] = False
        super().get_file(rpath, lpath, **kwargs)
211
+
212
+
213
@mirror_from(
    "stream",
    [
        "read",
        "seek",
        "tell",
        "write",
        "readable",
        "writable",
        "close",
        "size",
        "seekable",
    ],
)
class ArrowFile(io.IOBase):
    """Thin file-like wrapper around a pyarrow stream.

    All I/O methods listed in the decorator are mirrored straight from
    ``self.stream``; this class only records bookkeeping attributes and
    provides context-manager support.
    """

    def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
        # bookkeeping only; the decorator handles actual I/O dispatch
        self.fs = fs
        self.stream = stream
        self.path = path
        self.mode = mode
        self.kwargs = kwargs
        # both spellings kept for compatibility with fsspec conventions
        self.blocksize = self.block_size = block_size

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        return self.close()
243
+
244
+
245
class HadoopFileSystem(ArrowFSWrapper):
    """A wrapper on top of the pyarrow.fs.HadoopFileSystem
    to connect its interface with fsspec"""

    protocol = "hdfs"

    def __init__(
        self,
        host="default",
        port=0,
        user=None,
        kerb_ticket=None,
        replication=3,
        extra_conf=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        replication: int
            set replication factor of file for write operations. default value is 3.
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        # import here so pyarrow is only required when HDFS is actually used
        from pyarrow.fs import HadoopFileSystem

        fs = HadoopFileSystem(
            host=host,
            port=port,
            user=user,
            kerb_ticket=kerb_ticket,
            replication=replication,
            extra_conf=extra_conf,
        )
        super().__init__(fs=fs, **kwargs)

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Extract constructor kwargs (host/user/port/replication) from a URL.

        E.g. ``hdfs://user@host:8020/path?replication=2`` yields
        ``{"host": ..., "user": ..., "port": ..., "replication": 2}``;
        absent components are simply omitted from the result.
        """
        ops = infer_storage_options(path)
        out = {}
        if ops.get("host", None):
            out["host"] = ops["host"]
        if ops.get("username", None):
            out["user"] = ops["username"]
        if ops.get("port", None):
            out["port"] = ops["port"]
        if ops.get("url_query", None):
            # only the "replication" query parameter is recognised
            queries = parse_qs(ops["url_query"])
            if queries.get("replication", None):
                out["replication"] = int(queries["replication"][0])
        return out
.venv/lib/python3.11/site-packages/fsspec/implementations/dask.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dask
2
+ from distributed.client import Client, _get_global_client
3
+ from distributed.worker import Worker
4
+
5
+ from fsspec import filesystem
6
+ from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
7
+ from fsspec.utils import infer_storage_options
8
+
9
+
10
def _get_client(client):
    """Normalize *client* to a ``distributed.Client`` instance.

    ``None`` falls back to the currently registered global client (may also
    be ``None``); an existing ``Client`` is returned unchanged; anything
    else (e.g. a scheduler address string) is used to construct a new one.
    """
    if client is None:
        return _get_global_client()
    if isinstance(client, Client):
        return client
    # e.g., connection string
    return Client(client)
18
+
19
+
20
def _in_worker():
    """True when executing inside a dask distributed Worker process."""
    # any live Worker instance means we are worker-side
    return len(Worker._instances) > 0
22
+
23
+
24
class DaskWorkerFileSystem(AbstractFileSystem):
    """View files accessible to a worker as any other remote file-system

    When instances are run on the worker, uses the real filesystem. When
    run on the client, they call the worker to provide information or data.

    **Warning** this implementation is experimental, and read-only for now.
    """

    def __init__(
        self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
    ):
        # Exactly one of fs / target_protocol must be given (XOR check):
        # either a ready-made filesystem instance, or the spec to build one.
        super().__init__(**kwargs)
        if not (fs is None) ^ (target_protocol is None):
            raise ValueError(
                "Please provide one of filesystem instance (fs) or"
                " target_protocol, not both"
            )
        self.target_protocol = target_protocol
        self.target_options = target_options
        self.worker = None  # set by _determine_worker: True/False
        self.client = client
        self.fs = fs
        self._determine_worker()

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Derive a scheduler address ("host:port") from the URL, if present."""
        so = infer_storage_options(path)
        if "host" in so and "port" in so:
            return {"client": f"{so['host']}:{so['port']}"}
        else:
            return {}

    def _determine_worker(self):
        """Decide worker vs client mode and set up delegation accordingly.

        Worker side: use (or build) the real filesystem directly.
        Client side: wrap self in dask.delayed so each call executes remotely.
        """
        if _in_worker():
            self.worker = True
            if self.fs is None:
                self.fs = filesystem(
                    self.target_protocol, **(self.target_options or {})
                )
        else:
            self.worker = False
            self.client = _get_client(self.client)
            # rfs: a delayed proxy; calling rfs.<op>(...).compute() runs the
            # operation on a worker, where _in_worker() takes the other branch
            self.rfs = dask.delayed(self)

    def mkdir(self, *args, **kwargs):
        # local on worker, remote (delayed + compute) on client
        if self.worker:
            self.fs.mkdir(*args, **kwargs)
        else:
            self.rfs.mkdir(*args, **kwargs).compute()

    def rm(self, *args, **kwargs):
        if self.worker:
            self.fs.rm(*args, **kwargs)
        else:
            self.rfs.rm(*args, **kwargs).compute()

    def copy(self, *args, **kwargs):
        if self.worker:
            self.fs.copy(*args, **kwargs)
        else:
            self.rfs.copy(*args, **kwargs).compute()

    def mv(self, *args, **kwargs):
        if self.worker:
            self.fs.mv(*args, **kwargs)
        else:
            self.rfs.mv(*args, **kwargs).compute()

    def ls(self, *args, **kwargs):
        if self.worker:
            return self.fs.ls(*args, **kwargs)
        else:
            return self.rfs.ls(*args, **kwargs).compute()

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        # Worker: open the real file. Client: return a DaskFile proxy whose
        # reads are served via fetch_range round-trips to the worker.
        if self.worker:
            return self.fs._open(
                path,
                mode=mode,
                block_size=block_size,
                autocommit=autocommit,
                cache_options=cache_options,
                **kwargs,
            )
        else:
            return DaskFile(
                fs=self,
                path=path,
                mode=mode,
                block_size=block_size,
                autocommit=autocommit,
                cache_options=cache_options,
                **kwargs,
            )

    def fetch_range(self, path, mode, start, end):
        """Read bytes [start, end) of *path*, locally or via the cluster."""
        if self.worker:
            with self._open(path, mode) as f:
                f.seek(start)
                return f.read(end - start)
        else:
            return self.rfs.fetch_range(path, mode, start, end).compute()
135
+
136
+
137
class DaskFile(AbstractBufferedFile):
    """Client-side, read-only file proxy for ``DaskWorkerFileSystem``.

    Byte ranges are fetched on demand through ``fs.fetch_range``, which
    executes the read on a dask worker; the upload hooks are no-ops since
    writing is not supported.
    """

    def __init__(self, mode="rb", **kwargs):
        # guard clause: this proxy is strictly read-only
        if mode != "rb":
            raise ValueError('Remote dask files can only be opened in "rb" mode')
        super().__init__(**kwargs)

    def _upload_chunk(self, final=False):
        # read-only: nothing to upload
        pass

    def _initiate_upload(self):
        """Create remote file/upload"""
        # read-only: nothing to initiate
        pass

    def _fetch_range(self, start, end):
        """Get the specified set of bytes from remote"""
        return self.fs.fetch_range(self.path, self.mode, start, end)
.venv/lib/python3.11/site-packages/fsspec/implementations/dbfs.py ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import urllib
3
+
4
+ import requests
5
+ import requests.exceptions
6
+ from requests.adapters import HTTPAdapter, Retry
7
+
8
+ from fsspec import AbstractFileSystem
9
+ from fsspec.spec import AbstractBufferedFile
10
+
11
+
12
class DatabricksException(Exception):
    """
    Helper class for exceptions raised in this module.

    Carries the Databricks REST API error code alongside the human-readable
    message, so callers can dispatch on ``error_code``.
    """

    def __init__(self, error_code, message):
        """Create a new DatabricksException"""
        super().__init__(message)
        self.message = message
        self.error_code = error_code
23
+
24
+
25
class DatabricksFileSystem(AbstractFileSystem):
    """
    Get access to the Databricks filesystem implementation over HTTP.
    Can be used inside and outside of a databricks cluster.
    """

    def __init__(self, instance, token, **kwargs):
        """
        Create a new DatabricksFileSystem.

        Parameters
        ----------
        instance: str
            The instance URL of the databricks cluster.
            For example for an Azure databricks cluster, this
            has the form adb-<some-number>.<two digits>.azuredatabricks.net.
        token: str
            Your personal token. Find out more
            here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
        """
        self.instance = instance
        self.token = token
        # one session for all requests: connection pooling + shared auth header
        self.session = requests.Session()
        # retry transient server/throttling errors with a short backoff
        self.retries = Retry(
            total=10,
            backoff_factor=0.05,
            status_forcelist=[408, 429, 500, 502, 503, 504],
        )

        self.session.mount("https://", HTTPAdapter(max_retries=self.retries))
        self.session.headers.update({"Authorization": f"Bearer {self.token}"})

        super().__init__(**kwargs)

    def ls(self, path, detail=True, **kwargs):
        """
        List the contents of the given path.

        Parameters
        ----------
        path: str
            Absolute path
        detail: bool
            Return not only the list of filenames,
            but also additional information on file sizes
            and types.
        """
        # serve from the dircache when possible; populate it otherwise
        out = self._ls_from_cache(path)
        if not out:
            try:
                r = self._send_to_api(
                    method="get", endpoint="list", json={"path": path}
                )
            except DatabricksException as e:
                if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                    raise FileNotFoundError(e.message) from e

                raise
            files = r["files"]
            out = [
                {
                    "name": o["path"],
                    "type": "directory" if o["is_dir"] else "file",
                    "size": o["file_size"],
                }
                for o in files
            ]
            self.dircache[path] = out

        if detail:
            return out
        return [o["name"] for o in out]

    def makedirs(self, path, exist_ok=True):
        """
        Create a given absolute path and all of its parents.

        Parameters
        ----------
        path: str
            Absolute path to create
        exist_ok: bool
            If false, checks if the folder
            exists before creating it (and raises an
            Exception if this is the case)
        """
        if not exist_ok:
            try:
                # If the following succeeds, the path is already present
                self._send_to_api(
                    method="get", endpoint="get-status", json={"path": path}
                )
                raise FileExistsError(f"Path {path} already exists")
            except DatabricksException as e:
                # NOTE(review): error codes other than RESOURCE_DOES_NOT_EXIST
                # are silently swallowed here (no re-raise) — confirm intended.
                if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                    pass

        try:
            self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
        except DatabricksException as e:
            if e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
        # parent listing is now stale
        self.invalidate_cache(self._parent(path))

    def mkdir(self, path, create_parents=True, **kwargs):
        """
        Create a given absolute path and all of its parents.

        Parameters
        ----------
        path: str
            Absolute path to create
        create_parents: bool
            Whether to create all parents or not.
            "False" is not implemented so far.
        """
        if not create_parents:
            raise NotImplementedError

        # mkdirs is presumably the AbstractFileSystem alias for makedirs
        self.mkdirs(path, **kwargs)

    def rm(self, path, recursive=False, **kwargs):
        """
        Remove the file or folder at the given absolute path.

        Parameters
        ----------
        path: str
            Absolute path what to remove
        recursive: bool
            Recursively delete all files in a folder.
        """
        try:
            self._send_to_api(
                method="post",
                endpoint="delete",
                json={"path": path, "recursive": recursive},
            )
        except DatabricksException as e:
            # This is not really an exception, it just means
            # not everything was deleted so far
            if e.error_code == "PARTIAL_DELETE":
                self.rm(path=path, recursive=recursive)
            elif e.error_code == "IO_ERROR":
                # Using the same exception as the os module would use here
                raise OSError(e.message) from e

            # NOTE(review): this re-raise also fires after a successful
            # PARTIAL_DELETE retry above — verify that is the intent.
            raise
        self.invalidate_cache(self._parent(path))

    def mv(
        self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs
    ):
        """
        Move a source to a destination path.

        A note from the original [databricks API manual]
        (https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move).

        When moving a large number of files the API call will time out after
        approximately 60s, potentially resulting in partially moved data.
        Therefore, for operations that move more than 10k files, we strongly
        discourage using the DBFS REST API.

        Parameters
        ----------
        source_path: str
            From where to move (absolute path)
        destination_path: str
            To where to move (absolute path)
        recursive: bool
            Not implemented to far.
        maxdepth:
            Not implemented to far.
        """
        if recursive:
            raise NotImplementedError
        if maxdepth:
            raise NotImplementedError

        try:
            self._send_to_api(
                method="post",
                endpoint="move",
                json={"source_path": source_path, "destination_path": destination_path},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
        # listings of both parents are now stale
        self.invalidate_cache(self._parent(source_path))
        self.invalidate_cache(self._parent(destination_path))

    def _open(self, path, mode="rb", block_size="default", **kwargs):
        """
        Overwrite the base class method to make sure to create a DBFile.
        All arguments are copied from the base method.

        Only the default blocksize is allowed.
        """
        return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)

    def _send_to_api(self, method, endpoint, json):
        """
        Send the given json to the DBFS API
        using a get or post request (specified by the argument `method`).

        Parameters
        ----------
        method: str
            Which http method to use for communication; "get" or "post".
        endpoint: str
            Where to send the request to (last part of the API URL)
        json: dict
            Dictionary of information to send
        """
        if method == "post":
            session_call = self.session.post
        elif method == "get":
            session_call = self.session.get
        else:
            raise ValueError(f"Do not understand method {method}")

        url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)

        r = session_call(url, json=json)

        # The DBFS API will return a json, also in case of an exception.
        # We want to preserve this information as good as possible.
        try:
            r.raise_for_status()
        except requests.HTTPError as e:
            # try to extract json error message
            # if that fails, fall back to the original exception
            try:
                exception_json = e.response.json()
            except Exception:
                raise e from None

            raise DatabricksException(**exception_json) from e

        return r.json()

    def _create_handle(self, path, overwrite=True):
        """
        Internal function to create a handle, which can be used to
        write blocks of a file to DBFS.
        A handle has a unique identifier which needs to be passed
        whenever written during this transaction.
        The handle is active for 10 minutes - after that a new
        write transaction needs to be created.
        Make sure to close the handle after you are finished.

        Parameters
        ----------
        path: str
            Absolute path for this file.
        overwrite: bool
            If a file already exist at this location, either overwrite
            it or raise an exception.
        """
        try:
            r = self._send_to_api(
                method="post",
                endpoint="create",
                json={"path": path, "overwrite": overwrite},
            )
            return r["handle"]
        except DatabricksException as e:
            if e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise

    def _close_handle(self, handle):
        """
        Close a handle, which was opened by :func:`_create_handle`.

        Parameters
        ----------
        handle: str
            Which handle to close.
        """
        try:
            self._send_to_api(method="post", endpoint="close", json={"handle": handle})
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e

            raise

    def _add_data(self, handle, data):
        """
        Upload data to an already opened file handle
        (opened by :func:`_create_handle`).
        The maximal allowed data size is 1MB after
        conversion to base64.
        Remember to close the handle when you are finished.

        Parameters
        ----------
        handle: str
            Which handle to upload data to.
        data: bytes
            Block of data to add to the handle.
        """
        # the API transports blocks as base64 text
        data = base64.b64encode(data).decode()
        try:
            self._send_to_api(
                method="post",
                endpoint="add-block",
                json={"handle": handle, "data": data},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
                raise ValueError(e.message) from e

            raise

    def _get_data(self, path, start, end):
        """
        Download data in bytes from a given absolute path in a block
        from [start, start+length].
        The maximum number of allowed bytes to read is 1MB.

        Parameters
        ----------
        path: str
            Absolute path to download data from
        start: int
            Start position of the block
        end: int
            End position of the block
        """
        try:
            r = self._send_to_api(
                method="get",
                endpoint="read",
                json={"path": path, "offset": start, "length": end - start},
            )
            return base64.b64decode(r["data"])
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
                raise ValueError(e.message) from e

            raise

    def invalidate_cache(self, path=None):
        # drop one cached listing, or all of them when path is None
        if path is None:
            self.dircache.clear()
        else:
            self.dircache.pop(path, None)
        super().invalidate_cache(path)
387
+
388
+
389
class DatabricksFile(AbstractBufferedFile):
    """
    Helper class for files referenced in the DatabricksFileSystem.

    Uploads go through the DBFS handle API in 1MB blocks; downloads are
    fetched in 1MB-sized ranged reads.
    """

    DEFAULT_BLOCK_SIZE = 1 * 2**20  # only allowed block size

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        **kwargs,
    ):
        """
        Create a new instance of the DatabricksFile.

        The blocksize needs to be the default one (the DBFS API limits
        blocks to 1MB after base64 encoding).

        Raises
        ------
        ValueError
            If a non-default ``block_size`` is requested.
        """
        if block_size is None or block_size == "default":
            block_size = self.DEFAULT_BLOCK_SIZE

        # FIX: was an `assert`, which is stripped under `python -O`; validate
        # explicitly so the invariant always holds.
        if block_size != self.DEFAULT_BLOCK_SIZE:
            raise ValueError(
                f"Only the default block size is allowed, not {block_size}"
            )

        super().__init__(
            fs,
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_type=cache_type,
            cache_options=cache_options or {},
            **kwargs,
        )

    def _initiate_upload(self):
        """Internal function to start a file upload"""
        self.handle = self.fs._create_handle(self.path)

    def _upload_chunk(self, final=False):
        """Internal function to add a chunk of data to a started upload"""
        self.buffer.seek(0)
        data = self.buffer.getvalue()

        # the API only accepts blocks up to 1MB; split the buffer accordingly
        data_chunks = [
            data[start:end] for start, end in self._to_sized_blocks(len(data))
        ]

        for data_chunk in data_chunks:
            self.fs._add_data(handle=self.handle, data=data_chunk)

        if final:
            # closing the handle commits the upload
            self.fs._close_handle(handle=self.handle)
        return True

    def _fetch_range(self, start, end):
        """Internal function to download a block of data"""
        return_buffer = b""
        length = end - start
        # the API caps single reads at 1MB, so fetch in block-sized pieces
        for chunk_start, chunk_end in self._to_sized_blocks(length, start):
            return_buffer += self.fs._get_data(
                path=self.path, start=chunk_start, end=chunk_end
            )

        return return_buffer

    def _to_sized_blocks(self, length, start=0):
        """Yield (start, end) pairs covering [start, start+length) in blocksize steps."""
        end = start + length
        for data_chunk in range(start, end, self.blocksize):
            data_start = data_chunk
            data_end = min(end, data_chunk + self.blocksize)
            yield data_start, data_end
+ yield data_start, data_end
.venv/lib/python3.11/site-packages/fsspec/implementations/dirfs.py ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .. import filesystem
2
+ from ..asyn import AsyncFileSystem
3
+
4
+
5
class DirFileSystem(AsyncFileSystem):
    """Directory prefix filesystem

    The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
    is relative to the `path`. After performing the necessary paths operation it
    delegates everything to the wrapped filesystem.

    Every public method below follows the same pattern: prepend the prefix
    to incoming paths with ``_join``, call through to ``self.fs``, and (for
    methods that return paths) strip the prefix again with ``_relpath``.
    """

    protocol = "dir"

    def __init__(
        self,
        path=None,
        fs=None,
        fo=None,
        target_protocol=None,
        target_options=None,
        **storage_options,
    ):
        """
        Parameters
        ----------
        path: str
            Path to the directory.
        fs: AbstractFileSystem
            An instantiated filesystem to wrap.
        target_protocol, target_options:
            if fs is none, construct it from these
        fo: str
            Alternate for path; do not provide both
        """
        super().__init__(**storage_options)
        if fs is None:
            fs = filesystem(protocol=target_protocol, **(target_options or {}))
        # exactly one of `path` / `fo` must be supplied (XOR of the two)
        if (path is not None) ^ (fo is not None) is False:
            raise ValueError("Provide path or fo, not both")
        path = path or fo

        if self.asynchronous and not fs.async_impl:
            raise ValueError("can't use asynchronous with non-async fs")

        if fs.async_impl and self.asynchronous != fs.asynchronous:
            raise ValueError("both dirfs and fs should be in the same sync/async mode")

        self.path = fs._strip_protocol(path)
        self.fs = fs

    def _join(self, path):
        """Prepend the directory prefix to *path*.

        Accepts a str, a dict (keys are joined), or an iterable of str.
        """
        if isinstance(path, str):
            if not self.path:
                return path
            if not path:
                return self.path
            return self.fs.sep.join((self.path, self._strip_protocol(path)))
        if isinstance(path, dict):
            return {self._join(_path): value for _path, value in path.items()}
        return [self._join(_path) for _path in path]

    def _relpath(self, path):
        """Strip the directory prefix from *path* (str or iterable of str)."""
        if isinstance(path, str):
            if not self.path:
                return path
            # We need to account for S3FileSystem returning paths that do not
            # start with a '/'
            if path == self.path or (
                self.path.startswith(self.fs.sep) and path == self.path[1:]
            ):
                return ""
            prefix = self.path + self.fs.sep
            if self.path.startswith(self.fs.sep) and not path.startswith(self.fs.sep):
                prefix = prefix[1:]
            assert path.startswith(prefix)
            return path[len(prefix) :]
        return [self._relpath(_path) for _path in path]

    # Wrappers below

    @property
    def sep(self):
        return self.fs.sep

    async def set_session(self, *args, **kwargs):
        return await self.fs.set_session(*args, **kwargs)

    async def _rm_file(self, path, **kwargs):
        return await self.fs._rm_file(self._join(path), **kwargs)

    def rm_file(self, path, **kwargs):
        return self.fs.rm_file(self._join(path), **kwargs)

    async def _rm(self, path, *args, **kwargs):
        return await self.fs._rm(self._join(path), *args, **kwargs)

    def rm(self, path, *args, **kwargs):
        return self.fs.rm(self._join(path), *args, **kwargs)

    async def _cp_file(self, path1, path2, **kwargs):
        return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs)

    def cp_file(self, path1, path2, **kwargs):
        return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs)

    async def _copy(
        self,
        path1,
        path2,
        *args,
        **kwargs,
    ):
        return await self.fs._copy(
            self._join(path1),
            self._join(path2),
            *args,
            **kwargs,
        )

    def copy(self, path1, path2, *args, **kwargs):
        return self.fs.copy(
            self._join(path1),
            self._join(path2),
            *args,
            **kwargs,
        )

    async def _pipe(self, path, *args, **kwargs):
        return await self.fs._pipe(self._join(path), *args, **kwargs)

    def pipe(self, path, *args, **kwargs):
        return self.fs.pipe(self._join(path), *args, **kwargs)

    async def _pipe_file(self, path, *args, **kwargs):
        return await self.fs._pipe_file(self._join(path), *args, **kwargs)

    def pipe_file(self, path, *args, **kwargs):
        return self.fs.pipe_file(self._join(path), *args, **kwargs)

    async def _cat_file(self, path, *args, **kwargs):
        return await self.fs._cat_file(self._join(path), *args, **kwargs)

    def cat_file(self, path, *args, **kwargs):
        return self.fs.cat_file(self._join(path), *args, **kwargs)

    async def _cat(self, path, *args, **kwargs):
        ret = await self.fs._cat(
            self._join(path),
            *args,
            **kwargs,
        )

        # a dict result means multiple files were fetched; its keys are
        # full paths on the wrapped fs and must be made prefix-relative
        if isinstance(ret, dict):
            return {self._relpath(key): value for key, value in ret.items()}

        return ret

    def cat(self, path, *args, **kwargs):
        ret = self.fs.cat(
            self._join(path),
            *args,
            **kwargs,
        )

        if isinstance(ret, dict):
            return {self._relpath(key): value for key, value in ret.items()}

        return ret

    async def _put_file(self, lpath, rpath, **kwargs):
        # only the remote side (rpath) lives under the prefix
        return await self.fs._put_file(lpath, self._join(rpath), **kwargs)

    def put_file(self, lpath, rpath, **kwargs):
        return self.fs.put_file(lpath, self._join(rpath), **kwargs)

    async def _put(
        self,
        lpath,
        rpath,
        *args,
        **kwargs,
    ):
        return await self.fs._put(
            lpath,
            self._join(rpath),
            *args,
            **kwargs,
        )

    def put(self, lpath, rpath, *args, **kwargs):
        return self.fs.put(
            lpath,
            self._join(rpath),
            *args,
            **kwargs,
        )

    async def _get_file(self, rpath, lpath, **kwargs):
        return await self.fs._get_file(self._join(rpath), lpath, **kwargs)

    def get_file(self, rpath, lpath, **kwargs):
        return self.fs.get_file(self._join(rpath), lpath, **kwargs)

    async def _get(self, rpath, *args, **kwargs):
        return await self.fs._get(self._join(rpath), *args, **kwargs)

    def get(self, rpath, *args, **kwargs):
        return self.fs.get(self._join(rpath), *args, **kwargs)

    async def _isfile(self, path):
        return await self.fs._isfile(self._join(path))

    def isfile(self, path):
        return self.fs.isfile(self._join(path))

    async def _isdir(self, path):
        return await self.fs._isdir(self._join(path))

    def isdir(self, path):
        return self.fs.isdir(self._join(path))

    async def _size(self, path):
        return await self.fs._size(self._join(path))

    def size(self, path):
        return self.fs.size(self._join(path))

    async def _exists(self, path):
        return await self.fs._exists(self._join(path))

    def exists(self, path):
        return self.fs.exists(self._join(path))

    async def _info(self, path, **kwargs):
        return await self.fs._info(self._join(path), **kwargs)

    def info(self, path, **kwargs):
        return self.fs.info(self._join(path), **kwargs)

    async def _ls(self, path, detail=True, **kwargs):
        # copy before mutation: the wrapped fs may be serving this list
        # from its dircache
        ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
        if detail:
            out = []
            for entry in ret:
                entry = entry.copy()
                entry["name"] = self._relpath(entry["name"])
                out.append(entry)
            return out

        return self._relpath(ret)

    def ls(self, path, detail=True, **kwargs):
        ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
        if detail:
            out = []
            for entry in ret:
                entry = entry.copy()
                entry["name"] = self._relpath(entry["name"])
                out.append(entry)
            return out

        return self._relpath(ret)

    async def _walk(self, path, *args, **kwargs):
        async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs):
            yield self._relpath(root), dirs, files

    def walk(self, path, *args, **kwargs):
        for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs):
            yield self._relpath(root), dirs, files

    async def _glob(self, path, **kwargs):
        detail = kwargs.get("detail", False)
        ret = await self.fs._glob(self._join(path), **kwargs)
        if detail:
            return {self._relpath(path): info for path, info in ret.items()}
        return self._relpath(ret)

    def glob(self, path, **kwargs):
        detail = kwargs.get("detail", False)
        ret = self.fs.glob(self._join(path), **kwargs)
        if detail:
            return {self._relpath(path): info for path, info in ret.items()}
        return self._relpath(ret)

    async def _du(self, path, *args, **kwargs):
        total = kwargs.get("total", True)
        ret = await self.fs._du(self._join(path), *args, **kwargs)
        if total:
            # a single aggregate number; nothing to re-map
            return ret

        return {self._relpath(path): size for path, size in ret.items()}

    def du(self, path, *args, **kwargs):
        total = kwargs.get("total", True)
        ret = self.fs.du(self._join(path), *args, **kwargs)
        if total:
            return ret

        return {self._relpath(path): size for path, size in ret.items()}

    async def _find(self, path, *args, **kwargs):
        detail = kwargs.get("detail", False)
        ret = await self.fs._find(self._join(path), *args, **kwargs)
        if detail:
            return {self._relpath(path): info for path, info in ret.items()}
        return self._relpath(ret)

    def find(self, path, *args, **kwargs):
        detail = kwargs.get("detail", False)
        ret = self.fs.find(self._join(path), *args, **kwargs)
        if detail:
            return {self._relpath(path): info for path, info in ret.items()}
        return self._relpath(ret)

    async def _expand_path(self, path, *args, **kwargs):
        return self._relpath(
            await self.fs._expand_path(self._join(path), *args, **kwargs)
        )

    def expand_path(self, path, *args, **kwargs):
        return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs))

    async def _mkdir(self, path, *args, **kwargs):
        return await self.fs._mkdir(self._join(path), *args, **kwargs)

    def mkdir(self, path, *args, **kwargs):
        return self.fs.mkdir(self._join(path), *args, **kwargs)

    async def _makedirs(self, path, *args, **kwargs):
        return await self.fs._makedirs(self._join(path), *args, **kwargs)

    def makedirs(self, path, *args, **kwargs):
        return self.fs.makedirs(self._join(path), *args, **kwargs)

    def rmdir(self, path):
        return self.fs.rmdir(self._join(path))

    def mv(self, path1, path2, **kwargs):
        return self.fs.mv(
            self._join(path1),
            self._join(path2),
            **kwargs,
        )

    def touch(self, path, **kwargs):
        return self.fs.touch(self._join(path), **kwargs)

    def created(self, path):
        return self.fs.created(self._join(path))

    def modified(self, path):
        return self.fs.modified(self._join(path))

    def sign(self, path, *args, **kwargs):
        return self.fs.sign(self._join(path), *args, **kwargs)

    def __repr__(self):
        return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"

    def open(
        self,
        path,
        *args,
        **kwargs,
    ):
        return self.fs.open(
            self._join(path),
            *args,
            **kwargs,
        )

    async def open_async(
        self,
        path,
        *args,
        **kwargs,
    ):
        return await self.fs.open_async(
            self._join(path),
            *args,
            **kwargs,
        )
.venv/lib/python3.11/site-packages/fsspec/implementations/jupyter.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import re
4
+
5
+ import requests
6
+
7
+ import fsspec
8
+
9
+
10
class JupyterFileSystem(fsspec.AbstractFileSystem):
    """View of the files as seen by a Jupyter server (notebook or lab)

    All operations go through the Jupyter "contents" REST API, so the
    filesystem sees exactly what the server exposes. Notebooks are reported
    as plain files.
    """

    protocol = ("jupyter", "jlab")

    def __init__(self, url, tok=None, **kwargs):
        """
        Parameters
        ----------
        url : str
            Base URL of the server, like "http://127.0.0.1:8888". May include
            token in the string, which is given by the process when starting up
        tok : str
            If the token is obtained separately, can be given here
        kwargs
        """
        if "?" in url:
            if tok is None:
                try:
                    tok = re.findall("token=([a-z0-9]+)", url)[0]
                except IndexError as e:
                    raise ValueError("Could not determine token") from e
            url = url.split("?", 1)[0]
        self.url = url.rstrip("/") + "/api/contents"
        self.session = requests.Session()
        if tok:
            self.session.headers["Authorization"] = f"token {tok}"

        super().__init__(**kwargs)

    def ls(self, path, detail=True, **kwargs):
        """List objects at ``path``.

        Raises
        ------
        FileNotFoundError
            If the server reports the path as missing (HTTP 404).
        """
        path = self._strip_protocol(path)
        r = self.session.get(f"{self.url}/{path}")
        if r.status_code == 404:
            # BUG FIX: the exception was previously *returned* rather than
            # raised, so callers (e.g. ``exists``/``info``) never saw the
            # failure and received a truthy exception object instead.
            raise FileNotFoundError(path)
        r.raise_for_status()
        out = r.json()

        if out["type"] == "directory":
            out = out["content"]
        else:
            out = [out]
        for o in out:
            o["name"] = o.pop("path")
            o.pop("content")
            # present notebooks as ordinary files to generic callers
            if o["type"] == "notebook":
                o["type"] = "file"
        if detail:
            return out
        return [o["name"] for o in out]

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Return (a range of) the bytes of a file.

        Raises
        ------
        FileNotFoundError
            If the server reports the path as missing (HTTP 404).
        """
        path = self._strip_protocol(path)
        r = self.session.get(f"{self.url}/{path}")
        if r.status_code == 404:
            # BUG FIX: raise instead of returning the exception instance
            raise FileNotFoundError(path)
        r.raise_for_status()
        out = r.json()
        if out["format"] == "text":
            # data should be binary
            b = out["content"].encode()
        else:
            b = base64.b64decode(out["content"])
        # the API always returns the whole file; slice locally for ranges
        return b[start:end]

    def pipe_file(self, path, value, **_):
        """Upload ``value`` (bytes) as the complete contents of ``path``."""
        path = self._strip_protocol(path)
        json = {
            "name": path.rsplit("/", 1)[-1],
            "path": path,
            "size": len(value),
            "content": base64.b64encode(value).decode(),
            "format": "base64",
            "type": "file",
        }
        # NOTE(review): the response status is not checked here; a failed
        # upload is currently silent, matching previous behaviour.
        self.session.put(f"{self.url}/{path}", json=json)

    def mkdir(self, path, create_parents=True, **kwargs):
        """Create a directory, optionally creating parents recursively."""
        path = self._strip_protocol(path)
        if create_parents and "/" in path:
            self.mkdir(path.rsplit("/", 1)[0], True)
        json = {
            "name": path.rsplit("/", 1)[-1],
            "path": path,
            "size": None,
            "content": None,
            "type": "directory",
        }
        self.session.put(f"{self.url}/{path}", json=json)

    def _rm(self, path):
        """Delete a single path via the contents API."""
        path = self._strip_protocol(path)
        self.session.delete(f"{self.url}/{path}")

    def _open(self, path, mode="rb", **kwargs):
        """Open a file; reads are fully buffered, writes upload on close."""
        path = self._strip_protocol(path)
        if mode == "rb":
            # the whole file is downloaded up front; fine for the small
            # files this API is intended for
            data = self.cat_file(path)
            return io.BytesIO(data)
        else:
            return SimpleFileWriter(self, path, mode="wb")
112
+
113
+
114
class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
    """Buffered writer that uploads the whole file in a single request."""

    def _upload_chunk(self, final=False):
        """Never uploads a chunk until file is done

        Not suitable for large files
        """
        # Defer everything until the final flush; intermediate chunks are
        # simply kept in the buffer.
        if final is False:
            return False
        self.buffer.seek(0)
        payload = self.buffer.read()
        self.fs.pipe_file(self.path, payload)
.venv/lib/python3.11/site-packages/fsspec/implementations/local.py ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import io
3
+ import logging
4
+ import os
5
+ import os.path as osp
6
+ import shutil
7
+ import stat
8
+ import tempfile
9
+
10
+ from fsspec import AbstractFileSystem
11
+ from fsspec.compression import compr
12
+ from fsspec.core import get_compression
13
+ from fsspec.utils import isfilelike, stringify_path
14
+
15
+ logger = logging.getLogger("fsspec.local")
16
+
17
+
18
class LocalFileSystem(AbstractFileSystem):
    """Interface to files on local storage

    Parameters
    ----------
    auto_mkdir: bool
        Whether, when opening a file, the directory containing it should
        be created (if it doesn't already exist). This is assumed by pyarrow
        code.
    """

    root_marker = "/"
    protocol = "file", "local"
    local_file = True

    def __init__(self, auto_mkdir=False, **kwargs):
        super().__init__(**kwargs)
        self.auto_mkdir = auto_mkdir

    @property
    def fsid(self):
        # constant: every instance is a view of the same local storage
        return "local"

    def mkdir(self, path, create_parents=True, **kwargs):
        path = self._strip_protocol(path)
        if self.exists(path):
            raise FileExistsError(path)
        if create_parents:
            self.makedirs(path, exist_ok=True)
        else:
            os.mkdir(path, **kwargs)

    def makedirs(self, path, exist_ok=False):
        path = self._strip_protocol(path)
        os.makedirs(path, exist_ok=exist_ok)

    def rmdir(self, path):
        path = self._strip_protocol(path)
        os.rmdir(path)

    def ls(self, path, detail=False, **kwargs):
        """List contents of a directory, or the entry itself for a file.

        Entries that vanish between scandir and stat are silently skipped.
        """
        path = self._strip_protocol(path)
        info = self.info(path)
        if info["type"] == "directory":
            with os.scandir(path) as it:
                infos = []
                for f in it:
                    try:
                        infos.append(self.info(f))
                    except FileNotFoundError:
                        # raced with deletion; drop the entry
                        pass
        else:
            infos = [info]

        if not detail:
            return [i["name"] for i in infos]
        return infos

    def info(self, path, **kwargs):
        """Stat a path (or a ``os.DirEntry`` from ``scandir``).

        Symlinks are reported with ``islink=True``; their size is that of
        the target (0 if the target cannot be stat'ed).
        """
        if isinstance(path, os.DirEntry):
            # scandir DirEntry
            out = path.stat(follow_symlinks=False)
            link = path.is_symlink()
            if path.is_dir(follow_symlinks=False):
                t = "directory"
            elif path.is_file(follow_symlinks=False):
                t = "file"
            else:
                t = "other"

            size = out.st_size
            if link:
                try:
                    out2 = path.stat(follow_symlinks=True)
                    size = out2.st_size
                except OSError:
                    # broken symlink: report zero size rather than failing
                    size = 0
            path = self._strip_protocol(path.path)
        else:
            # str or path-like
            path = self._strip_protocol(path)
            out = os.stat(path, follow_symlinks=False)
            link = stat.S_ISLNK(out.st_mode)
            if link:
                # re-stat the target so type/size describe what the link
                # points at
                out = os.stat(path, follow_symlinks=True)
            size = out.st_size
            if stat.S_ISDIR(out.st_mode):
                t = "directory"
            elif stat.S_ISREG(out.st_mode):
                t = "file"
            else:
                t = "other"
        result = {
            "name": path,
            "size": size,
            "type": t,
            "created": out.st_ctime,
            "islink": link,
        }
        for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
            result[field] = getattr(out, f"st_{field}")
        if link:
            result["destination"] = os.readlink(path)
        return result

    def lexists(self, path, **kwargs):
        # like exists(), but True for broken symlinks too
        return osp.lexists(path)

    def cp_file(self, path1, path2, **kwargs):
        path1 = self._strip_protocol(path1)
        path2 = self._strip_protocol(path2)
        if self.auto_mkdir:
            self.makedirs(self._parent(path2), exist_ok=True)
        if self.isfile(path1):
            shutil.copyfile(path1, path2)
        elif self.isdir(path1):
            # copying a directory just ensures the destination dir exists
            self.mkdirs(path2, exist_ok=True)
        else:
            raise FileNotFoundError(path1)

    def isfile(self, path):
        path = self._strip_protocol(path)
        return os.path.isfile(path)

    def isdir(self, path):
        path = self._strip_protocol(path)
        return os.path.isdir(path)

    def get_file(self, path1, path2, callback=None, **kwargs):
        # ``callback`` accepted for API compatibility but unused: local
        # copies are a single operation
        if isfilelike(path2):
            with open(path1, "rb") as f:
                shutil.copyfileobj(f, path2)
        else:
            return self.cp_file(path1, path2, **kwargs)

    def put_file(self, path1, path2, callback=None, **kwargs):
        # local -> local "upload" is just a copy
        return self.cp_file(path1, path2, **kwargs)

    def mv(self, path1, path2, **kwargs):
        path1 = self._strip_protocol(path1)
        path2 = self._strip_protocol(path2)
        shutil.move(path1, path2)

    def link(self, src, dst, **kwargs):
        src = self._strip_protocol(src)
        dst = self._strip_protocol(dst)
        os.link(src, dst, **kwargs)

    def symlink(self, src, dst, **kwargs):
        src = self._strip_protocol(src)
        dst = self._strip_protocol(dst)
        os.symlink(src, dst, **kwargs)

    def islink(self, path) -> bool:
        return os.path.islink(self._strip_protocol(path))

    def rm_file(self, path):
        os.remove(self._strip_protocol(path))

    def rm(self, path, recursive=False, maxdepth=None):
        """Remove one or more paths; directories require ``recursive=True``.

        Refuses to delete the current working directory as a safety check.
        """
        if not isinstance(path, list):
            path = [path]

        for p in path:
            p = self._strip_protocol(p)
            if self.isdir(p):
                if not recursive:
                    raise ValueError("Cannot delete directory, set recursive=True")
                if osp.abspath(p) == os.getcwd():
                    raise ValueError("Cannot delete current working directory")
                shutil.rmtree(p)
            else:
                os.remove(p)

    def unstrip_protocol(self, name):
        name = self._strip_protocol(name)  # normalise for local/win/...
        return f"file://{name}"

    def _open(self, path, mode="rb", block_size=None, **kwargs):
        path = self._strip_protocol(path)
        if self.auto_mkdir and "w" in mode:
            self.makedirs(self._parent(path), exist_ok=True)
        return LocalFileOpener(path, mode, fs=self, **kwargs)

    def touch(self, path, truncate=True, **kwargs):
        path = self._strip_protocol(path)
        if self.auto_mkdir:
            self.makedirs(self._parent(path), exist_ok=True)
        if self.exists(path):
            # existing file: just bump the timestamps
            os.utime(path, None)
        else:
            open(path, "a").close()
        if truncate:
            os.truncate(path, 0)

    def created(self, path):
        info = self.info(path=path)
        return datetime.datetime.fromtimestamp(
            info["created"], tz=datetime.timezone.utc
        )

    def modified(self, path):
        info = self.info(path=path)
        return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)

    @classmethod
    def _parent(cls, path):
        path = cls._strip_protocol(path)
        if os.sep == "/":
            # posix native
            return path.rsplit("/", 1)[0] or "/"
        else:
            # NT
            path_ = path.rsplit("/", 1)[0]
            if len(path_) <= 3:
                if path_[1:2] == ":":
                    # nt root (something like c:/)
                    return path_[0] + ":/"
            # More cases may be required here
            return path_

    @classmethod
    def _strip_protocol(cls, path):
        path = stringify_path(path)
        if path.startswith("file://"):
            path = path[7:]
        elif path.startswith("file:"):
            path = path[5:]
        elif path.startswith("local://"):
            path = path[8:]
        elif path.startswith("local:"):
            path = path[6:]

        path = make_path_posix(path)
        if os.sep != "/":
            # This code-path is a stripped down version of
            # > drive, path = ntpath.splitdrive(path)
            if path[1:2] == ":":
                # Absolute drive-letter path, e.g. X:\Windows
                # Relative path with drive, e.g. X:Windows
                drive, path = path[:2], path[2:]
            elif path[:2] == "//":
                # UNC drives, e.g. \\server\share or \\?\UNC\server\share
                # Device drives, e.g. \\.\device or \\?\device
                if (index1 := path.find("/", 2)) == -1 or (
                    index2 := path.find("/", index1 + 1)
                ) == -1:
                    drive, path = path, ""
                else:
                    drive, path = path[:index2], path[index2:]
            else:
                # Relative path, e.g. Windows
                drive = ""

            path = path.rstrip("/") or cls.root_marker
            return drive + path

        else:
            return path.rstrip("/") or cls.root_marker

    def _isfilestore(self):
        # Inheriting from DaskFileSystem makes this False (S3, etc. were)
        # the original motivation. But we are a posix-like file system.
        # See https://github.com/dask/dask/issues/5526
        return True

    def chmod(self, path, mode):
        path = stringify_path(path)
        return os.chmod(path, mode)
287
+
288
+
289
def make_path_posix(path):
    """Make path generic and absolute for current OS

    Accepts a str, a path-like object, or a list/set/tuple of them (each
    element is converted and a container of the same type returned).
    Relative paths are resolved against the current working directory;
    on Windows, backslashes are normalised to forward slashes and drive
    letters / UNC prefixes are preserved.
    """
    if not isinstance(path, str):
        if isinstance(path, (list, set, tuple)):
            return type(path)(make_path_posix(p) for p in path)
        else:
            path = stringify_path(path)
            if not isinstance(path, str):
                raise TypeError(f"could not convert {path!r} to string")
    if os.sep == "/":
        # Native posix
        if path.startswith("/"):
            # most common fast case for posix
            return path
        elif path.startswith("~"):
            return osp.expanduser(path)
        elif path.startswith("./"):
            path = path[2:]
        elif path == ".":
            path = ""
        return f"{os.getcwd()}/{path}"
    else:
        # NT handling
        if path[0:1] == "/" and path[2:3] == ":":
            # path is like "/c:/local/path"
            path = path[1:]
        if path[1:2] == ":":
            # windows full path like "C:\\local\\path"
            if len(path) <= 3:
                # nt root (something like c:/)
                return path[0] + ":/"
            path = path.replace("\\", "/")
            return path
        elif path[0:1] == "~":
            return make_path_posix(osp.expanduser(path))
        elif path.startswith(("\\\\", "//")):
            # windows UNC/DFS-style paths
            return "//" + path[2:].replace("\\", "/")
        elif path.startswith(("\\", "/")):
            # windows relative path with root
            path = path.replace("\\", "/")
            return f"{osp.splitdrive(os.getcwd())[0]}{path}"
        else:
            path = path.replace("\\", "/")
            if path.startswith("./"):
                path = path[2:]
            elif path == ".":
                path = ""
            return f"{make_path_posix(os.getcwd())}/{path}"
338
+
339
+
340
def trailing_sep(path):
    """Return True if the path ends with a path separator.

    A forward slash is always considered a path separator, even on Operating
    Systems that normally use a backslash.
    """
    # TODO: if all incoming paths were posix-compliant then separator would
    # always be a forward slash, simplifying this function.
    # See https://github.com/fsspec/filesystem_spec/pull/1250
    seps = (os.sep,) if os.altsep is None else (os.sep, os.altsep)
    return path.endswith(seps)
350
+
351
+
352
class LocalFileOpener(io.IOBase):
    """File-like wrapper around a local open file.

    Supports optional transparent compression, pickling of read-mode
    handles (position is preserved), and deferred commit: with
    ``autocommit=False``, writes go to a temporary file which is moved
    into place by ``commit`` or removed by ``discard``.
    """

    def __init__(
        self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
    ):
        logger.debug("open file: %s", path)
        self.path = path
        self.mode = mode
        self.fs = fs
        self.f = None
        self.autocommit = autocommit
        self.compression = get_compression(path, compression)
        self.blocksize = io.DEFAULT_BUFFER_SIZE
        self._open()

    def _open(self):
        # (Re)open the underlying file handle if absent or closed.
        if self.f is None or self.f.closed:
            if self.autocommit or "w" not in self.mode:
                self.f = open(self.path, mode=self.mode)
                if self.compression:
                    compress = compr[self.compression]
                    self.f = compress(self.f, mode=self.mode)
            else:
                # writes are buffered in a temp file until commit()
                # TODO: check if path is writable?
                i, name = tempfile.mkstemp()
                os.close(i)  # we want normal open and normal buffered file
                self.temp = name
                self.f = open(name, mode=self.mode)
            if "w" not in self.mode:
                self.size = self.f.seek(0, 2)
                self.f.seek(0)
                self.f.size = self.size

    def _fetch_range(self, start, end):
        # probably only used by cached FS
        if "r" not in self.mode:
            raise ValueError
        self._open()
        self.f.seek(start)
        return self.f.read(end - start)

    def __setstate__(self, state):
        # restore a pickled read-mode file, reopening and re-seeking;
        # write-mode files cannot be unpickled (see __getstate__)
        self.f = None
        loc = state.pop("loc", None)
        self.__dict__.update(state)
        if "r" in state["mode"]:
            self.f = None
            self._open()
            self.f.seek(loc)

    def __getstate__(self):
        d = self.__dict__.copy()
        d.pop("f")
        if "r" in self.mode:
            # remember position so __setstate__ can restore it
            d["loc"] = self.f.tell()
        else:
            if not self.f.closed:
                raise ValueError("Cannot serialise open write-mode local file")
        return d

    def commit(self):
        """Move the temp file into place (non-autocommit write mode only)."""
        if self.autocommit:
            raise RuntimeError("Can only commit if not already set to autocommit")
        shutil.move(self.temp, self.path)

    def discard(self):
        """Throw away the temp file (non-autocommit write mode only)."""
        if self.autocommit:
            raise RuntimeError("Cannot discard if set to autocommit")
        os.remove(self.temp)

    def readable(self) -> bool:
        return True

    def writable(self) -> bool:
        return "r" not in self.mode

    # thin delegation to the underlying file object

    def read(self, *args, **kwargs):
        return self.f.read(*args, **kwargs)

    def write(self, *args, **kwargs):
        return self.f.write(*args, **kwargs)

    def tell(self, *args, **kwargs):
        return self.f.tell(*args, **kwargs)

    def seek(self, *args, **kwargs):
        return self.f.seek(*args, **kwargs)

    def seekable(self, *args, **kwargs):
        return self.f.seekable(*args, **kwargs)

    def readline(self, *args, **kwargs):
        return self.f.readline(*args, **kwargs)

    def readlines(self, *args, **kwargs):
        return self.f.readlines(*args, **kwargs)

    def close(self):
        return self.f.close()

    def truncate(self, size=None) -> int:
        return self.f.truncate(size)

    @property
    def closed(self):
        return self.f.closed

    def fileno(self):
        # NOTE(review): ``self.raw`` resolves through __getattr__ to
        # ``self.f.raw`` (the unbuffered layer of a buffered file object)
        return self.raw.fileno()

    def flush(self) -> None:
        self.f.flush()

    def __iter__(self):
        return self.f.__iter__()

    def __getattr__(self, item):
        # anything not defined here falls through to the wrapped file
        return getattr(self.f, item)

    def __enter__(self):
        self._incontext = True
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._incontext = False
        self.f.__exit__(exc_type, exc_value, traceback)
.venv/lib/python3.11/site-packages/fsspec/implementations/reference.py ADDED
@@ -0,0 +1,1306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import collections
3
+ import io
4
+ import itertools
5
+ import logging
6
+ import math
7
+ import os
8
+ from functools import lru_cache
9
+ from itertools import chain
10
+ from typing import TYPE_CHECKING, Literal
11
+
12
+ import fsspec.core
13
+ from fsspec.spec import AbstractBufferedFile
14
+
15
+ try:
16
+ import ujson as json
17
+ except ImportError:
18
+ if not TYPE_CHECKING:
19
+ import json
20
+
21
+ from fsspec.asyn import AsyncFileSystem
22
+ from fsspec.callbacks import DEFAULT_CALLBACK
23
+ from fsspec.core import filesystem, open, split_protocol
24
+ from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
25
+ from fsspec.utils import isfilelike, merge_offset_ranges, other_paths
26
+
27
+ logger = logging.getLogger("fsspec.reference")
28
+
29
+
30
+ class ReferenceNotReachable(RuntimeError):
31
+ def __init__(self, reference, target, *args):
32
+ super().__init__(*args)
33
+ self.reference = reference
34
+ self.target = target
35
+
36
+ def __str__(self):
37
+ return f'Reference "{self.reference}" failed to fetch target {self.target}'
38
+
39
+
40
+ def _first(d):
41
+ return next(iter(d.values()))
42
+
43
+
44
+ def _prot_in_references(path, references):
45
+ ref = references.get(path)
46
+ if isinstance(ref, (list, tuple)) and isinstance(ref[0], str):
47
+ return split_protocol(ref[0])[0] if ref[0] else ref[0]
48
+
49
+
50
+ def _protocol_groups(paths, references):
51
+ if isinstance(paths, str):
52
+ return {_prot_in_references(paths, references): [paths]}
53
+ out = {}
54
+ for path in paths:
55
+ protocol = _prot_in_references(path, references)
56
+ out.setdefault(protocol, []).append(path)
57
+ return out
58
+
59
+
60
+ class RefsValuesView(collections.abc.ValuesView):
61
+ def __iter__(self):
62
+ for val in self._mapping.zmetadata.values():
63
+ yield json.dumps(val).encode()
64
+ yield from self._mapping._items.values()
65
+ for field in self._mapping.listdir():
66
+ chunk_sizes = self._mapping._get_chunk_sizes(field)
67
+ if len(chunk_sizes) == 0:
68
+ yield self._mapping[field + "/0"]
69
+ continue
70
+ yield from self._mapping._generate_all_records(field)
71
+
72
+
73
+ class RefsItemsView(collections.abc.ItemsView):
74
+ def __iter__(self):
75
+ return zip(self._mapping.keys(), self._mapping.values())
76
+
77
+
78
+ def ravel_multi_index(idx, sizes):
79
+ val = 0
80
+ mult = 1
81
+ for i, s in zip(idx[::-1], sizes[::-1]):
82
+ val += i * mult
83
+ mult *= s
84
+ return val
85
+
86
+
87
+ class LazyReferenceMapper(collections.abc.MutableMapping):
88
+ """This interface can be used to read/write references from Parquet stores.
89
+ It is not intended for other types of references.
90
+ It can be used with Kerchunk's MultiZarrToZarr method to combine
91
+ references into a parquet store.
92
+ Examples of this use-case can be found here:
93
+ https://fsspec.github.io/kerchunk/advanced.html?highlight=parquet#parquet-storage"""
94
+
95
+ # import is class level to prevent numpy dep requirement for fsspec
96
+ @property
97
+ def np(self):
98
+ import numpy as np
99
+
100
+ return np
101
+
102
+ @property
103
+ def pd(self):
104
+ import pandas as pd
105
+
106
+ return pd
107
+
108
+ def __init__(
109
+ self,
110
+ root,
111
+ fs=None,
112
+ out_root=None,
113
+ cache_size=128,
114
+ categorical_threshold=10,
115
+ engine: Literal["fastparquet", "pyarrow"] = "fastparquet",
116
+ ):
117
+ """
118
+
119
+ This instance will be writable, storing changes in memory until full partitions
120
+ are accumulated or .flush() is called.
121
+
122
+ To create an empty lazy store, use .create()
123
+
124
+ Parameters
125
+ ----------
126
+ root : str
127
+ Root of parquet store
128
+ fs : fsspec.AbstractFileSystem
129
+ fsspec filesystem object, default is local filesystem.
130
+ cache_size : int, default=128
131
+ Maximum size of LRU cache, where cache_size*record_size denotes
132
+ the total number of references that can be loaded in memory at once.
133
+ categorical_threshold : int
134
+ Encode urls as pandas.Categorical to reduce memory footprint if the ratio
135
+ of the number of unique urls to total number of refs for each variable
136
+ is greater than or equal to this number. (default 10)
137
+ engine: Literal["fastparquet","pyarrow"]
138
+ Engine choice for reading parquet files. (default is "fastparquet")
139
+ """
140
+
141
+ self.root = root
142
+ self.chunk_sizes = {}
143
+ self.out_root = out_root or self.root
144
+ self.cat_thresh = categorical_threshold
145
+ self.engine = engine
146
+ self.cache_size = cache_size
147
+ self.url = self.root + "/{field}/refs.{record}.parq"
148
+ # TODO: derive fs from `root`
149
+ self.fs = fsspec.filesystem("file") if fs is None else fs
150
+
151
+ from importlib.util import find_spec
152
+
153
+ if self.engine == "pyarrow" and find_spec("pyarrow") is None:
154
+ raise ImportError("engine choice `pyarrow` is not installed.")
155
+
156
+ def __getattr__(self, item):
157
+ if item in ("_items", "record_size", "zmetadata"):
158
+ self.setup()
159
+ # avoid possible recursion if setup fails somehow
160
+ return self.__dict__[item]
161
+ raise AttributeError(item)
162
+
163
+ def setup(self):
164
+ self._items = {}
165
+ self._items[".zmetadata"] = self.fs.cat_file(
166
+ "/".join([self.root, ".zmetadata"])
167
+ )
168
+ met = json.loads(self._items[".zmetadata"])
169
+ self.record_size = met["record_size"]
170
+ self.zmetadata = met["metadata"]
171
+
172
+ # Define function to open and decompress refs
173
+ @lru_cache(maxsize=self.cache_size)
174
+ def open_refs(field, record):
175
+ """cached parquet file loader"""
176
+ path = self.url.format(field=field, record=record)
177
+ data = io.BytesIO(self.fs.cat_file(path))
178
+ try:
179
+ df = self.pd.read_parquet(data, engine=self.engine)
180
+ refs = {c: df[c].to_numpy() for c in df.columns}
181
+ except OSError:
182
+ refs = None
183
+ return refs
184
+
185
+ self.open_refs = open_refs
186
+
187
+ @staticmethod
188
+ def create(root, storage_options=None, fs=None, record_size=10000, **kwargs):
189
+ """Make empty parquet reference set
190
+
191
+ First deletes the contents of the given directory, if it exists.
192
+
193
+ Parameters
194
+ ----------
195
+ root: str
196
+ Directory to contain the output; will be created
197
+ storage_options: dict | None
198
+ For making the filesystem to use for writing is fs is None
199
+ fs: FileSystem | None
200
+ Filesystem for writing
201
+ record_size: int
202
+ Number of references per parquet file
203
+ kwargs: passed to __init__
204
+
205
+ Returns
206
+ -------
207
+ LazyReferenceMapper instance
208
+ """
209
+ met = {"metadata": {}, "record_size": record_size}
210
+ if fs is None:
211
+ fs, root = fsspec.core.url_to_fs(root, **(storage_options or {}))
212
+ if fs.exists(root):
213
+ fs.rm(root, recursive=True)
214
+ fs.makedirs(root, exist_ok=True)
215
+ fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
216
+ return LazyReferenceMapper(root, fs, **kwargs)
217
+
218
+ @lru_cache()
219
+ def listdir(self):
220
+ """List top-level directories"""
221
+ dirs = (p.rsplit("/", 1)[0] for p in self.zmetadata if not p.startswith(".z"))
222
+ return set(dirs)
223
+
224
+ def ls(self, path="", detail=True):
225
+ """Shortcut file listings"""
226
+ path = path.rstrip("/")
227
+ pathdash = path + "/" if path else ""
228
+ dirnames = self.listdir()
229
+ dirs = [
230
+ d
231
+ for d in dirnames
232
+ if d.startswith(pathdash) and "/" not in d.lstrip(pathdash)
233
+ ]
234
+ if dirs:
235
+ others = {
236
+ f
237
+ for f in chain(
238
+ [".zmetadata"],
239
+ (name for name in self.zmetadata),
240
+ (name for name in self._items),
241
+ )
242
+ if f.startswith(pathdash) and "/" not in f.lstrip(pathdash)
243
+ }
244
+ if detail is False:
245
+ others.update(dirs)
246
+ return sorted(others)
247
+ dirinfo = [{"name": name, "type": "directory", "size": 0} for name in dirs]
248
+ fileinfo = [
249
+ {
250
+ "name": name,
251
+ "type": "file",
252
+ "size": len(
253
+ json.dumps(self.zmetadata[name])
254
+ if name in self.zmetadata
255
+ else self._items[name]
256
+ ),
257
+ }
258
+ for name in others
259
+ ]
260
+ return sorted(dirinfo + fileinfo, key=lambda s: s["name"])
261
+ field = path
262
+ others = set(
263
+ [name for name in self.zmetadata if name.startswith(f"{path}/")]
264
+ + [name for name in self._items if name.startswith(f"{path}/")]
265
+ )
266
+ fileinfo = [
267
+ {
268
+ "name": name,
269
+ "type": "file",
270
+ "size": len(
271
+ json.dumps(self.zmetadata[name])
272
+ if name in self.zmetadata
273
+ else self._items[name]
274
+ ),
275
+ }
276
+ for name in others
277
+ ]
278
+ keys = self._keys_in_field(field)
279
+
280
+ if detail is False:
281
+ return list(others) + list(keys)
282
+ recs = self._generate_all_records(field)
283
+ recinfo = [
284
+ {"name": name, "type": "file", "size": rec[-1]}
285
+ for name, rec in zip(keys, recs)
286
+ if rec[0] # filters out path==None, deleted/missing
287
+ ]
288
+ return fileinfo + recinfo
289
+
290
+ def _load_one_key(self, key):
291
+ """Get the reference for one key
292
+
293
+ Returns bytes, one-element list or three-element list.
294
+ """
295
+ if key in self._items:
296
+ return self._items[key]
297
+ elif key in self.zmetadata:
298
+ return json.dumps(self.zmetadata[key]).encode()
299
+ elif "/" not in key or self._is_meta(key):
300
+ raise KeyError(key)
301
+ field, _ = key.rsplit("/", 1)
302
+ record, ri, chunk_size = self._key_to_record(key)
303
+ maybe = self._items.get((field, record), {}).get(ri, False)
304
+ if maybe is None:
305
+ # explicitly deleted
306
+ raise KeyError
307
+ elif maybe:
308
+ return maybe
309
+ elif chunk_size == 0:
310
+ return b""
311
+
312
+ # Chunk keys can be loaded from row group and cached in LRU cache
313
+ try:
314
+ refs = self.open_refs(field, record)
315
+ except (ValueError, TypeError, FileNotFoundError) as exc:
316
+ raise KeyError(key) from exc
317
+ columns = ["path", "offset", "size", "raw"]
318
+ selection = [refs[c][ri] if c in refs else None for c in columns]
319
+ raw = selection[-1]
320
+ if raw is not None:
321
+ return raw
322
+ if selection[0] is None:
323
+ raise KeyError("This reference does not exist or has been deleted")
324
+ if selection[1:3] == [0, 0]:
325
+ # URL only
326
+ return selection[:1]
327
+ # URL, offset, size
328
+ return selection[:3]
329
+
330
+ @lru_cache(4096)
331
+ def _key_to_record(self, key):
332
+ """Details needed to construct a reference for one key"""
333
+ field, chunk = key.rsplit("/", 1)
334
+ chunk_sizes = self._get_chunk_sizes(field)
335
+ if len(chunk_sizes) == 0:
336
+ return 0, 0, 0
337
+ chunk_idx = [int(c) for c in chunk.split(".")]
338
+ chunk_number = ravel_multi_index(chunk_idx, chunk_sizes)
339
+ record = chunk_number // self.record_size
340
+ ri = chunk_number % self.record_size
341
+ return record, ri, len(chunk_sizes)
342
+
343
+ def _get_chunk_sizes(self, field):
344
+ """The number of chunks along each axis for a given field"""
345
+ if field not in self.chunk_sizes:
346
+ zarray = self.zmetadata[f"{field}/.zarray"]
347
+ size_ratio = [
348
+ math.ceil(s / c) for s, c in zip(zarray["shape"], zarray["chunks"])
349
+ ]
350
+ self.chunk_sizes[field] = size_ratio or [1]
351
+ return self.chunk_sizes[field]
352
+
353
+ def _generate_record(self, field, record):
354
+ """The references for a given parquet file of a given field"""
355
+ refs = self.open_refs(field, record)
356
+ it = iter(zip(*refs.values()))
357
+ if len(refs) == 3:
358
+ # All urls
359
+ return (list(t) for t in it)
360
+ elif len(refs) == 1:
361
+ # All raws
362
+ return refs["raw"]
363
+ else:
364
+ # Mix of urls and raws
365
+ return (list(t[:3]) if not t[3] else t[3] for t in it)
366
+
367
+ def _generate_all_records(self, field):
368
+ """Load all the references within a field by iterating over the parquet files"""
369
+ nrec = 1
370
+ for ch in self._get_chunk_sizes(field):
371
+ nrec *= ch
372
+ nrec = math.ceil(nrec / self.record_size)
373
+ for record in range(nrec):
374
+ yield from self._generate_record(field, record)
375
+
376
+ def values(self):
377
+ return RefsValuesView(self)
378
+
379
+ def items(self):
380
+ return RefsItemsView(self)
381
+
382
+ def __hash__(self):
383
+ return id(self)
384
+
385
+ def __getitem__(self, key):
386
+ return self._load_one_key(key)
387
+
388
+ def __setitem__(self, key, value):
389
+ if "/" in key and not self._is_meta(key):
390
+ field, chunk = key.rsplit("/", 1)
391
+ record, i, _ = self._key_to_record(key)
392
+ subdict = self._items.setdefault((field, record), {})
393
+ subdict[i] = value
394
+ if len(subdict) == self.record_size:
395
+ self.write(field, record)
396
+ else:
397
+ # metadata or top-level
398
+ if hasattr(value, "to_bytes"):
399
+ val = value.to_bytes().decode()
400
+ elif isinstance(value, bytes):
401
+ val = value.decode()
402
+ else:
403
+ val = value
404
+ self._items[key] = val
405
+ new_value = json.loads(val)
406
+ self.zmetadata[key] = {**self.zmetadata.get(key, {}), **new_value}
407
+
408
+ @staticmethod
409
+ def _is_meta(key):
410
+ return key.startswith(".z") or "/.z" in key
411
+
412
+ def __delitem__(self, key):
413
+ if key in self._items:
414
+ del self._items[key]
415
+ elif key in self.zmetadata:
416
+ del self.zmetadata[key]
417
+ else:
418
+ if "/" in key and not self._is_meta(key):
419
+ field, _ = key.rsplit("/", 1)
420
+ record, i, _ = self._key_to_record(key)
421
+ subdict = self._items.setdefault((field, record), {})
422
+ subdict[i] = None
423
+ if len(subdict) == self.record_size:
424
+ self.write(field, record)
425
+ else:
426
+ # metadata or top-level
427
+ self._items[key] = None
428
+
429
+ def write(self, field, record, base_url=None, storage_options=None):
430
+ # extra requirements if writing
431
+ import kerchunk.df
432
+ import numpy as np
433
+ import pandas as pd
434
+
435
+ partition = self._items[(field, record)]
436
+ original = False
437
+ if len(partition) < self.record_size:
438
+ try:
439
+ original = self.open_refs(field, record)
440
+ except OSError:
441
+ pass
442
+
443
+ if original:
444
+ paths = original["path"]
445
+ offsets = original["offset"]
446
+ sizes = original["size"]
447
+ raws = original["raw"]
448
+ else:
449
+ paths = np.full(self.record_size, np.nan, dtype="O")
450
+ offsets = np.zeros(self.record_size, dtype="int64")
451
+ sizes = np.zeros(self.record_size, dtype="int64")
452
+ raws = np.full(self.record_size, np.nan, dtype="O")
453
+ for j, data in partition.items():
454
+ if isinstance(data, list):
455
+ if (
456
+ str(paths.dtype) == "category"
457
+ and data[0] not in paths.dtype.categories
458
+ ):
459
+ paths = paths.add_categories(data[0])
460
+ paths[j] = data[0]
461
+ if len(data) > 1:
462
+ offsets[j] = data[1]
463
+ sizes[j] = data[2]
464
+ elif data is None:
465
+ # delete
466
+ paths[j] = None
467
+ offsets[j] = 0
468
+ sizes[j] = 0
469
+ raws[j] = None
470
+ else:
471
+ # this is the only call into kerchunk, could remove
472
+ raws[j] = kerchunk.df._proc_raw(data)
473
+ # TODO: only save needed columns
474
+ df = pd.DataFrame(
475
+ {
476
+ "path": paths,
477
+ "offset": offsets,
478
+ "size": sizes,
479
+ "raw": raws,
480
+ },
481
+ copy=False,
482
+ )
483
+ if df.path.count() / (df.path.nunique() or 1) > self.cat_thresh:
484
+ df["path"] = df["path"].astype("category")
485
+ object_encoding = {"raw": "bytes", "path": "utf8"}
486
+ has_nulls = ["path", "raw"]
487
+
488
+ fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq"
489
+ self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True)
490
+
491
+ if self.engine == "pyarrow":
492
+ df_backend_kwargs = {"write_statistics": False}
493
+ elif self.engine == "fastparquet":
494
+ df_backend_kwargs = {
495
+ "stats": False,
496
+ "object_encoding": object_encoding,
497
+ "has_nulls": has_nulls,
498
+ }
499
+ else:
500
+ raise NotImplementedError(f"{self.engine} not supported")
501
+
502
+ df.to_parquet(
503
+ fn,
504
+ engine=self.engine,
505
+ storage_options=storage_options
506
+ or getattr(self.fs, "storage_options", None),
507
+ compression="zstd",
508
+ index=False,
509
+ **df_backend_kwargs,
510
+ )
511
+
512
+ partition.clear()
513
+ self._items.pop((field, record))
514
+
515
+ def flush(self, base_url=None, storage_options=None):
516
+ """Output any modified or deleted keys
517
+
518
+ Parameters
519
+ ----------
520
+ base_url: str
521
+ Location of the output
522
+ """
523
+
524
+ # write what we have so far and clear sub chunks
525
+ for thing in list(self._items):
526
+ if isinstance(thing, tuple):
527
+ field, record = thing
528
+ self.write(
529
+ field,
530
+ record,
531
+ base_url=base_url,
532
+ storage_options=storage_options,
533
+ )
534
+
535
+ # gather .zmetadata from self._items and write that too
536
+ for k in list(self._items):
537
+ if k != ".zmetadata" and ".z" in k:
538
+ self.zmetadata[k] = json.loads(self._items.pop(k))
539
+ met = {"metadata": self.zmetadata, "record_size": self.record_size}
540
+ self._items.clear()
541
+ self._items[".zmetadata"] = json.dumps(met).encode()
542
+ self.fs.pipe(
543
+ "/".join([base_url or self.out_root, ".zmetadata"]),
544
+ self._items[".zmetadata"],
545
+ )
546
+
547
+ # TODO: only clear those that we wrote to?
548
+ self.open_refs.cache_clear()
549
+
550
+ def __len__(self):
551
+ # Caveat: This counts expected references, not actual - but is fast
552
+ count = 0
553
+ for field in self.listdir():
554
+ if field.startswith("."):
555
+ count += 1
556
+ else:
557
+ count += math.prod(self._get_chunk_sizes(field))
558
+ count += len(self.zmetadata) # all metadata keys
559
+ # any other files not in reference partitions
560
+ count += sum(1 for _ in self._items if not isinstance(_, tuple))
561
+ return count
562
+
563
+ def __iter__(self):
564
+ # Caveat: returns only existing keys, so the number of these does not
565
+ # match len(self)
566
+ metas = set(self.zmetadata)
567
+ metas.update(self._items)
568
+ for bit in metas:
569
+ if isinstance(bit, str):
570
+ yield bit
571
+ for field in self.listdir():
572
+ for k in self._keys_in_field(field):
573
+ if k in self:
574
+ yield k
575
+
576
+ def __contains__(self, item):
577
+ try:
578
+ self._load_one_key(item)
579
+ return True
580
+ except KeyError:
581
+ return False
582
+
583
+ def _keys_in_field(self, field):
584
+ """List key names in given field
585
+
586
+ Produces strings like "field/x.y" appropriate from the chunking of the array
587
+ """
588
+ chunk_sizes = self._get_chunk_sizes(field)
589
+ if len(chunk_sizes) == 0:
590
+ yield field + "/0"
591
+ return
592
+ inds = itertools.product(*(range(i) for i in chunk_sizes))
593
+ for ind in inds:
594
+ yield field + "/" + ".".join([str(c) for c in ind])
595
+
596
+
597
+ class ReferenceFileSystem(AsyncFileSystem):
598
+ """View byte ranges of some other file as a file system
599
+ Initial version: single file system target, which must support
600
+ async, and must allow start and end args in _cat_file. Later versions
601
+ may allow multiple arbitrary URLs for the targets.
602
+ This FileSystem is read-only. It is designed to be used with async
603
+ targets (for now). We do not get original file details from the target FS.
604
+ Configuration is by passing a dict of references at init, or a URL to
605
+ a JSON file containing the same; this dict
606
+ can also contain concrete data for some set of paths.
607
+ Reference dict format:
608
+ {path0: bytes_data, path1: (target_url, offset, size)}
609
+ https://github.com/fsspec/kerchunk/blob/main/README.md
610
+ """
611
+
612
+ protocol = "reference"
613
+ cachable = False
614
+
615
+ def __init__(
616
+ self,
617
+ fo,
618
+ target=None,
619
+ ref_storage_args=None,
620
+ target_protocol=None,
621
+ target_options=None,
622
+ remote_protocol=None,
623
+ remote_options=None,
624
+ fs=None,
625
+ template_overrides=None,
626
+ simple_templates=True,
627
+ max_gap=64_000,
628
+ max_block=256_000_000,
629
+ cache_size=128,
630
+ **kwargs,
631
+ ):
632
+ """
633
+ Parameters
634
+ ----------
635
+ fo : dict or str
636
+ The set of references to use for this instance, with a structure as above.
637
+ If str referencing a JSON file, will use fsspec.open, in conjunction
638
+ with target_options and target_protocol to open and parse JSON at this
639
+ location. If a directory, then assume references are a set of parquet
640
+ files to be loaded lazily.
641
+ target : str
642
+ For any references having target_url as None, this is the default file
643
+ target to use
644
+ ref_storage_args : dict
645
+ If references is a str, use these kwargs for loading the JSON file.
646
+ Deprecated: use target_options instead.
647
+ target_protocol : str
648
+ Used for loading the reference file, if it is a path. If None, protocol
649
+ will be derived from the given path
650
+ target_options : dict
651
+ Extra FS options for loading the reference file ``fo``, if given as a path
652
+ remote_protocol : str
653
+ The protocol of the filesystem on which the references will be evaluated
654
+ (unless fs is provided). If not given, will be derived from the first
655
+ URL that has a protocol in the templates or in the references, in that
656
+ order.
657
+ remote_options : dict
658
+ kwargs to go with remote_protocol
659
+ fs : AbstractFileSystem | dict(str, (AbstractFileSystem | dict))
660
+ Directly provide a file system(s):
661
+ - a single filesystem instance
662
+ - a dict of protocol:filesystem, where each value is either a filesystem
663
+ instance, or a dict of kwargs that can be used to create in
664
+ instance for the given protocol
665
+
666
+ If this is given, remote_options and remote_protocol are ignored.
667
+ template_overrides : dict
668
+ Swap out any templates in the references file with these - useful for
669
+ testing.
670
+ simple_templates: bool
671
+ Whether templates can be processed with simple replace (True) or if
672
+ jinja is needed (False, much slower). All reference sets produced by
673
+ ``kerchunk`` are simple in this sense, but the spec allows for complex.
674
+ max_gap, max_block: int
675
+ For merging multiple concurrent requests to the same remote file.
676
+ Neighboring byte ranges will only be merged when their
677
+ inter-range gap is <= ``max_gap``. Default is 64KB. Set to 0
678
+ to only merge when it requires no extra bytes. Pass a negative
679
+ number to disable merging, appropriate for local target files.
680
+ Neighboring byte ranges will only be merged when the size of
681
+ the aggregated range is <= ``max_block``. Default is 256MB.
682
+ cache_size : int
683
+ Maximum size of LRU cache, where cache_size*record_size denotes
684
+ the total number of references that can be loaded in memory at once.
685
+ Only used for lazily loaded references.
686
+ kwargs : passed to parent class
687
+ """
688
+ super().__init__(**kwargs)
689
+ self.target = target
690
+ self.template_overrides = template_overrides
691
+ self.simple_templates = simple_templates
692
+ self.templates = {}
693
+ self.fss = {}
694
+ self._dircache = {}
695
+ self.max_gap = max_gap
696
+ self.max_block = max_block
697
+ if isinstance(fo, str):
698
+ dic = dict(
699
+ **(ref_storage_args or target_options or {}), protocol=target_protocol
700
+ )
701
+ ref_fs, fo2 = fsspec.core.url_to_fs(fo, **dic)
702
+ if ref_fs.isfile(fo2):
703
+ # text JSON
704
+ with fsspec.open(fo, "rb", **dic) as f:
705
+ logger.info("Read reference from URL %s", fo)
706
+ text = json.load(f)
707
+ self._process_references(text, template_overrides)
708
+ else:
709
+ # Lazy parquet refs
710
+ logger.info("Open lazy reference dict from URL %s", fo)
711
+ self.references = LazyReferenceMapper(
712
+ fo2,
713
+ fs=ref_fs,
714
+ cache_size=cache_size,
715
+ )
716
+ else:
717
+ # dictionaries
718
+ self._process_references(fo, template_overrides)
719
+ if isinstance(fs, dict):
720
+ self.fss = {
721
+ k: (
722
+ fsspec.filesystem(k.split(":", 1)[0], **opts)
723
+ if isinstance(opts, dict)
724
+ else opts
725
+ )
726
+ for k, opts in fs.items()
727
+ }
728
+ if None not in self.fss:
729
+ self.fss[None] = filesystem("file")
730
+ return
731
+ if fs is not None:
732
+ # single remote FS
733
+ remote_protocol = (
734
+ fs.protocol[0] if isinstance(fs.protocol, tuple) else fs.protocol
735
+ )
736
+ self.fss[remote_protocol] = fs
737
+
738
+ if remote_protocol is None:
739
+ # get single protocol from any templates
740
+ for ref in self.templates.values():
741
+ if callable(ref):
742
+ ref = ref()
743
+ protocol, _ = fsspec.core.split_protocol(ref)
744
+ if protocol and protocol not in self.fss:
745
+ fs = filesystem(protocol, **(remote_options or {}))
746
+ self.fss[protocol] = fs
747
+ if remote_protocol is None:
748
+ # get single protocol from references
749
+ # TODO: warning here, since this can be very expensive?
750
+ for ref in self.references.values():
751
+ if callable(ref):
752
+ ref = ref()
753
+ if isinstance(ref, list) and ref[0]:
754
+ protocol, _ = fsspec.core.split_protocol(ref[0])
755
+ if protocol not in self.fss:
756
+ fs = filesystem(protocol, **(remote_options or {}))
757
+ self.fss[protocol] = fs
758
+ # only use first remote URL
759
+ break
760
+
761
+ if remote_protocol and remote_protocol not in self.fss:
762
+ fs = filesystem(remote_protocol, **(remote_options or {}))
763
+ self.fss[remote_protocol] = fs
764
+
765
+ self.fss[None] = fs or filesystem("file") # default one
766
+ # Wrap any non-async filesystems to ensure async methods are available below
767
+ for k, f in self.fss.items():
768
+ if not f.async_impl:
769
+ self.fss[k] = AsyncFileSystemWrapper(f)
770
+ elif self.asynchronous ^ f.asynchronous:
771
+ raise ValueError(
772
+ "Reference-FS's target filesystem must have same value"
773
+ "of asynchronous"
774
+ )
775
+
776
+ def _cat_common(self, path, start=None, end=None):
777
+ path = self._strip_protocol(path)
778
+ logger.debug(f"cat: {path}")
779
+ try:
780
+ part = self.references[path]
781
+ except KeyError as exc:
782
+ raise FileNotFoundError(path) from exc
783
+ if isinstance(part, str):
784
+ part = part.encode()
785
+ if hasattr(part, "to_bytes"):
786
+ part = part.to_bytes()
787
+ if isinstance(part, bytes):
788
+ logger.debug(f"Reference: {path}, type bytes")
789
+ if part.startswith(b"base64:"):
790
+ part = base64.b64decode(part[7:])
791
+ return part, None, None
792
+
793
+ if len(part) == 1:
794
+ logger.debug(f"Reference: {path}, whole file => {part}")
795
+ url = part[0]
796
+ start1, end1 = start, end
797
+ else:
798
+ url, start0, size = part
799
+ logger.debug(f"Reference: {path} => {url}, offset {start0}, size {size}")
800
+ end0 = start0 + size
801
+
802
+ if start is not None:
803
+ if start >= 0:
804
+ start1 = start0 + start
805
+ else:
806
+ start1 = end0 + start
807
+ else:
808
+ start1 = start0
809
+ if end is not None:
810
+ if end >= 0:
811
+ end1 = start0 + end
812
+ else:
813
+ end1 = end0 + end
814
+ else:
815
+ end1 = end0
816
+ if url is None:
817
+ url = self.target
818
+ return url, start1, end1
819
+
820
+ async def _cat_file(self, path, start=None, end=None, **kwargs):
821
+ part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
822
+ if isinstance(part_or_url, bytes):
823
+ return part_or_url[start:end]
824
+ protocol, _ = split_protocol(part_or_url)
825
+ try:
826
+ return await self.fss[protocol]._cat_file(
827
+ part_or_url, start=start0, end=end0
828
+ )
829
+ except Exception as e:
830
+ raise ReferenceNotReachable(path, part_or_url) from e
831
+
832
+ def cat_file(self, path, start=None, end=None, **kwargs):
833
+ part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
834
+ if isinstance(part_or_url, bytes):
835
+ return part_or_url[start:end]
836
+ protocol, _ = split_protocol(part_or_url)
837
+ try:
838
+ return self.fss[protocol].cat_file(part_or_url, start=start0, end=end0)
839
+ except Exception as e:
840
+ raise ReferenceNotReachable(path, part_or_url) from e
841
+
842
+ def pipe_file(self, path, value, **_):
843
+ """Temporarily add binary data or reference as a file"""
844
+ self.references[path] = value
845
+
846
+ async def _get_file(self, rpath, lpath, **kwargs):
847
+ if self.isdir(rpath):
848
+ return os.makedirs(lpath, exist_ok=True)
849
+ data = await self._cat_file(rpath)
850
+ with open(lpath, "wb") as f:
851
+ f.write(data)
852
+
853
+ def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, **kwargs):
854
+ if self.isdir(rpath):
855
+ return os.makedirs(lpath, exist_ok=True)
856
+ data = self.cat_file(rpath, **kwargs)
857
+ callback.set_size(len(data))
858
+ if isfilelike(lpath):
859
+ lpath.write(data)
860
+ else:
861
+ with open(lpath, "wb") as f:
862
+ f.write(data)
863
+ callback.absolute_update(len(data))
864
+
865
+ def get(self, rpath, lpath, recursive=False, **kwargs):
866
+ if recursive:
867
+ # trigger directory build
868
+ self.ls("")
869
+ rpath = self.expand_path(rpath, recursive=recursive)
870
+ fs = fsspec.filesystem("file", auto_mkdir=True)
871
+ targets = other_paths(rpath, lpath)
872
+ if recursive:
873
+ data = self.cat([r for r in rpath if not self.isdir(r)])
874
+ else:
875
+ data = self.cat(rpath)
876
+ for remote, local in zip(rpath, targets):
877
+ if remote in data:
878
+ fs.pipe_file(local, data[remote])
879
+
880
    def cat(self, path, recursive=False, on_error="raise", **kwargs):
        """Fetch (potentially multiple) paths' contents, grouped by protocol.

        Byte ranges targeting the same underlying URL are merged (subject to
        ``max_gap``/``max_block``) and fetched in consolidated requests, then
        sliced back out per key. ``on_error`` may be "raise", "omit", or
        anything else to return the exception in place of data.

        Recursive/glob listings are not supported here.
        """
        if isinstance(path, str) and recursive:
            raise NotImplementedError
        if isinstance(path, list) and (recursive or any("*" in p for p in path)):
            raise NotImplementedError
        # TODO: if references is lazy, pre-fetch all paths in batch before access
        proto_dict = _protocol_groups(path, self.references)
        out = {}
        for proto, paths in proto_dict.items():
            fs = self.fss[proto]
            urls, starts, ends, valid_paths = [], [], [], []
            for p in paths:
                # find references or label not-found. Early exit if any not
                # found and on_error is "raise"
                try:
                    u, s, e = self._cat_common(p)
                    if not isinstance(u, (bytes, str)):
                        # nan/None from parquet
                        continue
                except FileNotFoundError as err:
                    if on_error == "raise":
                        raise
                    if on_error != "omit":
                        out[p] = err
                else:
                    urls.append(u)
                    starts.append(s)
                    ends.append(e)
                    valid_paths.append(p)

            # process references into form for merging
            urls2 = []
            starts2 = []
            ends2 = []
            paths2 = []
            whole_files = set()
            for u, s, e, p in zip(urls, starts, ends, valid_paths):
                if isinstance(u, bytes):
                    # data
                    out[p] = u
                elif s is None:
                    # whole file - limits are None, None, but no further
                    # entries take for this file
                    whole_files.add(u)
                    urls2.append(u)
                    starts2.append(s)
                    ends2.append(e)
                    paths2.append(p)
            for u, s, e, p in zip(urls, starts, ends, valid_paths):
                # second run to account for files that are to be loaded whole
                if s is not None and u not in whole_files:
                    urls2.append(u)
                    starts2.append(s)
                    ends2.append(e)
                    paths2.append(p)

            # merge and fetch consolidated ranges
            new_paths, new_starts, new_ends = merge_offset_ranges(
                list(urls2),
                list(starts2),
                list(ends2),
                sort=True,
                max_gap=self.max_gap,
                max_block=self.max_block,
            )
            bytes_out = fs.cat_ranges(new_paths, new_starts, new_ends)

            # unbundle from merged bytes - simple approach
            for u, s, e, p in zip(urls, starts, ends, valid_paths):
                if p in out:
                    continue  # was bytes, already handled
                for np, ns, ne, b in zip(new_paths, new_starts, new_ends, bytes_out):
                    if np == u and (ns is None or ne is None):
                        # whole-file fetch covers this key
                        if isinstance(b, Exception):
                            out[p] = b
                        else:
                            out[p] = b[s:e]
                    elif np == u and s >= ns and e <= ne:
                        # key's range sits inside this merged chunk
                        if isinstance(b, Exception):
                            out[p] = b
                        else:
                            out[p] = b[s - ns : (e - ne) or None]

        for k, v in out.copy().items():
            # these were valid references, but fetch failed, so transform exc
            if isinstance(v, Exception) and k in self.references:
                ex = out[k]
                new_ex = ReferenceNotReachable(k, self.references[k])
                new_ex.__cause__ = ex
                if on_error == "raise":
                    raise new_ex
                elif on_error != "omit":
                    out[k] = new_ex

        if len(out) == 1 and isinstance(path, str) and "*" not in path:
            # single plain path in -> bare bytes out, matching fsspec convention
            return _first(out)
        return out
977
+
978
+ def _process_references(self, references, template_overrides=None):
979
+ vers = references.get("version", None)
980
+ if vers is None:
981
+ self._process_references0(references)
982
+ elif vers == 1:
983
+ self._process_references1(references, template_overrides=template_overrides)
984
+ else:
985
+ raise ValueError(f"Unknown reference spec version: {vers}")
986
+ # TODO: we make dircache by iterating over all entries, but for Spec >= 1,
987
+ # can replace with programmatic. Is it even needed for mapper interface?
988
+
989
+ def _process_references0(self, references):
990
+ """Make reference dict for Spec Version 0"""
991
+ if isinstance(references, dict):
992
+ # do not do this for lazy/parquet backend, which will not make dicts,
993
+ # but must remain writable in the original object
994
+ references = {
995
+ key: json.dumps(val) if isinstance(val, dict) else val
996
+ for key, val in references.items()
997
+ }
998
+ self.references = references
999
+
1000
+ def _process_references1(self, references, template_overrides=None):
1001
+ if not self.simple_templates or self.templates:
1002
+ import jinja2
1003
+ self.references = {}
1004
+ self._process_templates(references.get("templates", {}))
1005
+
1006
+ @lru_cache(1000)
1007
+ def _render_jinja(u):
1008
+ return jinja2.Template(u).render(**self.templates)
1009
+
1010
+ for k, v in references.get("refs", {}).items():
1011
+ if isinstance(v, str):
1012
+ if v.startswith("base64:"):
1013
+ self.references[k] = base64.b64decode(v[7:])
1014
+ self.references[k] = v
1015
+ elif isinstance(v, dict):
1016
+ self.references[k] = json.dumps(v)
1017
+ elif self.templates:
1018
+ u = v[0]
1019
+ if "{{" in u:
1020
+ if self.simple_templates:
1021
+ u = (
1022
+ u.replace("{{", "{")
1023
+ .replace("}}", "}")
1024
+ .format(**self.templates)
1025
+ )
1026
+ else:
1027
+ u = _render_jinja(u)
1028
+ self.references[k] = [u] if len(v) == 1 else [u, v[1], v[2]]
1029
+ else:
1030
+ self.references[k] = v
1031
+ self.references.update(self._process_gen(references.get("gen", [])))
1032
+
1033
+ def _process_templates(self, tmp):
1034
+ self.templates = {}
1035
+ if self.template_overrides is not None:
1036
+ tmp.update(self.template_overrides)
1037
+ for k, v in tmp.items():
1038
+ if "{{" in v:
1039
+ import jinja2
1040
+
1041
+ self.templates[k] = lambda temp=v, **kwargs: jinja2.Template(
1042
+ temp
1043
+ ).render(**kwargs)
1044
+ else:
1045
+ self.templates[k] = v
1046
+
1047
    def _process_gen(self, gens):
        """Expand Spec-1 "gen" entries into concrete references.

        Each generator declares dimensions (lists or start/stop/step ranges);
        the cartesian product of dimension values is rendered through the
        jinja ``key``/``url`` (and optional ``offset``/``length``) templates.

        Returns a dict of key -> [url] or [url, offset, length].
        """
        out = {}
        for gen in gens:
            dimension = {
                k: (
                    v
                    if isinstance(v, list)
                    else range(v.get("start", 0), v["stop"], v.get("step", 1))
                )
                for k, v in gen["dimensions"].items()
            }
            # one dict of dimension-name -> value per point of the product
            products = (
                dict(zip(dimension.keys(), values))
                for values in itertools.product(*dimension.values())
            )
            for pr in products:
                import jinja2

                key = jinja2.Template(gen["key"]).render(**pr, **self.templates)
                url = jinja2.Template(gen["url"]).render(**pr, **self.templates)
                if ("offset" in gen) and ("length" in gen):
                    offset = int(
                        jinja2.Template(gen["offset"]).render(**pr, **self.templates)
                    )
                    length = int(
                        jinja2.Template(gen["length"]).render(**pr, **self.templates)
                    )
                    out[key] = [url, offset, length]
                elif ("offset" in gen) ^ ("length" in gen):
                    raise ValueError(
                        "Both 'offset' and 'length' are required for a "
                        "reference generator entry if either is provided."
                    )
                else:
                    # neither given: reference the whole file
                    out[key] = [url]
        return out
1083
+
1084
    def _dircache_from_items(self):
        """Build the directory listing cache from the flat reference keys.

        Walks every key, infers its size (inline data -> len; 1-element
        reference -> unknown; 3-element -> explicit size), and registers each
        previously-unseen parent directory on the way up to the root.
        """
        self.dircache = {"": []}
        it = self.references.items()
        for path, part in it:
            if isinstance(part, (bytes, str)) or hasattr(part, "to_bytes"):
                # inline data: size is its length
                size = len(part)
            elif len(part) == 1:
                # whole-file reference: size unknown until queried
                size = None
            else:
                _, _, size = part
            par = path.rsplit("/", 1)[0] if "/" in path else ""
            par0 = par
            subdirs = [par0]
            while par0 and par0 not in self.dircache:
                # collect parent directories
                par0 = self._parent(par0)
                subdirs.append(par0)

            subdirs.reverse()
            for parent, child in zip(subdirs, subdirs[1:]):
                # register newly discovered directories
                assert child not in self.dircache
                assert parent in self.dircache
                self.dircache[parent].append(
                    {"name": child, "type": "directory", "size": 0}
                )
                self.dircache[child] = []

            self.dircache[par].append({"name": path, "type": "file", "size": size})
1113
+
1114
    def _open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
        """Open one reference key as a file-like object.

        Inline data becomes an in-memory ``BytesIO``; whole-file references
        open the target directly on the backing filesystem; ranged references
        are wrapped in ``ReferenceFile``.
        """
        part_or_url, start0, end0 = self._cat_common(path)
        # This logic is kept outside `ReferenceFile` to avoid unnecessary redirection.
        # That does mean `_cat_common` gets called twice if it eventually reaches `ReferenceFile`.
        if isinstance(part_or_url, bytes):
            return io.BytesIO(part_or_url[start0:end0])

        protocol, _ = split_protocol(part_or_url)
        if start0 is None and end0 is None:
            # whole-file reference: delegate straight to the backing FS
            return self.fss[protocol]._open(
                part_or_url,
                mode,
                block_size=block_size,
                cache_options=cache_options,
                **kwargs,
            )

        return ReferenceFile(
            self,
            path,
            mode,
            block_size=block_size,
            cache_options=cache_options,
            **kwargs,
        )
1139
+
1140
    def ls(self, path, detail=True, **kwargs):
        """List a directory of reference keys.

        Lazy (parquet-backed) reference sets answer directly; otherwise the
        in-memory dircache is built on first use and consulted.
        """
        logger.debug("list %s", path)
        path = self._strip_protocol(path)
        if isinstance(self.references, LazyReferenceMapper):
            try:
                return self.references.ls(path, detail)
            except KeyError:
                pass
            raise FileNotFoundError(f"'{path}' is not a known key")
        if not self.dircache:
            self._dircache_from_items()
        out = self._ls_from_cache(path)
        if out is None:
            raise FileNotFoundError(path)
        if detail:
            return out
        return [o["name"] for o in out]
1157
+
1158
+ def exists(self, path, **kwargs): # overwrite auto-sync version
1159
+ return self.isdir(path) or self.isfile(path)
1160
+
1161
+ def isdir(self, path): # overwrite auto-sync version
1162
+ if self.dircache:
1163
+ return path in self.dircache
1164
+ elif isinstance(self.references, LazyReferenceMapper):
1165
+ return path in self.references.listdir()
1166
+ else:
1167
+ # this may be faster than building dircache for single calls, but
1168
+ # by looping will be slow for many calls; could cache it?
1169
+ return any(_.startswith(f"{path}/") for _ in self.references)
1170
+
1171
+ def isfile(self, path): # overwrite auto-sync version
1172
+ return path in self.references
1173
+
1174
    async def _ls(self, path, detail=True, **kwargs):  # calls fast sync code
        """Async wrapper: listing is in-memory, so just call the sync ``ls``."""
        return self.ls(path, detail, **kwargs)
1176
+
1177
    def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
        """List all file keys under ``path``.

        Without ``withdirs`` this is a simple sorted prefix scan over the
        reference keys; with ``withdirs`` (or for detail entries) the generic
        implementation / dircache is used.
        """
        if withdirs:
            return super().find(
                path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs
            )
        if path:
            path = self._strip_protocol(path)
            r = sorted(k for k in self.references if k.startswith(path))
        else:
            r = sorted(self.references)
        if detail:
            if not self.dircache:
                self._dircache_from_items()
            return {k: self._ls_from_cache(k)[0] for k in r}
        else:
            return r
1193
+
1194
    def info(self, path, **kwargs):
        """Details for one key: file (with size when known) or directory.

        For whole-file references (single-element list), the size is resolved
        by asking the backing filesystem for the target's size.
        """
        out = self.references.get(path)
        if out is not None:
            if isinstance(out, (str, bytes)):
                # decode base64 here
                return {"name": path, "type": "file", "size": len(out)}
            elif len(out) > 1:
                # [url, offset, size] reference: size is explicit
                return {"name": path, "type": "file", "size": out[2]}
            else:
                # whole-file reference: size unknown until queried below
                out0 = [{"name": path, "type": "file", "size": None}]
        else:
            # not a file key: try it as a directory listing
            out = self.ls(path, True)
            out0 = [o for o in out if o["name"] == path]
            if not out0:
                return {"name": path, "type": "directory", "size": 0}
        if out0[0]["size"] is None:
            # if this is a whole remote file, update size using remote FS
            prot, _ = split_protocol(self.references[path][0])
            out0[0]["size"] = self.fss[prot].size(self.references[path][0])
        return out0[0]
1214
+
1215
    async def _info(self, path, **kwargs):  # calls fast sync code
        """Async wrapper over the in-memory sync ``info``."""
        return self.info(path)
1217
+
1218
    async def _rm_file(self, path, **kwargs):
        """Drop one key from the reference set and invalidate the dircache."""
        self.references.pop(
            path, None
        )  # ignores FileNotFound, just as well for directories
        self.dircache.clear()  # this is a bit heavy handed
1223
+
1224
    async def _pipe_file(self, path, data, mode="overwrite", **kwargs):
        """Store ``data`` under ``path`` in the reference set.

        With ``mode="create"``, refuse to overwrite an existing key.
        """
        if mode == "create" and self.exists(path):
            raise FileExistsError
        # can be str or bytes
        self.references[path] = data
        self.dircache.clear()  # this is a bit heavy handed
1230
+
1231
    async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs):
        """Read a local file fully and store its bytes under ``rpath``.

        With ``mode="create"``, refuse to overwrite an existing key.
        """
        # puts binary
        if mode == "create" and self.exists(rpath):
            raise FileExistsError
        with open(lpath, "rb") as f:
            self.references[rpath] = f.read()
        self.dircache.clear()  # this is a bit heavy handed
1238
+
1239
    def save_json(self, url, **storage_options):
        """Write modified references into new location.

        Bytes values are stored as ASCII when possible, otherwise as
        ``base64:``-prefixed text; output is a version-1 JSON spec.
        """
        out = {}
        for k, v in self.references.items():
            if isinstance(v, bytes):
                try:
                    out[k] = v.decode("ascii")
                except UnicodeDecodeError:
                    # non-ASCII payload: round-trippable base64 form
                    out[k] = (b"base64:" + base64.b64encode(v)).decode()
            else:
                out[k] = v
        with fsspec.open(url, "wb", **storage_options) as f:
            f.write(json.dumps({"version": 1, "refs": out}).encode())
1252
+
1253
+
1254
class ReferenceFile(AbstractBufferedFile):
    """Buffered file view onto a ranged reference.

    Reads are translated into seeks/reads on a lazily-opened file of the
    underlying target URL, offset by the reference's (start, end) window.
    """

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        size=None,
        **kwargs,
    ):
        super().__init__(
            fs,
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            size=size,
            cache_type=cache_type,
            cache_options=cache_options,
            **kwargs,
        )
        # resolve the reference to its target URL and byte window
        part_or_url, self.start, self.end = self.fs._cat_common(self.path)
        protocol, _ = split_protocol(part_or_url)
        self.src_fs = self.fs.fss[protocol]
        self.src_path = part_or_url
        self._f = None

    @property
    def f(self):
        # lazily (re)open the underlying file; no caching on the inner file,
        # this wrapper's own cache handles read-ahead
        if self._f is None or self._f.closed:
            self._f = self.src_fs._open(
                self.src_path,
                mode=self.mode,
                block_size=self.blocksize,
                autocommit=self.autocommit,
                cache_type="none",
                **self.kwargs,
            )
        return self._f

    def close(self):
        if self._f is not None:
            self._f.close()
        return super().close()

    def _fetch_range(self, start, end):
        # shift the requested window by the reference offset and clamp to
        # the reference's end
        start = start + self.start
        end = min(end + self.start, self.end)
        self.f.seek(start)
        return self.f.read(end - start)
.venv/lib/python3.11/site-packages/fsspec/implementations/sftp.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import logging
3
+ import os
4
+ import types
5
+ import uuid
6
+ from stat import S_ISDIR, S_ISLNK
7
+
8
+ import paramiko
9
+
10
+ from .. import AbstractFileSystem
11
+ from ..utils import infer_storage_options
12
+
13
+ logger = logging.getLogger("fsspec.sftp")
14
+
15
+
16
class SFTPFileSystem(AbstractFileSystem):
    """Files over SFTP/SSH

    Peer-to-peer filesystem over SSH using paramiko.

    Note: if using this with the ``open`` or ``open_files``, with full URLs,
    there is no way to tell if a path is relative, so all paths are assumed
    to be absolute.
    """

    protocol = "sftp", "ssh"

    def __init__(self, host, **ssh_kwargs):
        """
        Parameters
        ----------
        host: str
            Hostname or IP as a string
        temppath: str
            Location on the server to put files, when within a transaction
        ssh_kwargs: dict
            Parameters passed on to connection. See details in
            https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
            May include port, username, password...
        """
        if self._cached:
            # instance reused from the caching layer; already connected
            return
        super().__init__(**ssh_kwargs)
        self.temppath = ssh_kwargs.pop("temppath", "/tmp")  # remote temp directory
        self.host = host
        self.ssh_kwargs = ssh_kwargs
        self._connect()

    def _connect(self):
        """Open the SSH session and an SFTP channel on it."""
        logger.debug("Connecting to SFTP server %s", self.host)
        self.client = paramiko.SSHClient()
        self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.client.connect(self.host, **self.ssh_kwargs)
        self.ftp = self.client.open_sftp()

    @classmethod
    def _strip_protocol(cls, path):
        return infer_storage_options(path)["path"]

    @staticmethod
    def _get_kwargs_from_urls(urlpath):
        # URL components other than the path become constructor kwargs
        out = infer_storage_options(urlpath)
        out.pop("path", None)
        out.pop("protocol", None)
        return out

    def mkdir(self, path, create_parents=True, mode=511):
        """Create a directory; with ``create_parents`` missing ancestors are
        made too.

        NOTE(review): when ``create_parents`` is true, ``makedirs`` is called
        without forwarding ``mode`` — confirm whether that is intended.
        """
        logger.debug("Creating folder %s", path)
        if self.exists(path):
            raise FileExistsError(f"File exists: {path}")

        if create_parents:
            self.makedirs(path)
        else:
            self.ftp.mkdir(path, mode)

    def makedirs(self, path, exist_ok=False, mode=511):
        """Create a directory and any missing parents."""
        if self.exists(path) and not exist_ok:
            raise FileExistsError(f"File exists: {path}")

        parts = path.split("/")
        new_path = "/" if path[:1] == "/" else ""

        for part in parts:
            if part:
                # NOTE(review): for absolute paths this yields a leading
                # double slash ("//a"); most servers treat it as "/a" —
                # confirm against target servers.
                new_path = f"{new_path}/{part}" if new_path else part
                if not self.exists(new_path):
                    self.ftp.mkdir(new_path, mode)

    def rmdir(self, path):
        """Remove an (empty) remote directory."""
        logger.debug("Removing folder %s", path)
        self.ftp.rmdir(path)

    def info(self, path):
        """Stat one remote path; returns an fsspec-style info dict."""
        stat = self._decode_stat(self.ftp.stat(path))
        stat["name"] = path
        return stat

    @staticmethod
    def _decode_stat(stat, parent_path=None):
        """Convert a paramiko stat result into an fsspec info dict.

        ``parent_path`` (from ``ls``) is joined with the entry's filename to
        build the full name; otherwise the caller fills in ``name``.
        """
        if S_ISDIR(stat.st_mode):
            t = "directory"
        elif S_ISLNK(stat.st_mode):
            t = "link"
        else:
            t = "file"
        out = {
            "name": "",
            "size": stat.st_size,
            "type": t,
            "uid": stat.st_uid,
            "gid": stat.st_gid,
            "time": datetime.datetime.fromtimestamp(
                stat.st_atime, tz=datetime.timezone.utc
            ),
            "mtime": datetime.datetime.fromtimestamp(
                stat.st_mtime, tz=datetime.timezone.utc
            ),
        }
        if parent_path:
            out["name"] = "/".join([parent_path.rstrip("/"), stat.filename])
        return out

    def ls(self, path, detail=False):
        """List a remote directory; sorted names, or info dicts with
        ``detail``."""
        logger.debug("Listing folder %s", path)
        stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
        if detail:
            return stats
        else:
            paths = [stat["name"] for stat in stats]
            return sorted(paths)

    def put(self, lpath, rpath, callback=None, **kwargs):
        """Upload one local file to ``rpath`` (``callback`` is accepted but
        not wired to paramiko's progress hook)."""
        logger.debug("Put file %s into %s", lpath, rpath)
        self.ftp.put(lpath, rpath)

    def get_file(self, rpath, lpath, **kwargs):
        """Download one remote file, or create the local directory for a
        remote directory entry."""
        if self.isdir(rpath):
            os.makedirs(lpath, exist_ok=True)
        else:
            self.ftp.get(self._strip_protocol(rpath), lpath)

    def _open(self, path, mode="rb", block_size=None, **kwargs):
        """
        block_size: int or None
            If 0, no buffering, if 1, line buffering, if >1, buffer that many
            bytes, if None use default from paramiko.
        """
        logger.debug("Opening file %s", path)
        if kwargs.get("autocommit", True) is False:
            # writes to temporary file, move on commit
            path2 = "/".join([self.temppath, str(uuid.uuid4())])
            f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
            f.temppath = path2
            f.targetpath = path
            f.fs = self
            f.commit = types.MethodType(commit_a_file, f)
            f.discard = types.MethodType(discard_a_file, f)
        else:
            f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
        return f

    def _rm(self, path):
        """Remove one file or (empty) directory."""
        if self.isdir(path):
            self.ftp.rmdir(path)
        else:
            self.ftp.remove(path)

    def mv(self, old, new):
        """Rename a remote path (POSIX rename: atomically replaces target)."""
        logger.debug("Renaming %s into %s", old, new)
        self.ftp.posix_rename(old, new)
173
+
174
+
175
def commit_a_file(self):
    """Finalise a deferred write: move the temp file to its target path."""
    source, destination = self.temppath, self.targetpath
    self.fs.mv(source, destination)
177
+
178
+
179
+ def discard_a_file(self):
180
+ self.fs._rm(self.temppath)
.venv/lib/python3.11/site-packages/fsspec/implementations/tar.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import tarfile
3
+
4
+ import fsspec
5
+ from fsspec.archive import AbstractArchiveFileSystem
6
+ from fsspec.compression import compr
7
+ from fsspec.utils import infer_compression
8
+
9
+ typemap = {b"0": "file", b"5": "directory"}
10
+
11
+ logger = logging.getLogger("tar")
12
+
13
+
14
class TarFileSystem(AbstractArchiveFileSystem):
    """Compressed Tar archives as a file-system (read-only)

    Supports the following formats:
    tar.gz, tar.bz2, tar.xz
    """

    root_marker = ""
    protocol = "tar"
    cachable = False

    def __init__(
        self,
        fo="",
        index_store=None,
        target_options=None,
        target_protocol=None,
        compression=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        fo: str or file-like
            Target archive: a URL (opened via fsspec with
            ``target_protocol``/``target_options``) or an already-open
            file-like object.
        index_store:
            Reserved for persisting the member index (currently unused).
        compression: str or None
            Explicit compression; when None it is inferred from the file
            name if one can be determined.
        """
        super().__init__(**kwargs)
        target_options = target_options or {}

        if isinstance(fo, str):
            self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
            fo = self.of.open()  # keep the reference

        # Try to infer compression.
        if compression is None:
            name = None

            # Try different ways to get hold of the filename. `fo` might either
            # be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
            # `fsspec.AbstractFileSystem` instance.
            try:
                # Amended io.BufferedReader or similar.
                # This uses a "protocol extension" where original filenames are
                # propagated to archive-like filesystems in order to let them
                # infer the right compression appropriately.
                if hasattr(fo, "original"):
                    name = fo.original

                # fsspec.LocalFileOpener
                elif hasattr(fo, "path"):
                    name = fo.path

                # io.BufferedReader
                elif hasattr(fo, "name"):
                    name = fo.name

                # fsspec.AbstractFileSystem
                elif hasattr(fo, "info"):
                    name = fo.info()["name"]

            except Exception as ex:
                logger.warning(
                    f"Unable to determine file name, not inferring compression: {ex}"
                )

            if name is not None:
                compression = infer_compression(name)
                logger.info(f"Inferred compression {compression} from file name {name}")

        if compression is not None:
            # TODO: tarfile already implements compression with modes like "'r:gz'",
            # but then would seek to offset in the file work?
            fo = compr[compression](fo)

        self._fo_ref = fo
        self.fo = fo  # the whole instance is a context
        self.tar = tarfile.TarFile(fileobj=self.fo)
        self.dir_cache = None

        self.index_store = index_store
        self.index = None
        self._index()

    def _index(self):
        """Build name -> (info, data offset) map over all archive members."""
        # TODO: load and set saved index, if exists
        out = {}
        for ti in self.tar:
            info = ti.get_info()
            info["type"] = typemap.get(info["type"], "file")
            name = ti.get_info()["name"].rstrip("/")
            out[name] = (info, ti.offset_data)

        self.index = out
        # TODO: save index to self.index_store here, if set

    def _get_dirs(self):
        """Populate ``dir_cache`` with directory entries plus every member."""
        if self.dir_cache is not None:
            return

        # This enables ls to get directories as children as well as files
        self.dir_cache = {
            dirname: {"name": dirname, "size": 0, "type": "directory"}
            for dirname in self._all_dirnames(self.tar.getnames())
        }
        for member in self.tar.getmembers():
            info = member.get_info()
            info["name"] = info["name"].rstrip("/")
            info["type"] = typemap.get(info["type"], "file")
            self.dir_cache[info["name"]] = info

    def _open(self, path, mode="rb", **kwargs):
        """Open one regular member for reading via ``tarfile.extractfile``."""
        if mode != "rb":
            raise ValueError("Read-only filesystem implementation")
        details, offset = self.index[path]
        if details["type"] != "file":
            raise ValueError("Can only handle regular files")
        return self.tar.extractfile(path)
.venv/lib/python3.11/site-packages/fsspec/implementations/webhdfs.py ADDED
@@ -0,0 +1,485 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
2
+
3
+ import logging
4
+ import os
5
+ import secrets
6
+ import shutil
7
+ import tempfile
8
+ import uuid
9
+ from contextlib import suppress
10
+ from urllib.parse import quote
11
+
12
+ import requests
13
+
14
+ from ..spec import AbstractBufferedFile, AbstractFileSystem
15
+ from ..utils import infer_storage_options, tokenize
16
+
17
+ logger = logging.getLogger("webhdfs")
18
+
19
+
20
+ class WebHDFS(AbstractFileSystem):
21
+ """
22
+ Interface to HDFS over HTTP using the WebHDFS API. Supports also HttpFS gateways.
23
+
24
+ Four auth mechanisms are supported:
25
+
26
+ insecure: no auth is done, and the user is assumed to be whoever they
27
+ say they are (parameter ``user``), or a predefined value such as
28
+ "dr.who" if not given
29
+ spnego: when kerberos authentication is enabled, auth is negotiated by
30
+ requests_kerberos https://github.com/requests/requests-kerberos .
31
+ This establishes a session based on existing kinit login and/or
32
+ specified principal/password; parameters are passed with ``kerb_kwargs``
33
+ token: uses an existing Hadoop delegation token from another secured
34
+ service. Indeed, this client can also generate such tokens when
35
+ not insecure. Note that tokens expire, but can be renewed (by a
36
+ previously specified user) and may allow for proxying.
37
+ basic-auth: used when both parameter ``user`` and parameter ``password``
38
+ are provided.
39
+
40
+ """
41
+
42
+ tempdir = str(tempfile.gettempdir())
43
+ protocol = "webhdfs", "webHDFS"
44
+
45
    def __init__(
        self,
        host,
        port=50070,
        kerberos=False,
        token=None,
        user=None,
        password=None,
        proxy_to=None,
        kerb_kwargs=None,
        data_proxy=None,
        use_https=False,
        session_cert=None,
        session_verify=True,
        **kwargs,
    ):
        """
        Parameters
        ----------
        host: str
            Name-node address
        port: int
            Port for webHDFS
        kerberos: bool
            Whether to authenticate with kerberos for this connection
        token: str or None
            If given, use this token on every call to authenticate. A user
            and user-proxy may be encoded in the token and should not be also
            given
        user: str or None
            If given, assert the user name to connect with
        password: str or None
            If given, assert the password to use for basic auth. If password
            is provided, user must be provided also
        proxy_to: str or None
            If given, the user has the authority to proxy, and this value is
            the user in who's name actions are taken
        kerb_kwargs: dict
            Any extra arguments for HTTPKerberosAuth, see
            `<https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py>`_
        data_proxy: dict, callable or None
            If given, map data-node addresses. This can be necessary if the
            HDFS cluster is behind a proxy, running on Docker or otherwise has
            a mismatch between the host-names given by the name-node and the
            address by which to refer to them from the client. If a dict,
            maps host names ``host->data_proxy[host]``; if a callable, full
            URLs are passed, and function must conform to
            ``url->data_proxy(url)``.
        use_https: bool
            Whether to connect to the Name-node using HTTPS instead of HTTP
        session_cert: str or Tuple[str, str] or None
            Path to a certificate file, or tuple of (cert, key) files to use
            for the requests.Session
        session_verify: str, bool or None
            Path to a certificate file to use for verifying the requests.Session.
        kwargs
        """
        if self._cached:
            # instance reused from the caching layer; already connected
            return
        super().__init__(**kwargs)
        self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"
        self.kerb = kerberos
        self.kerb_kwargs = kerb_kwargs or {}
        self.pars = {}  # query parameters sent with every request
        self.proxy = data_proxy or {}
        if token is not None:
            if user is not None or proxy_to is not None:
                raise ValueError(
                    "If passing a delegation token, must not set "
                    "user or proxy_to, as these are encoded in the"
                    " token"
                )
            self.pars["delegation"] = token
        self.user = user
        self.password = password

        if password is not None:
            if user is None:
                raise ValueError(
                    "If passing a password, the user must also be"
                    "set in order to set up the basic-auth"
                )
        else:
            if user is not None:
                self.pars["user.name"] = user

        if proxy_to is not None:
            self.pars["doas"] = proxy_to
        if kerberos and user is not None:
            raise ValueError(
                "If using Kerberos auth, do not specify the "
                "user, this is handled by kinit."
            )

        self.session_cert = session_cert
        self.session_verify = session_verify

        self._connect()

        self._fsid = f"webhdfs_{tokenize(host, port)}"
145
+
146
    @property
    def fsid(self):
        """Stable identifier for this filesystem instance (host+port hash)."""
        return self._fsid
149
+
150
    def _connect(self):
        """Create the requests session and attach TLS and auth settings.

        Basic auth (user+password) takes precedence over kerberos when both
        are configured, since it is assigned last.
        """
        self.session = requests.Session()

        if self.session_cert:
            self.session.cert = self.session_cert

        self.session.verify = self.session_verify

        if self.kerb:
            from requests_kerberos import HTTPKerberosAuth

            self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)

        if self.user is not None and self.password is not None:
            from requests.auth import HTTPBasicAuth

            self.session.auth = HTTPBasicAuth(self.user, self.password)
167
+
168
    def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
        """Issue one WebHDFS REST request.

        ``op`` is the WebHDFS operation name (upper-cased into the query);
        error responses carrying a RemoteException are mapped onto Python
        exception types, anything else falls through to
        ``raise_for_status``.
        """
        path = self._strip_protocol(path) if path is not None else ""
        url = self._apply_proxy(self.url + quote(path, safe="/="))
        args = kwargs.copy()
        args.update(self.pars)
        args["op"] = op.upper()
        logger.debug("sending %s with %s", url, method)
        out = self.session.request(
            method=method.upper(),
            url=url,
            params=args,
            data=data,
            allow_redirects=redirect,
        )
        if out.status_code in [400, 401, 403, 404, 500]:
            try:
                err = out.json()
                msg = err["RemoteException"]["message"]
                exp = err["RemoteException"]["exception"]
            except (ValueError, KeyError):
                # body was not a RemoteException payload; defer to
                # raise_for_status below
                pass
            else:
                if exp in ["IllegalArgumentException", "UnsupportedOperationException"]:
                    raise ValueError(msg)
                elif exp in ["SecurityException", "AccessControlException"]:
                    raise PermissionError(msg)
                elif exp in ["FileNotFoundException"]:
                    raise FileNotFoundError(msg)
                else:
                    raise RuntimeError(msg)
        out.raise_for_status()
        return out
200
+
201
    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        replication=None,
        permissions=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        path: str
            File location
        mode: str
            'rb', 'wb', etc.
        block_size: int
            Client buffer size for read-ahead or write buffer
        autocommit: bool
            If False, writes to temporary file that only gets put in final
            location upon commit
        replication: int
            Number of copies of file on the cluster, write mode only
        permissions: str or int
            posix permissions, write mode only
        kwargs

        Returns
        -------
        WebHDFile instance
        """
        block_size = block_size or self.blocksize
        return WebHDFile(
            self,
            path,
            mode=mode,
            block_size=block_size,
            tempdir=self.tempdir,
            autocommit=autocommit,
            replication=replication,
            permissions=permissions,
        )
245
+
246
+ @staticmethod
247
+ def _process_info(info):
248
+ info["type"] = info["type"].lower()
249
+ info["size"] = info["length"]
250
+ return info
251
+
252
    @classmethod
    def _strip_protocol(cls, path):
        # Reduce e.g. "webhdfs://host:port/file" to the bare "/file" path
        return infer_storage_options(path)["path"]
255
+
256
+ @staticmethod
257
+ def _get_kwargs_from_urls(urlpath):
258
+ out = infer_storage_options(urlpath)
259
+ out.pop("path", None)
260
+ out.pop("protocol", None)
261
+ if "username" in out:
262
+ out["user"] = out.pop("username")
263
+ return out
264
+
265
+ def info(self, path):
266
+ out = self._call("GETFILESTATUS", path=path)
267
+ info = out.json()["FileStatus"]
268
+ info["name"] = path
269
+ return self._process_info(info)
270
+
271
+ def ls(self, path, detail=False):
272
+ out = self._call("LISTSTATUS", path=path)
273
+ infos = out.json()["FileStatuses"]["FileStatus"]
274
+ for info in infos:
275
+ self._process_info(info)
276
+ info["name"] = path.rstrip("/") + "/" + info["pathSuffix"]
277
+ if detail:
278
+ return sorted(infos, key=lambda i: i["name"])
279
+ else:
280
+ return sorted(info["name"] for info in infos)
281
+
282
    def content_summary(self, path):
        """Total numbers of files, directories and bytes under path

        Returns the server's ``ContentSummary`` dict as-is.
        """
        out = self._call("GETCONTENTSUMMARY", path=path)
        return out.json()["ContentSummary"]
286
+
287
+ def ukey(self, path):
288
+ """Checksum info of file, giving method and result"""
289
+ out = self._call("GETFILECHECKSUM", path=path, redirect=False)
290
+ if "Location" in out.headers:
291
+ location = self._apply_proxy(out.headers["Location"])
292
+ out2 = self.session.get(location)
293
+ out2.raise_for_status()
294
+ return out2.json()["FileChecksum"]
295
+ else:
296
+ out.raise_for_status()
297
+ return out.json()["FileChecksum"]
298
+
299
    def home_directory(self):
        """Get user's home directory

        Returns the server-reported ``Path`` string.
        """
        out = self._call("GETHOMEDIRECTORY")
        return out.json()["Path"]
303
+
304
+ def get_delegation_token(self, renewer=None):
305
+ """Retrieve token which can give the same authority to other uses
306
+
307
+ Parameters
308
+ ----------
309
+ renewer: str or None
310
+ User who may use this token; if None, will be current user
311
+ """
312
+ if renewer:
313
+ out = self._call("GETDELEGATIONTOKEN", renewer=renewer)
314
+ else:
315
+ out = self._call("GETDELEGATIONTOKEN")
316
+ t = out.json()["Token"]
317
+ if t is None:
318
+ raise ValueError("No token available for this user/security context")
319
+ return t["urlString"]
320
+
321
    def renew_delegation_token(self, token):
        """Make token live longer. Returns new expiry time"""
        # Server responds with {"long": <new expiry, epoch ms>}
        out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
        return out.json()["long"]
325
+
326
    def cancel_delegation_token(self, token):
        """Stop the token from being useful"""
        self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
329
+
330
    def chmod(self, path, mod):
        """Set the permission at path

        Parameters
        ----------
        path: str
            location to set (file or directory)
        mod: str or int
            posix representation of permission, give as oct string, e.g, '777'
            or 0o777
        """
        self._call("SETPERMISSION", method="put", path=path, permission=mod)
342
+
343
+ def chown(self, path, owner=None, group=None):
344
+ """Change owning user and/or group"""
345
+ kwargs = {}
346
+ if owner is not None:
347
+ kwargs["owner"] = owner
348
+ if group is not None:
349
+ kwargs["group"] = group
350
+ self._call("SETOWNER", method="put", path=path, **kwargs)
351
+
352
    def set_replication(self, path, replication):
        """
        Set file replication factor

        Parameters
        ----------
        path: str
            File location (not for directories)
        replication: int
            Number of copies of file on the cluster. Should be smaller than
            number of data nodes; normally 3 on most systems.
        """
        self._call("SETREPLICATION", path=path, method="put", replication=replication)
365
+
366
    def mkdir(self, path, **kwargs):
        # MKDIRS also creates missing parents; extra kwargs are accepted for
        # fsspec API compatibility but are not forwarded to the server
        self._call("MKDIRS", method="put", path=path)
368
+
369
    def makedirs(self, path, exist_ok=False):
        # NOTE(review): exists() followed by mkdir() is not atomic — a
        # concurrent creator can win between the two calls
        if exist_ok is False and self.exists(path):
            raise FileExistsError(path)
        self.mkdir(path)
373
+
374
    def mv(self, path1, path2, **kwargs):
        # Server-side RENAME; also used by WebHDFile.commit / cp_file for
        # atomic moves into the final location
        self._call("RENAME", method="put", path=path1, destination=path2)
376
+
377
+ def rm(self, path, recursive=False, **kwargs):
378
+ self._call(
379
+ "DELETE",
380
+ method="delete",
381
+ path=path,
382
+ recursive="true" if recursive else "false",
383
+ )
384
+
385
    def rm_file(self, path, **kwargs):
        # Single-file deletion; delegates to rm (non-recursive)
        self.rm(path)
387
+
388
    def cp_file(self, lpath, rpath, **kwargs):
        """Copy a file within the filesystem via a temporary sibling path.

        The content is streamed into ``.tmp.<random>`` next to the target
        and renamed into place, so readers never see a partially-written
        destination file.
        """
        with self.open(lpath) as lstream:
            tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"])
            # Perform an atomic copy (stream to a temporary file and
            # move it to the actual destination).
            try:
                with self.open(tmp_fname, "wb") as rstream:
                    shutil.copyfileobj(lstream, rstream)
                self.mv(tmp_fname, rpath)
            except BaseException:
                # Best-effort cleanup of the partial temp file, then re-raise
                with suppress(FileNotFoundError):
                    self.rm(tmp_fname)
                raise
401
+
402
+ def _apply_proxy(self, location):
403
+ if self.proxy and callable(self.proxy):
404
+ location = self.proxy(location)
405
+ elif self.proxy:
406
+ # as a dict
407
+ for k, v in self.proxy.items():
408
+ location = location.replace(k, v, 1)
409
+ return location
410
+
411
+
412
class WebHDFile(AbstractBufferedFile):
    """A file living in HDFS over webHDFS"""

    def __init__(self, fs, path, **kwargs):
        # fs: the owning WebHDFS filesystem; path: target file path.
        # Relevant kwargs: permissions, replication, tempdir, autocommit.
        super().__init__(fs, path, **kwargs)
        kwargs = kwargs.copy()
        # Drop explicit None values so defaults apply below
        if kwargs.get("permissions", None) is None:
            kwargs.pop("permissions", None)
        if kwargs.get("replication", None) is None:
            kwargs.pop("replication", None)
        self.permissions = kwargs.pop("permissions", 511)  # 511 == 0o777
        tempdir = kwargs.pop("tempdir")
        if kwargs.pop("autocommit", False) is False:
            # Deferred-commit mode: write to a temporary path and only move
            # it into place when commit() is called
            self.target = self.path
            self.path = os.path.join(tempdir, str(uuid.uuid4()))

    def _upload_chunk(self, final=False):
        """Write one part of a multi-block file upload

        Parameters
        ==========
        final: bool
            This is the last block, so should complete file, if
            self.autocommit is True.
        """
        # self.location is the data-node append URL set by _initiate_upload
        out = self.fs.session.post(
            self.location,
            data=self.buffer.getvalue(),
            headers={"content-type": "application/octet-stream"},
        )
        out.raise_for_status()
        return True

    def _initiate_upload(self):
        """Create remote file/upload"""
        kwargs = self.kwargs.copy()
        if "a" in self.mode:
            op, method = "APPEND", "POST"
        else:
            op, method = "CREATE", "PUT"
            kwargs["overwrite"] = "true"
        # With redirect=False, the name node replies with the data-node URL
        # in the Location header rather than forwarding us automatically
        out = self.fs._call(op, method, self.path, redirect=False, **kwargs)
        location = self.fs._apply_proxy(out.headers["Location"])
        if "w" in self.mode:
            # create empty file to append to
            out2 = self.fs.session.put(
                location, headers={"content-type": "application/octet-stream"}
            )
            out2.raise_for_status()
            # after creating empty file, change location to append to
            out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs)
            self.location = self.fs._apply_proxy(out2.headers["Location"])

    def _fetch_range(self, start, end):
        # Clamp the requested window to [0, file size)
        start = max(start, 0)
        end = min(self.size, end)
        if start >= end or start >= self.size:
            return b""
        out = self.fs._call(
            "OPEN", path=self.path, offset=start, length=end - start, redirect=False
        )
        out.raise_for_status()
        if "Location" in out.headers:
            # Redirected: fetch the actual bytes from the data node
            location = out.headers["Location"]
            out2 = self.fs.session.get(self.fs._apply_proxy(location))
            return out2.content
        else:
            return out.content

    def commit(self):
        # Move the temp file into its final location (autocommit=False mode)
        self.fs.mv(self.path, self.target)

    def discard(self):
        # Drop the uncommitted temp file
        self.fs.rm(self.path)
.venv/lib/python3.11/site-packages/fsspec/json.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from contextlib import suppress
3
+ from pathlib import PurePath
4
+ from typing import (
5
+ Any,
6
+ Callable,
7
+ ClassVar,
8
+ Dict,
9
+ List,
10
+ Mapping,
11
+ Optional,
12
+ Sequence,
13
+ Tuple,
14
+ )
15
+
16
+ from .registry import _import_class, get_filesystem_class
17
+ from .spec import AbstractFileSystem
18
+
19
+
20
class FilesystemJSONEncoder(json.JSONEncoder):
    """JSON encoder that understands filesystem instances and path objects."""

    # Whether serialized filesystems should embed credentials
    include_password: ClassVar[bool] = True

    def default(self, o: Any) -> Any:
        """Encode filesystems via ``to_dict()`` and paths as {cls, str}."""
        if isinstance(o, AbstractFileSystem):
            return o.to_dict(include_password=self.include_password)
        if isinstance(o, PurePath):
            path_type = type(o)
            qualname = f"{path_type.__module__}.{path_type.__name__}"
            return {"cls": qualname, "str": str(o)}

        return super().default(o)

    def make_serializable(self, obj: Any) -> Any:
        """
        Recursively converts an object so that it can be JSON serialized via
        :func:`json.dumps` and :func:`json.dump`, without actually calling
        said functions.
        """
        if isinstance(obj, (str, int, float, bool)):
            return obj
        if isinstance(obj, Mapping):
            return {key: self.make_serializable(val) for key, val in obj.items()}
        if isinstance(obj, Sequence):
            return [self.make_serializable(item) for item in obj]

        return self.default(obj)
46
+
47
+
48
class FilesystemJSONDecoder(json.JSONDecoder):
    """JSON decoder that re-instantiates filesystems and path objects.

    Any user-supplied ``object_hook`` is wrapped so that dicts written by
    :class:`FilesystemJSONEncoder` are resolved into live objects first.
    """

    def __init__(
        self,
        *,
        object_hook: Optional[Callable[[Dict[str, Any]], Any]] = None,
        parse_float: Optional[Callable[[str], Any]] = None,
        parse_int: Optional[Callable[[str], Any]] = None,
        parse_constant: Optional[Callable[[str], Any]] = None,
        strict: bool = True,
        object_pairs_hook: Optional[Callable[[List[Tuple[str, Any]]], Any]] = None,
    ) -> None:
        # Keep the caller's hook; custom_object_hook chains to it
        self.original_object_hook = object_hook

        super().__init__(
            object_hook=self.custom_object_hook,
            parse_float=parse_float,
            parse_int=parse_int,
            parse_constant=parse_constant,
            strict=strict,
            object_pairs_hook=object_pairs_hook,
        )

    @classmethod
    def try_resolve_path_cls(cls, dct: Dict[str, Any]):
        """Return the PurePath subclass named in ``dct["cls"]``, else None.

        Any failure (missing key, import error, wrong base class) is
        silently suppressed and yields None.
        """
        with suppress(Exception):
            fqp = dct["cls"]

            path_cls = _import_class(fqp)

            if issubclass(path_cls, PurePath):
                return path_cls

        return None

    @classmethod
    def try_resolve_fs_cls(cls, dct: Dict[str, Any]):
        """Return the filesystem class described by ``dct``, else None.

        If the recorded class path cannot be imported, fall back to the
        registry lookup by ``protocol``; any other error is suppressed by
        the outer ``suppress`` and yields None.
        """
        with suppress(Exception):
            if "cls" in dct:
                try:
                    fs_cls = _import_class(dct["cls"])
                    if issubclass(fs_cls, AbstractFileSystem):
                        return fs_cls
                except Exception:
                    if "protocol" in dct:  # Fallback if cls cannot be imported
                        return get_filesystem_class(dct["protocol"])

                    # No fallback possible: let the outer suppress absorb it
                    raise

        return None

    def custom_object_hook(self, dct: Dict[str, Any]):
        # Resolution order: filesystem dict, then path dict, then the
        # user-supplied hook, then the raw dict unchanged
        if "cls" in dct:
            if (obj_cls := self.try_resolve_fs_cls(dct)) is not None:
                return AbstractFileSystem.from_dict(dct)
            if (obj_cls := self.try_resolve_path_cls(dct)) is not None:
                return obj_cls(dct["str"])

        if self.original_object_hook is not None:
            return self.original_object_hook(dct)

        return dct

    def unmake_serializable(self, obj: Any) -> Any:
        """
        Inverse function of :meth:`FilesystemJSONEncoder.make_serializable`.
        """
        if isinstance(obj, dict):
            obj = self.custom_object_hook(obj)
        if isinstance(obj, dict):
            return {k: self.unmake_serializable(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [self.unmake_serializable(v) for v in obj]

        return obj
.venv/lib/python3.11/site-packages/fsspec/mapping.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import array
2
+ import logging
3
+ import posixpath
4
+ import warnings
5
+ from collections.abc import MutableMapping
6
+ from functools import cached_property
7
+
8
+ from fsspec.core import url_to_fs
9
+
10
+ logger = logging.getLogger("fsspec.mapping")
11
+
12
+
13
class FSMap(MutableMapping):
    """Wrap a FileSystem instance as a mutable wrapping.

    The keys of the mapping become files under the given root, and the
    values (which must be bytes) the contents of those files.

    Parameters
    ----------
    root: string
        prefix for all the files
    fs: FileSystem instance
    check: bool (=True)
        performs a touch at the location, to check for write access.

    Examples
    --------
    >>> fs = FileSystem(**parameters)  # doctest: +SKIP
    >>> d = FSMap('my-data/path/', fs)  # doctest: +SKIP
    or, more likely
    >>> d = fs.get_mapper('my-data/path/')

    >>> d['loc1'] = b'Hello World'  # doctest: +SKIP
    >>> list(d.keys())  # doctest: +SKIP
    ['loc1']
    >>> d['loc1']  # doctest: +SKIP
    b'Hello World'
    """

    def __init__(self, root, fs, check=False, create=False, missing_exceptions=None):
        self.fs = fs
        self.root = fs._strip_protocol(root)
        # Root plus trailing separator, as normalised by the backend itself
        # (join a dummy name, strip the protocol, then drop the dummy)
        self._root_key_to_str = fs._strip_protocol(posixpath.join(root, "x"))[:-1]
        if missing_exceptions is None:
            missing_exceptions = (
                FileNotFoundError,
                IsADirectoryError,
                NotADirectoryError,
            )
        self.missing_exceptions = missing_exceptions
        self.check = check
        self.create = create
        if create:
            if not self.fs.exists(root):
                self.fs.mkdir(root)
        if check:
            if not self.fs.exists(root):
                raise ValueError(
                    f"Path {root} does not exist. Create "
                    f" with the ``create=True`` keyword"
                )
            # probe write access with a throwaway file
            self.fs.touch(root + "/a")
            self.fs.rm(root + "/a")

    @cached_property
    def dirfs(self):
        """dirfs instance that can be used with the same keys as the mapper"""
        from .implementations.dirfs import DirFileSystem

        return DirFileSystem(path=self._root_key_to_str, fs=self.fs)

    def clear(self):
        """Remove all keys below root - empties out mapping"""
        logger.info("Clear mapping at %s", self.root)
        try:
            self.fs.rm(self.root, True)
            self.fs.mkdir(self.root)
        except Exception:
            # Best-effort: ignore backend failures, but (unlike the previous
            # bare ``except:``) let SystemExit/KeyboardInterrupt propagate
            pass

    def getitems(self, keys, on_error="raise"):
        """Fetch multiple items from the store

        If the backend is async-able, this might proceed concurrently

        Parameters
        ----------
        keys: list(str)
            They keys to be fetched
        on_error : "raise", "omit", "return"
            If raise, an underlying exception will be raised (converted to KeyError
            if the type is in self.missing_exceptions); if omit, keys with exception
            will simply not be included in the output; if "return", all keys are
            included in the output, but the value will be bytes or an exception
            instance.

        Returns
        -------
        dict(key, bytes|exception)
        """
        keys2 = [self._key_to_str(k) for k in keys]
        oe = on_error if on_error == "raise" else "return"
        try:
            out = self.fs.cat(keys2, on_error=oe)
            if isinstance(out, bytes):
                out = {keys2[0]: out}
        except self.missing_exceptions as e:
            # include the requested keys so the KeyError is actionable
            raise KeyError(keys) from e
        out = {
            k: (KeyError() if isinstance(v, self.missing_exceptions) else v)
            for k, v in out.items()
        }
        return {
            key: out[k2] if on_error == "raise" else out.get(k2, KeyError(k2))
            for key, k2 in zip(keys, keys2)
            if on_error == "return" or not isinstance(out[k2], BaseException)
        }

    def setitems(self, values_dict):
        """Set the values of multiple items in the store

        Parameters
        ----------
        values_dict: dict(str, bytes)
        """
        values = {self._key_to_str(k): maybe_convert(v) for k, v in values_dict.items()}
        self.fs.pipe(values)

    def delitems(self, keys):
        """Remove multiple keys from the store"""
        self.fs.rm([self._key_to_str(k) for k in keys])

    def _key_to_str(self, key):
        """Generate full path for the key"""
        if not isinstance(key, str):
            # raise TypeError("key must be of type `str`, got `{type(key).__name__}`"
            warnings.warn(
                "from fsspec 2023.5 onward FSMap non-str keys will raise TypeError",
                DeprecationWarning,
            )
            if isinstance(key, list):
                key = tuple(key)
            key = str(key)
        return f"{self._root_key_to_str}{key}".rstrip("/")

    def _str_to_key(self, s):
        """Strip path of to leave key name"""
        return s[len(self.root) :].lstrip("/")

    def __getitem__(self, key, default=None):
        """Retrieve data"""
        k = self._key_to_str(key)
        try:
            result = self.fs.cat(k)
        except self.missing_exceptions as exc:
            if default is not None:
                return default
            raise KeyError(key) from exc
        return result

    def pop(self, key, default=None):
        """Pop data"""
        result = self.__getitem__(key, default)
        try:
            del self[key]
        except KeyError:
            pass
        return result

    def __setitem__(self, key, value):
        """Store value in key"""
        key = self._key_to_str(key)
        self.fs.mkdirs(self.fs._parent(key), exist_ok=True)
        self.fs.pipe_file(key, maybe_convert(value))

    def __iter__(self):
        return (self._str_to_key(x) for x in self.fs.find(self.root))

    def __len__(self):
        return len(self.fs.find(self.root))

    def __delitem__(self, key):
        """Remove key"""
        try:
            self.fs.rm(self._key_to_str(key))
        except Exception as exc:
            # carry the offending key so the error is actionable
            raise KeyError(key) from exc

    def __contains__(self, key):
        """Does key exist in mapping?"""
        path = self._key_to_str(key)
        return self.fs.isfile(path)

    def __reduce__(self):
        # re-create without check/create side effects on unpickling
        return FSMap, (self.root, self.fs, False, False, self.missing_exceptions)
197
+
198
+
199
def maybe_convert(value):
    """Coerce array-like values to raw bytes; pass anything else through."""
    is_arraylike = isinstance(value, array.array) or hasattr(value, "__array__")
    if not is_arraylike:
        return value
    # bytes-like things
    if hasattr(value, "dtype") and value.dtype.kind in "Mm":
        # The buffer interface doesn't support datetime64/timedelta64
        # numpy arrays, so reinterpret as raw int64 first
        value = value.view("int64")
    return bytes(memoryview(value))
208
+
209
+
210
def get_mapper(
    url="",
    check=False,
    create=False,
    missing_exceptions=None,
    alternate_root=None,
    **kwargs,
):
    """Create key-value interface for given URL and options

    The URL will be of the form "protocol://location" and point to the root
    of the mapper required. All keys will be file-names below this location,
    and their values the contents of each key.

    Also accepts compound URLs like zip::s3://bucket/file.zip , see ``fsspec.open``.

    Parameters
    ----------
    url: str
        Root URL of mapping
    check: bool
        Whether to attempt to read from the location before instantiation, to
        check that the mapping does exist
    create: bool
        Whether to make the directory corresponding to the root before
        instantiating
    missing_exceptions: None or tuple
        If given, these exception types will be regarded as missing keys and
        return KeyError when trying to read data. By default, you get
        (FileNotFoundError, IsADirectoryError, NotADirectoryError)
    alternate_root: None or str
        In cases of complex URLs, the parser may fail to pick the correct part
        for the mapper root, so this arg can override

    Returns
    -------
    ``FSMap`` instance, the dict-like key-value store.
    """
    # Removing protocol here - could defer to each open() on the backend
    fs, urlpath = url_to_fs(url, **kwargs)
    root = alternate_root if alternate_root is not None else urlpath
    return FSMap(root, fs, check, create, missing_exceptions=missing_exceptions)
.venv/lib/python3.11/site-packages/fsspec/parquet.py ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import json
3
+ import warnings
4
+
5
+ from .core import url_to_fs
6
+ from .utils import merge_offset_ranges
7
+
8
+ # Parquet-Specific Utilities for fsspec
9
+ #
10
+ # Most of the functions defined in this module are NOT
11
+ # intended for public consumption. The only exception
12
+ # to this is `open_parquet_file`, which should be used
13
+ # place of `fs.open()` to open parquet-formatted files
14
+ # on remote file systems.
15
+
16
+
17
def open_parquet_file(
    path,
    mode="rb",
    fs=None,
    metadata=None,
    columns=None,
    row_groups=None,
    storage_options=None,
    strict=False,
    engine="auto",
    max_gap=64_000,
    max_block=256_000_000,
    footer_sample_size=1_000_000,
    **kwargs,
):
    """
    Return a file-like object for a single Parquet file.

    The specified parquet `engine` will be used to parse the
    footer metadata, and determine the required byte ranges
    from the file. The target path will then be opened with
    the "parts" (`KnownPartsOfAFile`) caching strategy.

    Note that this method is intended for usage with remote
    file systems, and is unlikely to improve parquet-read
    performance on local file systems.

    Parameters
    ----------
    path: str
        Target file path.
    mode: str, optional
        Mode option to be passed through to `fs.open`. Default is "rb".
    metadata: Any, optional
        Parquet metadata object. Object type must be supported
        by the backend parquet engine. For now, only the "fastparquet"
        engine supports an explicit `ParquetFile` metadata object.
        If a metadata object is supplied, the remote footer metadata
        will not need to be transferred into local memory.
    fs: AbstractFileSystem, optional
        Filesystem object to use for opening the file. If nothing is
        specified, an `AbstractFileSystem` object will be inferred.
    engine : str, default "auto"
        Parquet engine to use for metadata parsing. Allowed options
        include "fastparquet", "pyarrow", and "auto". The specified
        engine must be installed in the current environment. If
        "auto" is specified, and both engines are installed,
        "fastparquet" will take precedence over "pyarrow".
    columns: list, optional
        List of all column names that may be read from the file.
    row_groups : list, optional
        List of all row-groups that may be read from the file. This
        may be a list of row-group indices (integers), or it may be
        a list of `RowGroup` metadata objects (if the "fastparquet"
        engine is used).
    storage_options : dict, optional
        Used to generate an `AbstractFileSystem` object if `fs` was
        not specified.
    strict : bool, optional
        Whether the resulting `KnownPartsOfAFile` cache should
        fetch reads that go beyond a known byte-range boundary.
        If `False` (the default), any read that ends outside a
        known part will be zero padded. Note that using
        `strict=True` may be useful for debugging.
    max_gap : int, optional
        Neighboring byte ranges will only be merged when their
        inter-range gap is <= `max_gap`. Default is 64KB.
    max_block : int, optional
        Neighboring byte ranges will only be merged when the size of
        the aggregated range is <= `max_block`. Default is 256MB.
    footer_sample_size : int, optional
        Number of bytes to read from the end of the path to look
        for the footer metadata. If the sampled bytes do not contain
        the footer, a second read request will be required, and
        performance will suffer. Default is 1MB.
    **kwargs :
        Optional key-word arguments to pass to `fs.open`
    """

    # Make sure we have an `AbstractFileSystem` object
    # to work with
    if fs is None:
        fs = url_to_fs(path, **(storage_options or {}))[0]

    # For now, `columns == []` not supported. Just use
    # default `open` command with `path` input
    if columns is not None and len(columns) == 0:
        return fs.open(path, mode=mode)

    # Set the engine
    engine = _set_engine(engine)

    # Fetch the known byte ranges needed to read
    # `columns` and/or `row_groups`
    data = _get_parquet_byte_ranges(
        [path],
        fs,
        metadata=metadata,
        columns=columns,
        row_groups=row_groups,
        engine=engine,
        max_gap=max_gap,
        max_block=max_block,
        footer_sample_size=footer_sample_size,
    )

    # Extract file name from `data`
    fn = next(iter(data)) if data else path

    # Call self.open with "parts" caching
    # (pre-fetched byte ranges are handed to the cache as `data`)
    options = kwargs.pop("cache_options", {}).copy()
    return fs.open(
        fn,
        mode=mode,
        cache_type="parts",
        cache_options={
            **options,
            "data": data.get(fn, {}),
            "strict": strict,
        },
        **kwargs,
    )
139
+
140
+
141
def _get_parquet_byte_ranges(
    paths,
    fs,
    metadata=None,
    columns=None,
    row_groups=None,
    max_gap=64_000,
    max_block=256_000_000,
    footer_sample_size=1_000_000,
    engine="auto",
):
    """Get a dictionary of the known byte ranges needed
    to read a specific column/row-group selection from a
    Parquet dataset. Each value in the output dictionary
    is intended for use as the `data` argument for the
    `KnownPartsOfAFile` caching strategy of a single path.

    Returns
    -------
    dict
        ``{path: {(start, stop): bytes, ...}, ...}``
    """

    # Set engine if necessary
    if isinstance(engine, str):
        engine = _set_engine(engine)

    # Pass to specialized function if metadata is defined
    if metadata is not None:
        # Use the provided parquet metadata object
        # to avoid transferring/parsing footer metadata
        return _get_parquet_byte_ranges_from_metadata(
            metadata,
            fs,
            engine,
            columns=columns,
            row_groups=row_groups,
            max_gap=max_gap,
            max_block=max_block,
        )

    # Get file sizes asynchronously
    file_sizes = fs.sizes(paths)

    # Populate global paths, starts, & ends
    result = {}
    data_paths = []
    data_starts = []
    data_ends = []
    add_header_magic = True
    if columns is None and row_groups is None:
        # We are NOT selecting specific columns or row-groups.
        #
        # We can avoid sampling the footers, and just transfer
        # all file data with cat_ranges
        for i, path in enumerate(paths):
            result[path] = {}
            for b in range(0, file_sizes[i], max_block):
                data_paths.append(path)
                data_starts.append(b)
                data_ends.append(min(b + max_block, file_sizes[i]))
        add_header_magic = False  # "Magic" should already be included
    else:
        # We ARE selecting specific columns or row-groups.
        #
        # Gather file footers.
        # We just take the last `footer_sample_size` bytes of each
        # file (or the entire file if it is smaller than that)
        footer_starts = []
        footer_ends = []
        for i, path in enumerate(paths):
            footer_ends.append(file_sizes[i])
            sample_size = max(0, file_sizes[i] - footer_sample_size)
            footer_starts.append(sample_size)
        footer_samples = fs.cat_ranges(paths, footer_starts, footer_ends)

        # Check our footer samples and re-sample if necessary.
        missing_footer_starts = footer_starts.copy()
        large_footer = 0
        for i, path in enumerate(paths):
            # Parquet layout: the 4 bytes before the trailing "PAR1" magic
            # hold the footer length (little-endian)
            footer_size = int.from_bytes(footer_samples[i][-8:-4], "little")
            real_footer_start = file_sizes[i] - (footer_size + 8)
            if real_footer_start < footer_starts[i]:
                missing_footer_starts[i] = real_footer_start
                large_footer = max(large_footer, (footer_size + 8))
        if large_footer:
            warnings.warn(
                f"Not enough data was used to sample the parquet footer. "
                f"Try setting footer_sample_size >= {large_footer}."
            )
        # Fetch the missing head of any under-sampled footer and prepend it
        for i, block in enumerate(
            fs.cat_ranges(
                paths,
                missing_footer_starts,
                footer_starts,
            )
        ):
            footer_samples[i] = block + footer_samples[i]
            footer_starts[i] = missing_footer_starts[i]

        # Calculate required byte ranges for each path
        for i, path in enumerate(paths):
            # Deal with small-file case.
            # Just include all remaining bytes of the file
            # in a single range.
            if file_sizes[i] < max_block:
                if footer_starts[i] > 0:
                    # Only need to transfer the data if the
                    # footer sample isn't already the whole file
                    data_paths.append(path)
                    data_starts.append(0)
                    data_ends.append(footer_starts[i])
                continue

            # Use "engine" to collect data byte ranges
            path_data_starts, path_data_ends = engine._parquet_byte_ranges(
                columns,
                row_groups=row_groups,
                footer=footer_samples[i],
                footer_start=footer_starts[i],
            )

            data_paths += [path] * len(path_data_starts)
            data_starts += path_data_starts
            data_ends += path_data_ends

        # Merge adjacent offset ranges
        data_paths, data_starts, data_ends = merge_offset_ranges(
            data_paths,
            data_starts,
            data_ends,
            max_gap=max_gap,
            max_block=max_block,
            sort=False,  # Should already be sorted
        )

        # Start by populating `result` with footer samples
        for i, path in enumerate(paths):
            result[path] = {(footer_starts[i], footer_ends[i]): footer_samples[i]}

    # Transfer the data byte-ranges into local memory
    _transfer_ranges(fs, result, data_paths, data_starts, data_ends)

    # Add b"PAR1" to header if necessary
    if add_header_magic:
        _add_header_magic(result)

    return result
284
+
285
+
286
def _get_parquet_byte_ranges_from_metadata(
    metadata,
    fs,
    engine,
    columns=None,
    row_groups=None,
    max_gap=64_000,
    max_block=256_000_000,
):
    """Simplified version of `_get_parquet_byte_ranges` for
    the case that an engine-specific `metadata` object is
    provided, and the remote footer metadata does not need to
    be transferred before calculating the required byte ranges.
    """

    # The engine knows how to translate (columns, row_groups) plus the
    # pre-parsed metadata into per-file byte ranges.
    paths, starts, ends = engine._parquet_byte_ranges(
        columns,
        row_groups=row_groups,
        metadata=metadata,
    )

    # Coalesce nearby/overlapping ranges to reduce request count.
    paths, starts, ends = merge_offset_ranges(
        paths,
        starts,
        ends,
        max_gap=max_gap,
        max_block=max_block,
        sort=False,  # Should be sorted
    )

    # Fetch all ranges into local memory, grouped per file path.
    result = {path: {} for path in set(paths)}
    _transfer_ranges(fs, result, paths, starts, ends)

    # Make sure every file mapping covers the b"PAR1" header bytes.
    _add_header_magic(result)

    return result
326
+
327
+
328
+ def _transfer_ranges(fs, blocks, paths, starts, ends):
329
+ # Use cat_ranges to gather the data byte_ranges
330
+ ranges = (paths, starts, ends)
331
+ for path, start, stop, data in zip(*ranges, fs.cat_ranges(*ranges)):
332
+ blocks[path][(start, stop)] = data
333
+
334
+
335
+ def _add_header_magic(data):
336
+ # Add b"PAR1" to file headers
337
+ for path in list(data.keys()):
338
+ add_magic = True
339
+ for k in data[path]:
340
+ if k[0] == 0 and k[1] >= 4:
341
+ add_magic = False
342
+ break
343
+ if add_magic:
344
+ data[path][(0, 4)] = b"PAR1"
345
+
346
+
347
+ def _set_engine(engine_str):
348
+ # Define a list of parquet engines to try
349
+ if engine_str == "auto":
350
+ try_engines = ("fastparquet", "pyarrow")
351
+ elif not isinstance(engine_str, str):
352
+ raise ValueError(
353
+ "Failed to set parquet engine! "
354
+ "Please pass 'fastparquet', 'pyarrow', or 'auto'"
355
+ )
356
+ elif engine_str not in ("fastparquet", "pyarrow"):
357
+ raise ValueError(f"{engine_str} engine not supported by `fsspec.parquet`")
358
+ else:
359
+ try_engines = [engine_str]
360
+
361
+ # Try importing the engines in `try_engines`,
362
+ # and choose the first one that succeeds
363
+ for engine in try_engines:
364
+ try:
365
+ if engine == "fastparquet":
366
+ return FastparquetEngine()
367
+ elif engine == "pyarrow":
368
+ return PyarrowEngine()
369
+ except ImportError:
370
+ pass
371
+
372
+ # Raise an error if a supported parquet engine
373
+ # was not found
374
+ raise ImportError(
375
+ f"The following parquet engines are not installed "
376
+ f"in your python environment: {try_engines}."
377
+ f"Please install 'fastparquert' or 'pyarrow' to "
378
+ f"utilize the `fsspec.parquet` module."
379
+ )
380
+
381
+
382
class FastparquetEngine:
    # The purpose of the FastparquetEngine class is
    # to check if fastparquet can be imported (on initialization)
    # and to define a `_parquet_byte_ranges` method. In the
    # future, this class may also be used to define other
    # methods/logic that are specific to fastparquet.

    def __init__(self):
        # Import here (not at module level) so fastparquet stays an
        # optional dependency; the ImportError only surfaces when this
        # engine is actually selected.
        import fastparquet as fp

        self.fp = fp

    def _row_group_filename(self, row_group, pf):
        # Resolve which physical file holds `row_group` (fastparquet
        # supports multi-file datasets via the ParquetFile object).
        return pf.row_group_filename(row_group)

    def _parquet_byte_ranges(
        self,
        columns,
        row_groups=None,
        metadata=None,
        footer=None,
        footer_start=None,
    ):
        """Compute the byte ranges needed for the requested columns/row-groups.

        Either ``metadata`` (a pre-parsed fastparquet ``ParquetFile``) or the
        raw ``footer`` bytes (with ``footer_start``, the file offset where
        those bytes begin) must be provided. When ``metadata`` is given, the
        return value also includes the per-range file paths (the metadata may
        span multiple files); otherwise only (starts, ends) are returned.
        """
        # Initialize offset ranges and define ParquetFile metadata
        pf = metadata
        data_paths, data_starts, data_ends = [], [], []
        if pf is None:
            # Parse the sampled footer bytes into a ParquetFile object
            pf = self.fp.ParquetFile(io.BytesIO(footer))

        # Convert columns to a set and add any index columns
        # specified in the pandas metadata (just in case)
        column_set = None if columns is None else set(columns)
        if column_set is not None and hasattr(pf, "pandas_metadata"):
            md_index = [
                ind
                for ind in pf.pandas_metadata.get("index_columns", [])
                # Ignore RangeIndex information
                if not isinstance(ind, dict)
            ]
            column_set |= set(md_index)

        # Check if row_groups is a list of integers
        # or a list of row-group metadata
        if row_groups and not isinstance(row_groups[0], int):
            # Input row_groups contains row-group metadata
            row_group_indices = None
        else:
            # Input row_groups contains row-group indices
            row_group_indices = row_groups
            row_groups = pf.row_groups

        # Loop through column chunks to add required byte ranges
        for r, row_group in enumerate(row_groups):
            # Skip this row-group if we are targeting
            # specific row-groups
            if row_group_indices is None or r in row_group_indices:
                # Find the target parquet-file path for `row_group`
                fn = self._row_group_filename(row_group, pf)

                for column in row_group.columns:
                    name = column.meta_data.path_in_schema[0]
                    # Skip this column if we are targeting a
                    # specific columns
                    if column_set is None or name in column_set:
                        # A column chunk starts at its dictionary page when
                        # present, otherwise at its first data page.
                        file_offset0 = column.meta_data.dictionary_page_offset
                        if file_offset0 is None:
                            file_offset0 = column.meta_data.data_page_offset
                        num_bytes = column.meta_data.total_compressed_size
                        if footer_start is None or file_offset0 < footer_start:
                            data_paths.append(fn)
                            data_starts.append(file_offset0)
                            # Cap the range at footer_start (the footer bytes
                            # were already sampled); if footer_start is falsy
                            # the uncapped end is used.
                            data_ends.append(
                                min(
                                    file_offset0 + num_bytes,
                                    footer_start or (file_offset0 + num_bytes),
                                )
                            )

        if metadata:
            # The metadata in this call may map to multiple
            # file paths. Need to include `data_paths`
            return data_paths, data_starts, data_ends
        return data_starts, data_ends
465
+
466
+
467
class PyarrowEngine:
    # The purpose of the PyarrowEngine class is
    # to check if pyarrow can be imported (on initialization)
    # and to define a `_parquet_byte_ranges` method. In the
    # future, this class may also be used to define other
    # methods/logic that are specific to pyarrow.

    def __init__(self):
        # Deferred import keeps pyarrow an optional dependency; the
        # ImportError only surfaces when this engine is selected.
        import pyarrow.parquet as pq

        self.pq = pq

    def _row_group_filename(self, row_group, metadata):
        # Multi-file metadata objects are not supported by this engine.
        raise NotImplementedError

    def _parquet_byte_ranges(
        self,
        columns,
        row_groups=None,
        metadata=None,
        footer=None,
        footer_start=None,
    ):
        """Compute (starts, ends) byte ranges for the requested
        columns/row-groups from raw ``footer`` bytes.

        Unlike the fastparquet engine, a pre-parsed ``metadata`` object is
        rejected, and ``footer_start`` is required (it is compared/used
        unconditionally below).
        """
        if metadata is not None:
            raise ValueError("metadata input not supported for PyarrowEngine")

        data_starts, data_ends = [], []
        md = self.pq.ParquetFile(io.BytesIO(footer)).metadata

        # Convert columns to a set and add any index columns
        # specified in the pandas metadata (just in case)
        column_set = None if columns is None else set(columns)
        if column_set is not None:
            schema = md.schema.to_arrow_schema()
            has_pandas_metadata = (
                schema.metadata is not None and b"pandas" in schema.metadata
            )
            if has_pandas_metadata:
                md_index = [
                    ind
                    for ind in json.loads(
                        schema.metadata[b"pandas"].decode("utf8")
                    ).get("index_columns", [])
                    # Ignore RangeIndex information
                    if not isinstance(ind, dict)
                ]
                column_set |= set(md_index)

        # Loop through column chunks to add required byte ranges
        for r in range(md.num_row_groups):
            # Skip this row-group if we are targeting
            # specific row-groups
            if row_groups is None or r in row_groups:
                row_group = md.row_group(r)
                for c in range(row_group.num_columns):
                    column = row_group.column(c)
                    name = column.path_in_schema
                    # Skip this column if we are targeting a
                    # specific columns
                    # (match either the full dotted path or its root field,
                    # so nested columns are selected by their top-level name)
                    split_name = name.split(".")[0]
                    if (
                        column_set is None
                        or name in column_set
                        or split_name in column_set
                    ):
                        # A column chunk starts at its dictionary page when
                        # present, otherwise at its first data page.
                        file_offset0 = column.dictionary_page_offset
                        if file_offset0 is None:
                            file_offset0 = column.data_page_offset
                        num_bytes = column.total_compressed_size
                        # Cap ranges at footer_start: the footer bytes were
                        # already sampled by the caller.
                        if file_offset0 < footer_start:
                            data_starts.append(file_offset0)
                            data_ends.append(
                                min(file_offset0 + num_bytes, footer_start)
                            )
        return data_starts, data_ends
.venv/lib/python3.11/site-packages/fsspec/registry.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import types
5
+ import warnings
6
+
7
+ __all__ = ["registry", "get_filesystem_class", "default"]
8
+
9
+ # internal, mutable
10
+ _registry: dict[str, type] = {}
11
+
12
+ # external, immutable
13
+ registry = types.MappingProxyType(_registry)
14
+ default = "file"
15
+
16
+
17
def register_implementation(name, cls, clobber=False, errtxt=None):
    """Add implementation class to the registry

    Parameters
    ----------
    name: str
        Protocol name to associate with the class
    cls: class or str
        if a class: fsspec-compliant implementation class (normally inherits from
        ``fsspec.AbstractFileSystem``, gets added straight to the registry. If a
        str, the full path to an implementation class like package.module.class,
        which gets added to known_implementations,
        so the import is deferred until the filesystem is actually used.
    clobber: bool (optional)
        Whether to overwrite a protocol with the same name; if False, will raise
        instead.
    errtxt: str (optional)
        If given, then a failure to import the given class will result in this
        text being given.
    """
    if isinstance(cls, str):
        # Deferred registration: only the import path is recorded.
        if name in known_implementations and clobber is False:
            # Re-registering the identical path is a silent no-op.
            if cls != known_implementations[name]["class"]:
                raise ValueError(
                    f"Name ({name}) already in the known_implementations and clobber "
                    f"is False"
                )
            return
        known_implementations[name] = {
            "class": cls,
            "err": errtxt or f"{cls} import failed for protocol {name}",
        }
        return

    # Direct registration of an already-imported implementation class.
    if name in registry and clobber is False:
        # Re-registering the same class object is a silent no-op.
        if _registry[name] is not cls:
            raise ValueError(
                f"Name ({name}) already in the registry and clobber is False"
            )
        return
    _registry[name] = cls
58
+
59
+
60
# protocols mapped to the class which implements them. This dict can be
# updated with register_implementation. Entries with no "err" key use
# classes shipped with fsspec itself; the others need an extra package.
known_implementations = {
    "abfs": {
        "class": "adlfs.AzureBlobFileSystem",
        "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
    },
    "adl": {
        "class": "adlfs.AzureDatalakeFileSystem",
        "err": "Install adlfs to access Azure Datalake Gen1",
    },
    "arrow_hdfs": {
        "class": "fsspec.implementations.arrow.HadoopFileSystem",
        "err": "pyarrow and local java libraries required for HDFS",
    },
    "asynclocal": {
        "class": "morefs.asyn_local.AsyncLocalFileSystem",
        "err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem",
    },
    "az": {
        "class": "adlfs.AzureBlobFileSystem",
        "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
    },
    "blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"},
    "box": {
        "class": "boxfs.BoxFileSystem",
        "err": "Please install boxfs to access BoxFileSystem",
    },
    "cached": {"class": "fsspec.implementations.cached.CachingFileSystem"},
    "dask": {
        "class": "fsspec.implementations.dask.DaskWorkerFileSystem",
        "err": "Install dask distributed to access worker file system",
    },
    "data": {"class": "fsspec.implementations.data.DataFileSystem"},
    "dbfs": {
        "class": "fsspec.implementations.dbfs.DatabricksFileSystem",
        "err": "Install the requests package to use the DatabricksFileSystem",
    },
    "dir": {"class": "fsspec.implementations.dirfs.DirFileSystem"},
    "dropbox": {
        "class": "dropboxdrivefs.DropboxDriveFileSystem",
        "err": (
            'DropboxFileSystem requires "dropboxdrivefs","requests" and "'
            '"dropbox" to be installed'
        ),
    },
    "dvc": {
        "class": "dvc.api.DVCFileSystem",
        "err": "Install dvc to access DVCFileSystem",
    },
    "file": {"class": "fsspec.implementations.local.LocalFileSystem"},
    "filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"},
    "ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"},
    "gcs": {
        "class": "gcsfs.GCSFileSystem",
        "err": "Please install gcsfs to access Google Storage",
    },
    "gdrive": {
        "class": "gdrivefs.GoogleDriveFileSystem",
        "err": "Please install gdrivefs for access to Google Drive",
    },
    "generic": {"class": "fsspec.generic.GenericFileSystem"},
    "git": {
        "class": "fsspec.implementations.git.GitFileSystem",
        "err": "Install pygit2 to browse local git repos",
    },
    "github": {
        "class": "fsspec.implementations.github.GithubFileSystem",
        "err": "Install the requests package to use the github FS",
    },
    "gs": {
        "class": "gcsfs.GCSFileSystem",
        "err": "Please install gcsfs to access Google Storage",
    },
    "hdfs": {
        "class": "fsspec.implementations.arrow.HadoopFileSystem",
        "err": "pyarrow and local java libraries required for HDFS",
    },
    "hf": {
        "class": "huggingface_hub.HfFileSystem",
        "err": "Install huggingface_hub to access HfFileSystem",
    },
    "http": {
        "class": "fsspec.implementations.http.HTTPFileSystem",
        "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
    },
    "https": {
        "class": "fsspec.implementations.http.HTTPFileSystem",
        "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
    },
    "jlab": {
        "class": "fsspec.implementations.jupyter.JupyterFileSystem",
        "err": "Jupyter FS requires requests to be installed",
    },
    "jupyter": {
        "class": "fsspec.implementations.jupyter.JupyterFileSystem",
        "err": "Jupyter FS requires requests to be installed",
    },
    "lakefs": {
        "class": "lakefs_spec.LakeFSFileSystem",
        "err": "Please install lakefs-spec to access LakeFSFileSystem",
    },
    "libarchive": {
        "class": "fsspec.implementations.libarchive.LibArchiveFileSystem",
        "err": "LibArchive requires to be installed",
    },
    "local": {"class": "fsspec.implementations.local.LocalFileSystem"},
    "memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"},
    "oci": {
        "class": "ocifs.OCIFileSystem",
        "err": "Install ocifs to access OCI Object Storage",
    },
    "ocilake": {
        "class": "ocifs.OCIFileSystem",
        "err": "Install ocifs to access OCI Data Lake",
    },
    "oss": {
        "class": "ossfs.OSSFileSystem",
        "err": "Install ossfs to access Alibaba Object Storage System",
    },
    "reference": {"class": "fsspec.implementations.reference.ReferenceFileSystem"},
    "root": {
        "class": "fsspec_xrootd.XRootDFileSystem",
        "err": (
            "Install fsspec-xrootd to access xrootd storage system. "
            "Note: 'root' is the protocol name for xrootd storage systems, "
            "not referring to root directories"
        ),
    },
    "s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
    "s3a": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
    "sftp": {
        "class": "fsspec.implementations.sftp.SFTPFileSystem",
        "err": 'SFTPFileSystem requires "paramiko" to be installed',
    },
    "simplecache": {"class": "fsspec.implementations.cached.SimpleCacheFileSystem"},
    "smb": {
        "class": "fsspec.implementations.smb.SMBFileSystem",
        "err": 'SMB requires "smbprotocol" or "smbprotocol[kerberos]" installed',
    },
    "ssh": {
        "class": "fsspec.implementations.sftp.SFTPFileSystem",
        "err": 'SFTPFileSystem requires "paramiko" to be installed',
    },
    "tar": {"class": "fsspec.implementations.tar.TarFileSystem"},
    "tosfs": {
        "class": "tosfs.TosFileSystem",
        "err": "Install tosfs to access ByteDance volcano engine Tinder Object Storage",
    },
    "wandb": {"class": "wandbfs.WandbFS", "err": "Install wandbfs to access wandb"},
    "webdav": {
        "class": "webdav4.fsspec.WebdavFileSystem",
        "err": "Install webdav4 to access WebDAV",
    },
    "webhdfs": {
        "class": "fsspec.implementations.webhdfs.WebHDFS",
        "err": 'webHDFS access requires "requests" to be installed',
    },
    "zip": {"class": "fsspec.implementations.zip.ZipFileSystem"},
}

# Import-time guard: keep the table alphabetically sorted so entries are
# easy to scan and merge conflicts stay local.
assert list(known_implementations) == sorted(known_implementations), (
    "Not in alphabetical order"
)
224
+
225
+
226
def get_filesystem_class(protocol):
    """Fetch named protocol implementation from the registry

    The dict ``known_implementations`` maps protocol names to the locations
    of classes implementing the corresponding file-system. When used for the
    first time, appropriate imports will happen and the class will be placed in
    the registry. All subsequent calls will fetch directly from the registry.

    Some protocol implementations require additional dependencies, and so the
    import may fail. In this case, the string in the "err" field of the
    ``known_implementations`` will be given as the error message.
    """
    protocol = protocol or default

    if protocol not in registry:
        # First use of this protocol: resolve and import its class.
        if protocol not in known_implementations:
            raise ValueError(f"Protocol not known: {protocol}")
        entry = known_implementations[protocol]
        try:
            register_implementation(protocol, _import_class(entry["class"]))
        except ImportError as e:
            raise ImportError(entry["err"]) from e

    cls = registry[protocol]
    if getattr(cls, "protocol", None) in ("abstract", None):
        # Stamp the concrete protocol onto classes that don't declare one.
        cls.protocol = protocol
    return cls
254
+
255
+
256
# Warning text emitted by `_import_class` when a very old s3fs install is
# detected (see the version check there).
s3_msg = """Your installed version of s3fs is very old and known to cause
severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.
"""
262
+
263
+
264
+ def _import_class(fqp: str):
265
+ """Take a fully-qualified path and return the imported class or identifier.
266
+
267
+ ``fqp`` is of the form "package.module.klass" or
268
+ "package.module:subobject.klass".
269
+
270
+ Warnings
271
+ --------
272
+ This can import arbitrary modules. Make sure you haven't installed any modules
273
+ that may execute malicious code at import time.
274
+ """
275
+ if ":" in fqp:
276
+ mod, name = fqp.rsplit(":", 1)
277
+ else:
278
+ mod, name = fqp.rsplit(".", 1)
279
+
280
+ is_s3 = mod == "s3fs"
281
+ mod = importlib.import_module(mod)
282
+ if is_s3 and mod.__version__.split(".") < ["0", "5"]:
283
+ warnings.warn(s3_msg)
284
+ for part in name.split("."):
285
+ mod = getattr(mod, part)
286
+
287
+ if not isinstance(mod, type):
288
+ raise TypeError(f"{fqp} is not a class")
289
+
290
+ return mod
291
+
292
+
293
def filesystem(protocol, **storage_options):
    """Instantiate filesystems for given protocol and arguments

    ``storage_options`` are specific to the protocol being chosen, and are
    passed directly to the class.
    """
    if protocol == "arrow_hdfs":
        # Legacy alias kept working, but steer callers toward "hdfs".
        warnings.warn(
            "The 'arrow_hdfs' protocol has been deprecated and will be "
            "removed in the future. Specify it as 'hdfs'.",
            DeprecationWarning,
        )

    return get_filesystem_class(protocol)(**storage_options)
308
+
309
+
310
def available_protocols():
    """Return a list of the implemented protocols.

    Note that any given protocol may require extra packages to be importable.
    """
    return [*known_implementations]
.venv/lib/python3.11/site-packages/fsspec/spec.py ADDED
@@ -0,0 +1,2242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import json
5
+ import logging
6
+ import os
7
+ import threading
8
+ import warnings
9
+ import weakref
10
+ from errno import ESPIPE
11
+ from glob import has_magic
12
+ from hashlib import sha256
13
+ from typing import Any, ClassVar
14
+
15
+ from .callbacks import DEFAULT_CALLBACK
16
+ from .config import apply_config, conf
17
+ from .dircache import DirCache
18
+ from .transaction import Transaction
19
+ from .utils import (
20
+ _unstrip_protocol,
21
+ glob_translate,
22
+ isfilelike,
23
+ other_paths,
24
+ read_block,
25
+ stringify_path,
26
+ tokenize,
27
+ )
28
+
29
+ logger = logging.getLogger("fsspec")
30
+
31
+
32
def make_instance(cls, args, kwargs):
    """Construct ``cls(*args, **kwargs)``; module-level so instances can be
    pickled via ``__reduce__``."""
    instance = cls(*args, **kwargs)
    return instance
34
+
35
+
36
class _Cached(type):
    """
    Metaclass for caching file system instances.

    Notes
    -----
    Instances are cached according to

    * The values of the class attributes listed in `_extra_tokenize_attributes`
    * The arguments passed to ``__init__``.

    This creates an additional reference to the filesystem, which prevents the
    filesystem from being garbage collected when all *user* references go away.
    A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also*
    be made for a filesystem instance to be garbage collected.
    """

    def __init__(cls, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Note: we intentionally create a reference here, to avoid garbage
        # collecting instances when all other references are gone. To really
        # delete a FileSystem, the cache must be cleared.
        if conf.get("weakref_instance_cache"):  # pragma: no cover
            # debug option for analysing fork/spawn conditions
            cls._cache = weakref.WeakValueDictionary()
        else:
            cls._cache = {}
        # Remember which process created the cache; see the pid check below.
        cls._pid = os.getpid()

    def __call__(cls, *args, **kwargs):
        kwargs = apply_config(cls, kwargs)
        extra_tokens = tuple(
            getattr(cls, attr, None) for attr in cls._extra_tokenize_attributes
        )
        # The cache key includes pid and thread id, so instances are not
        # shared across processes or threads.
        token = tokenize(
            cls, cls._pid, threading.get_ident(), *args, *extra_tokens, **kwargs
        )
        # Popped here so it is not forwarded to __init__ (note: it is popped
        # after tokenize, so it does participate in the cache key).
        skip = kwargs.pop("skip_instance_cache", False)
        if os.getpid() != cls._pid:
            # We are in a child process (e.g. after fork): cached instances
            # belong to the parent, so drop them and rebind the pid.
            cls._cache.clear()
            cls._pid = os.getpid()
        if not skip and cls.cachable and token in cls._cache:
            # Cache hit: reuse the existing instance.
            cls._latest = token
            return cls._cache[token]
        else:
            obj = super().__call__(*args, **kwargs)
            # Setting _fs_token here causes some static linters to complain.
            obj._fs_token_ = token
            obj.storage_args = args
            obj.storage_options = kwargs
            if obj.async_impl and obj.mirror_sync_methods:
                from .asyn import mirror_sync_methods

                mirror_sync_methods(obj)

            if cls.cachable and not skip:
                cls._latest = token
                cls._cache[token] = obj
            return obj
95
+
96
+
97
+ class AbstractFileSystem(metaclass=_Cached):
98
+ """
99
+ An abstract super-class for pythonic file-systems
100
+
101
+ Implementations are expected to be compatible with or, better, subclass
102
+ from here.
103
+ """
104
+
105
+ cachable = True # this class can be cached, instances reused
106
+ _cached = False
107
+ blocksize = 2**22
108
+ sep = "/"
109
+ protocol: ClassVar[str | tuple[str, ...]] = "abstract"
110
+ _latest = None
111
+ async_impl = False
112
+ mirror_sync_methods = False
113
+ root_marker = "" # For some FSs, may require leading '/' or other character
114
+ transaction_type = Transaction
115
+
116
+ #: Extra *class attributes* that should be considered when hashing.
117
+ _extra_tokenize_attributes = ()
118
+
119
+ # Set by _Cached metaclass
120
+ storage_args: tuple[Any, ...]
121
+ storage_options: dict[str, Any]
122
+
123
+ def __init__(self, *args, **storage_options):
124
+ """Create and configure file-system instance
125
+
126
+ Instances may be cachable, so if similar enough arguments are seen
127
+ a new instance is not required. The token attribute exists to allow
128
+ implementations to cache instances if they wish.
129
+
130
+ A reasonable default should be provided if there are no arguments.
131
+
132
+ Subclasses should call this method.
133
+
134
+ Parameters
135
+ ----------
136
+ use_listings_cache, listings_expiry_time, max_paths:
137
+ passed to ``DirCache``, if the implementation supports
138
+ directory listing caching. Pass use_listings_cache=False
139
+ to disable such caching.
140
+ skip_instance_cache: bool
141
+ If this is a cachable implementation, pass True here to force
142
+ creating a new instance even if a matching instance exists, and prevent
143
+ storing this instance.
144
+ asynchronous: bool
145
+ loop: asyncio-compatible IOLoop or None
146
+ """
147
+ if self._cached:
148
+ # reusing instance, don't change
149
+ return
150
+ self._cached = True
151
+ self._intrans = False
152
+ self._transaction = None
153
+ self._invalidated_caches_in_transaction = []
154
+ self.dircache = DirCache(**storage_options)
155
+
156
+ if storage_options.pop("add_docs", None):
157
+ warnings.warn("add_docs is no longer supported.", FutureWarning)
158
+
159
+ if storage_options.pop("add_aliases", None):
160
+ warnings.warn("add_aliases has been removed.", FutureWarning)
161
+ # This is set in _Cached
162
+ self._fs_token_ = None
163
+
164
+ @property
165
+ def fsid(self):
166
+ """Persistent filesystem id that can be used to compare filesystems
167
+ across sessions.
168
+ """
169
+ raise NotImplementedError
170
+
171
+ @property
172
+ def _fs_token(self):
173
+ return self._fs_token_
174
+
175
+ def __dask_tokenize__(self):
176
+ return self._fs_token
177
+
178
+ def __hash__(self):
179
+ return int(self._fs_token, 16)
180
+
181
+ def __eq__(self, other):
182
+ return isinstance(other, type(self)) and self._fs_token == other._fs_token
183
+
184
+ def __reduce__(self):
185
+ return make_instance, (type(self), self.storage_args, self.storage_options)
186
+
187
@classmethod
def _strip_protocol(cls, path):
    """Turn path from fully-qualified to file-system-specific

    May require FS-specific handling, e.g., for relative paths or links.
    """
    # Lists are handled element-wise.
    if isinstance(path, list):
        return [cls._strip_protocol(p) for p in path]
    path = stringify_path(path)
    # cls.protocol may be a single string or a tuple of aliases.
    protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol
    for protocol in protos:
        # strip "proto://" (URL style) or "proto::" (chained-FS style)
        if path.startswith(protocol + "://"):
            path = path[len(protocol) + 3 :]
        elif path.startswith(protocol + "::"):
            path = path[len(protocol) + 2 :]
    # trailing slashes are never significant for the stripped form
    path = path.rstrip("/")
    # use of root_marker to make minimum required path, e.g., "/"
    return path or cls.root_marker
205
+
206
def unstrip_protocol(self, name: str) -> str:
    """Format FS-specific path to generic, including protocol."""
    if isinstance(self.protocol, str):
        protos = (self.protocol,)
    else:
        protos = self.protocol
    # already qualified with any known alias -> leave untouched
    if name.startswith(tuple(f"{p}://" for p in protos)):
        return name
    # otherwise qualify with the primary protocol name
    return f"{protos[0]}://{name}"
213
+
214
+ @staticmethod
215
+ def _get_kwargs_from_urls(path):
216
+ """If kwargs can be encoded in the paths, extract them here
217
+
218
+ This should happen before instantiation of the class; incoming paths
219
+ then should be amended to strip the options in methods.
220
+
221
+ Examples may look like an sftp path "sftp://user@host:/my/path", where
222
+ the user and host should become kwargs and later get stripped.
223
+ """
224
+ # by default, nothing happens
225
+ return {}
226
+
227
@classmethod
def current(cls):
    """Return the most recently instantiated FileSystem

    If no instance has been created, then create one with defaults
    """
    try:
        return cls._cache[cls._latest]
    except KeyError:
        return cls()
236
+
237
@property
def transaction(self):
    """A context within which files are committed together upon exit

    Requires the file class to implement `.commit()` and `.discard()`
    for the normal and exception cases.
    """
    # Created lazily on first access and then reused.
    current = self._transaction
    if current is None:
        current = self.transaction_type(self)
        self._transaction = current
    return current
247
+
248
def start_transaction(self):
    """Begin write transaction for deferring files, non-context version"""
    # Install a fresh transaction object and flag that we are inside one.
    self._transaction = self.transaction_type(self)
    self._intrans = True
    return self.transaction
253
+
254
def end_transaction(self):
    """Finish write transaction, non-context version"""
    self.transaction.complete()
    self._transaction = None
    # Cache invalidations deferred during the transaction are applied
    # only once it has fully completed.
    pending = self._invalidated_caches_in_transaction
    for path in pending:
        self.invalidate_cache(path)
    pending.clear()
262
+
263
def invalidate_cache(self, path=None):
    """
    Discard any cached directory information

    Parameters
    ----------
    path: string or None
        If None, clear all listings cached else listings at or under given
        path.
    """
    # The base class holds no listing cache; subclasses that do should
    # call super().invalidate_cache() so that invalidations issued inside
    # a transaction are replayed when it completes (see end_transaction,
    # and the FTPFileSystem implementation in ftp.py).
    if self._intrans:
        self._invalidated_caches_in_transaction.append(path)
279
+
280
def mkdir(self, path, create_parents=True, **kwargs):
    """
    Create directory entry at path

    For systems that don't have true directories, may create an entry for
    this instance only and not touch the real filesystem

    Parameters
    ----------
    path: str
        location
    create_parents: bool
        if True, this is equivalent to ``makedirs``
    kwargs:
        may be permissions, etc.
    """
    # Intentionally a no-op: backends without real directories need not
    # implement this.
    pass
297
+
298
def makedirs(self, path, exist_ok=False):
    """Recursively make directories

    Creates directory at path and any intervening required directories.
    Raises exception if, for instance, the path already exists but is a
    file.

    Parameters
    ----------
    path: str
        leaf directory name
    exist_ok: bool (False)
        If False, will error if the target already exists
    """
    # Intentionally a no-op: backends without real directories need not
    # implement this.
    pass
313
+
314
def rmdir(self, path):
    """Remove a directory, if empty"""
    # Intentionally a no-op: backends without real directories need not
    # implement this.
    pass
317
+
318
def ls(self, path, detail=True, **kwargs):
    """List objects at path.

    This should include subdirectories and files at that location. The
    difference between a file and a directory must be clear when details
    are requested.

    The specific keys, or perhaps a FileInfo class, or similar, is TBD,
    but must be consistent across implementations.
    Must include:

    - full path to the entry (without protocol)
    - size of the entry, in bytes. If the value cannot be determined, will
      be ``None``.
    - type of entry, "file", "directory" or other

    Additional information
    may be present, appropriate to the file-system, e.g., generation,
    checksum, etc.

    May use refresh=True|False to allow use of self._ls_from_cache to
    check for a saved listing and avoid calling the backend. This would be
    common where listing may be expensive.

    Parameters
    ----------
    path: str
    detail: bool
        if True, gives a list of dictionaries, where each is the same as
        the result of ``info(path)``. If False, gives a list of paths
        (str).
    kwargs: may have additional backend-specific options, such as version
        information

    Returns
    -------
    List of strings if detail is False, or list of directory information
    dicts if detail is True.
    """
    # Abstract: every concrete filesystem must provide a listing.
    raise NotImplementedError
358
+
359
+ def _ls_from_cache(self, path):
360
+ """Check cache for listing
361
+
362
+ Returns listing, if found (may be empty list for a directly that exists
363
+ but contains nothing), None if not in cache.
364
+ """
365
+ parent = self._parent(path)
366
+ try:
367
+ return self.dircache[path.rstrip("/")]
368
+ except KeyError:
369
+ pass
370
+ try:
371
+ files = [
372
+ f
373
+ for f in self.dircache[parent]
374
+ if f["name"] == path
375
+ or (f["name"] == path.rstrip("/") and f["type"] == "directory")
376
+ ]
377
+ if len(files) == 0:
378
+ # parent dir was listed but did not contain this file
379
+ raise FileNotFoundError(path)
380
+ return files
381
+ except KeyError:
382
+ pass
383
+
384
def walk(self, path, maxdepth=None, topdown=True, on_error="omit", **kwargs):
    """Return all files under the given path.

    List all files, recursing into subdirectories; output is iterator-style,
    like ``os.walk()``. For a simple list of files, ``find()`` is available.

    When topdown is True, the caller can modify the dirnames list in-place (perhaps
    using del or slice assignment), and walk() will
    only recurse into the subdirectories whose names remain in dirnames;
    this can be used to prune the search, impose a specific order of visiting,
    or even to inform walk() about directories the caller creates or renames before
    it resumes walk() again.
    Modifying dirnames when topdown is False has no effect. (see os.walk)

    Note that the "files" outputted will include anything that is not
    a directory, such as links.

    Parameters
    ----------
    path: str
        Root to recurse into
    maxdepth: int
        Maximum recursion depth. None means limitless, but not recommended
        on link-based file-systems.
    topdown: bool (True)
        Whether to walk the directory tree from the top downwards or from
        the bottom upwards.
    on_error: "omit", "raise", a callable
        if omit (default), path with exception will simply be empty;
        If raise, an underlying exception will be raised;
        if callable, it will be called with a single OSError instance as argument
    kwargs: passed to ``ls``
    """
    if maxdepth is not None and maxdepth < 1:
        raise ValueError("maxdepth must be at least 1")

    path = self._strip_protocol(path)
    full_dirs = {}  # basename -> full path, used for recursion targets
    dirs = {}  # basename -> info for directories at this level
    files = {}  # basename -> info for non-directories at this level

    detail = kwargs.pop("detail", False)
    try:
        listing = self.ls(path, detail=True, **kwargs)
    except (FileNotFoundError, OSError) as e:
        if on_error == "raise":
            raise
        if callable(on_error):
            on_error(e)
        # "omit": unreadable path simply yields nothing
        return

    for info in listing:
        # each info name must be at least [path]/part , but here
        # we check also for names like [path]/part/
        pathname = info["name"].rstrip("/")
        name = pathname.rsplit("/", 1)[-1]
        if info["type"] == "directory" and pathname != path:
            # do not include "self" path
            full_dirs[name] = pathname
            dirs[name] = info
        elif pathname == path:
            # file-like with same name as give path
            files[""] = info
        else:
            files[name] = info

    if not detail:
        # callers get (mutable) lists of names; for topdown walks the dirs
        # list may be pruned in place to limit recursion (os.walk contract)
        dirs = list(dirs)
        files = list(files)

    if topdown:
        # Yield before recursion if walking top down
        yield path, dirs, files

    if maxdepth is not None:
        maxdepth -= 1
        if maxdepth < 1:
            if not topdown:
                yield path, dirs, files
            return

    for d in dirs:
        yield from self.walk(
            full_dirs[d],
            maxdepth=maxdepth,
            detail=detail,
            topdown=topdown,
            **kwargs,
        )

    if not topdown:
        # Yield after recursion if walking bottom up
        yield path, dirs, files
477
+
478
def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
    """List all files below path.

    Like posix ``find`` command without conditions

    Parameters
    ----------
    path : str
    maxdepth: int or None
        If not None, the maximum number of levels to descend
    withdirs: bool
        Whether to include directory paths in the output. This is True
        when used by glob, but users usually only want files.
    detail: bool
        If True, return a ``{name: info}`` dict instead of a sorted list
        of names.
    kwargs are passed to ``ls``.
    """
    # TODO: allow equivalent of -name parameter
    path = self._strip_protocol(path)
    out = {}

    # Add the root directory if withdirs is requested
    # This is needed for posix glob compliance
    if withdirs and path != "" and self.isdir(path):
        out[path] = self.info(path)

    for _, dirs, files in self.walk(path, maxdepth, detail=True, **kwargs):
        if withdirs:
            files.update(dirs)
        out.update({info["name"]: info for name, info in files.items()})
    if not out and self.isfile(path):
        # walk works on directories, but find should also return [path]
        # when path happens to be a file
        out[path] = {}
    names = sorted(out)
    if not detail:
        return names
    else:
        return {name: out[name] for name in names}
515
+
516
def du(self, path, total=True, maxdepth=None, withdirs=False, **kwargs):
    """Space used by files and optionally directories within a path

    Directory size does not include the size of its contents.

    Parameters
    ----------
    path: str
    total: bool
        Whether to sum all the file sizes
    maxdepth: int or None
        Maximum number of directory levels to descend, None for unlimited.
    withdirs: bool
        Whether to include directory paths in the output.
    kwargs: passed to ``find``

    Returns
    -------
    Dict of {path: size} if total=False, or int otherwise, where numbers
    refer to bytes used.
    """
    sizes = {}
    if withdirs and self.isdir(path):
        # Include top-level directory in output
        top = self.info(path)
        sizes[top["name"]] = top["size"]
    for fname in self.find(path, maxdepth=maxdepth, withdirs=withdirs, **kwargs):
        entry = self.info(fname)
        sizes[entry["name"]] = entry["size"]
    return sum(sizes.values()) if total else sizes
549
+
550
def glob(self, path, maxdepth=None, **kwargs):
    """
    Find files by glob-matching.

    If the path ends with '/', only folders are returned.

    We support ``"**"``,
    ``"?"`` and ``"[..]"``. We do not support ^ for pattern negation.

    The `maxdepth` option is applied on the first `**` found in the path.

    kwargs are passed to ``ls``.
    """
    if maxdepth is not None and maxdepth < 1:
        raise ValueError("maxdepth must be at least 1")

    import re

    seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
    ends_with_sep = path.endswith(seps)  # _strip_protocol strips trailing slash
    path = self._strip_protocol(path)
    # A pattern ending in "/" or "/**" should only match directories; we
    # emulate that by matching dirnames with a trailing slash appended.
    append_slash_to_dirname = ends_with_sep or path.endswith(
        tuple(sep + "**" for sep in seps)
    )
    # Position of the first glob metacharacter (len(path) if absent).
    idx_star = path.find("*") if path.find("*") >= 0 else len(path)
    idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
    idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

    min_idx = min(idx_star, idx_qmark, idx_brace)

    detail = kwargs.pop("detail", False)

    if not has_magic(path):
        # Literal path: existence check only.
        if self.exists(path, **kwargs):
            if not detail:
                return [path]
            else:
                return {path: self.info(path, **kwargs)}
        else:
            if not detail:
                return []  # glob of non-existent returns empty
            else:
                return {}
    elif "/" in path[:min_idx]:
        # Walk from the deepest literal directory before the first wildcard.
        min_idx = path[:min_idx].rindex("/")
        root = path[: min_idx + 1]
        depth = path[min_idx + 1 :].count("/") + 1
    else:
        root = ""
        depth = path[min_idx + 1 :].count("/") + 1

    if "**" in path:
        if maxdepth is not None:
            # maxdepth applies from the first "**"; rescale the find depth
            idx_double_stars = path.find("**")
            depth_double_stars = path[idx_double_stars:].count("/") + 1
            depth = depth - depth_double_stars + maxdepth
        else:
            depth = None

    allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)

    pattern = glob_translate(path + ("/" if ends_with_sep else ""))
    pattern = re.compile(pattern)

    out = {
        p: info
        for p, info in sorted(allpaths.items())
        if pattern.match(
            p + "/"
            if append_slash_to_dirname and info["type"] == "directory"
            else p
        )
    }

    if detail:
        return out
    else:
        return list(out)
628
+
629
def exists(self, path, **kwargs):
    """Is there a file at the given path"""
    try:
        self.info(path, **kwargs)
    except:  # noqa: E722
        # We cannot know every exception type a backend's info() may raise,
        # so any failure is treated as "does not exist".
        return False
    return True
637
+
638
def lexists(self, path, **kwargs):
    """If there is a file at the given path (including
    broken links)

    Fix: ``**kwargs`` were previously accepted but silently discarded;
    they are now forwarded to ``exists`` (and on to ``info``), matching
    the behavior of the other query methods.
    """
    return self.exists(path, **kwargs)
642
+
643
def info(self, path, **kwargs):
    """Give details of entry at path

    Returns a single dictionary, with exactly the same information as ``ls``
    would with ``detail=True``.

    The default implementation calls ls and could be overridden by a
    shortcut. kwargs are passed on to ```ls()``.

    Some file systems might not be able to measure the file's size, in
    which case, the returned dict will include ``'size': None``.

    Returns
    -------
    dict with keys: name (full path in the FS), size (in bytes), type (file,
    directory, or something else) and other FS-specific keys.
    """
    path = self._strip_protocol(path)
    # First look for the entry in a listing of its parent directory.
    out = self.ls(self._parent(path), detail=True, **kwargs)
    out = [o for o in out if o["name"].rstrip("/") == path]
    if out:
        return out[0]
    # Fall back to listing the path itself.
    out = self.ls(path, detail=True, **kwargs)
    path = path.rstrip("/")
    out1 = [o for o in out if o["name"].rstrip("/") == path]
    if len(out1) == 1:
        if "size" not in out1[0]:
            out1[0]["size"] = None
        return out1[0]
    elif len(out1) > 1 or out:
        # Non-empty listing that is not just the path itself: treat the
        # path as a directory.
        return {"name": path, "size": 0, "type": "directory"}
    else:
        raise FileNotFoundError(path)
676
+
677
def checksum(self, path):
    """Unique value for current version of file

    If the checksum is the same from one moment to another, the contents
    are guaranteed to be the same. If the checksum changes, the contents
    *might* have changed.

    This should normally be overridden; default will probably capture
    creation/modification timestamp (which would be good) or maybe
    access timestamp (which would be bad)
    """
    entry = self.info(path)
    return int(tokenize(entry), 16)
689
+
690
def size(self, path):
    """Size in bytes of file"""
    # info() may omit "size"; report None in that case
    return self.info(path).get("size")
693
+
694
def sizes(self, paths):
    """Size in bytes of each file in a list of paths"""
    return list(map(self.size, paths))
697
+
698
def isdir(self, path):
    """Is this entry directory-like?"""
    try:
        entry = self.info(path)
    except OSError:
        # missing / inaccessible entries are simply not directories
        return False
    return entry["type"] == "directory"
704
+
705
def isfile(self, path):
    """Is this entry file-like?"""
    # Both the lookup and the "type" access stay inside the guard, so a
    # malformed info dict also yields False (as in other query helpers).
    try:
        result = self.info(path)["type"] == "file"
    except:  # noqa: E722
        result = False
    return result
711
+
712
def read_text(self, path, encoding=None, errors=None, newline=None, **kwargs):
    """Get the contents of the file as a string.

    Parameters
    ----------
    path: str
        URL of file on this filesystems
    encoding, errors, newline: same as `open`.
    """
    text_opts = dict(encoding=encoding, errors=errors, newline=newline)
    with self.open(path, mode="r", **text_opts, **kwargs) as f:
        return f.read()
730
+
731
def write_text(
    self, path, value, encoding=None, errors=None, newline=None, **kwargs
):
    """Write the text to the given file.

    An existing file will be overwritten.

    Parameters
    ----------
    path: str
        URL of file on this filesystems
    value: str
        Text to write.
    encoding, errors, newline: same as `open`.
    """
    text_opts = dict(encoding=encoding, errors=errors, newline=newline)
    with self.open(path, mode="w", **text_opts, **kwargs) as f:
        return f.write(value)
755
+
756
def cat_file(self, path, start=None, end=None, **kwargs):
    """Get the content of a file

    Parameters
    ----------
    path: URL of file on this filesystems
    start, end: int
        Bytes limits of the read. If negative, backwards from end,
        like usual python slices. Either can be None for start or
        end of file, respectively
    kwargs: passed to ``open()``.
    """
    # explicitly set buffering off?
    with self.open(path, "rb", **kwargs) as f:
        if start is not None:
            # a negative start counts backwards from the end of the file,
            # clamped so we never seek before position 0
            f.seek(start if start >= 0 else max(0, f.size + start))
        if end is None:
            return f.read()
        if end < 0:
            end = f.size + end
        return f.read(end - f.tell())
780
+
781
def pipe_file(self, path, value, mode="overwrite", **kwargs):
    """Set the bytes of given file"""
    if mode == "create":
        # non-atomic but simple way; or could use "xb" in open(), which is
        # likely not as well supported across backends
        if self.exists(path):
            raise FileExistsError
    with self.open(path, "wb", **kwargs) as f:
        f.write(value)
789
+
790
def pipe(self, path, value=None, **kwargs):
    """Put value into path

    (counterpart to ``cat``)

    Parameters
    ----------
    path: string or dict(str, bytes)
        If a string, a single remote location to put ``value`` bytes; if a dict,
        a mapping of {path: bytesvalue}.
    value: bytes, optional
        If using a single path, these are the bytes to put there. Ignored if
        ``path`` is a dict
    """
    if isinstance(path, dict):
        for target, data in path.items():
            self.pipe_file(self._strip_protocol(target), data, **kwargs)
    elif isinstance(path, str):
        self.pipe_file(self._strip_protocol(path), value, **kwargs)
    else:
        raise ValueError("path must be str or dict")
811
+
812
def cat_ranges(
    self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
):
    """Get the contents of byte ranges from one or more files

    Parameters
    ----------
    paths: list
        A list of filepaths on this filesystems
    starts, ends: int or list
        Bytes limits of the read. If using a single int, the same value will be
        used to read all the specified files.
    """
    if max_gap is not None:
        raise NotImplementedError
    if not isinstance(paths, list):
        raise TypeError
    n = len(paths)
    # scalar limits are broadcast across all paths
    if not isinstance(starts, list):
        starts = [starts] * n
    if not isinstance(ends, list):
        ends = [ends] * n
    if len(starts) != n or len(ends) != n:
        raise ValueError
    result = []
    for p, s, e in zip(paths, starts, ends):
        try:
            result.append(self.cat_file(p, s, e))
        except Exception as exc:
            if on_error != "return":
                raise
            # "return": embed the exception instance in the output
            result.append(exc)
    return result
845
+
846
def cat(self, path, recursive=False, on_error="raise", **kwargs):
    """Fetch (potentially multiple) paths' contents

    Parameters
    ----------
    recursive: bool
        If True, assume the path(s) are directories, and get all the
        contained files
    on_error : "raise", "omit", "return"
        If raise, an underlying exception will be raised (converted to KeyError
        if the type is in self.missing_exceptions); if omit, keys with exception
        will simply not be included in the output; if "return", all keys are
        included in the output, but the value will be bytes or an exception
        instance.
    kwargs: passed to cat_file

    Returns
    -------
    dict of {path: contents} if there are multiple paths
    or the path has been otherwise expanded
    """
    paths = self.expand_path(path, recursive=recursive)
    if (
        len(paths) > 1
        or isinstance(path, list)
        or paths[0] != self._strip_protocol(path)
    ):
        # multiple targets (or an expanded single target): return a mapping
        out = {}
        for path in paths:
            try:
                out[path] = self.cat_file(path, **kwargs)
            except Exception as e:
                if on_error == "raise":
                    raise
                if on_error == "return":
                    out[path] = e
                # "omit": key simply left out of the result
        return out
    else:
        # exactly one literal path: return the bytes directly
        return self.cat_file(paths[0], **kwargs)
885
+
886
def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, outfile=None, **kwargs):
    """Copy single remote file to local"""
    from .implementations.local import LocalFileSystem

    if isfilelike(lpath):
        # caller supplied an open file-like object rather than a path
        outfile = lpath
    elif self.isdir(rpath):
        # remote directory: just mirror it locally, nothing to download
        os.makedirs(lpath, exist_ok=True)
        return None

    fs = LocalFileSystem(auto_mkdir=True)
    fs.makedirs(fs._parent(lpath), exist_ok=True)

    with self.open(rpath, "rb", **kwargs) as f1:
        if outfile is None:
            outfile = open(lpath, "wb")

        try:
            callback.set_size(getattr(f1, "size", None))
            # chunked copy loop; stops on the first empty read
            data = True
            while data:
                data = f1.read(self.blocksize)
                segment_len = outfile.write(data)
                if segment_len is None:
                    # some file-likes return None from write()
                    segment_len = len(data)
                callback.relative_update(segment_len)
        finally:
            # only close handles we opened ourselves
            if not isfilelike(lpath):
                outfile.close()
915
+
916
def get(
    self,
    rpath,
    lpath,
    recursive=False,
    callback=DEFAULT_CALLBACK,
    maxdepth=None,
    **kwargs,
):
    """Copy file(s) to local.

    Copies a specific file or tree of files (if recursive=True). If lpath
    ends with a "/", it will be assumed to be a directory, and target files
    will go within. Can submit a list of paths, which may be glob-patterns
    and will be expanded.

    Calls get_file for each source.
    """
    if isinstance(lpath, list) and isinstance(rpath, list):
        # No need to expand paths when both source and destination
        # are provided as lists
        rpaths = rpath
        lpaths = lpath
    else:
        from .implementations.local import (
            LocalFileSystem,
            make_path_posix,
            trailing_sep,
        )

        source_is_str = isinstance(rpath, str)
        rpaths = self.expand_path(rpath, recursive=recursive, maxdepth=maxdepth)
        if source_is_str and (not recursive or maxdepth is not None):
            # Non-recursive glob does not copy directories
            rpaths = [p for p in rpaths if not (trailing_sep(p) or self.isdir(p))]
            if not rpaths:
                return

        if isinstance(lpath, str):
            lpath = make_path_posix(lpath)

        source_is_file = len(rpaths) == 1
        dest_is_dir = isinstance(lpath, str) and (
            trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
        )

        # whether destination names should be treated as already existing
        # directories when pairing sources to targets
        exists = source_is_str and (
            (has_magic(rpath) and source_is_file)
            or (not has_magic(rpath) and dest_is_dir and not trailing_sep(rpath))
        )
        lpaths = other_paths(
            rpaths,
            lpath,
            exists=exists,
            flatten=not source_is_str,
        )

    callback.set_size(len(lpaths))
    for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
        # per-file progress is reported through a branched child callback
        with callback.branched(rpath, lpath) as child:
            self.get_file(rpath, lpath, callback=child, **kwargs)
977
+
978
def put_file(
    self, lpath, rpath, callback=DEFAULT_CALLBACK, mode="overwrite", **kwargs
):
    """Copy single file to remote"""
    # "create" refuses to clobber an existing remote file (check-then-write,
    # so not atomic, but portable across backends)
    if mode == "create" and self.exists(rpath):
        raise FileExistsError
    if os.path.isdir(lpath):
        # local directory: mirror it remotely, nothing to upload
        self.makedirs(rpath, exist_ok=True)
        return None

    with open(lpath, "rb") as f1:
        # seek-to-end gives the total size for progress reporting
        size = f1.seek(0, 2)
        callback.set_size(size)
        f1.seek(0)

        self.mkdirs(self._parent(os.fspath(rpath)), exist_ok=True)
        with self.open(rpath, "wb", **kwargs) as f2:
            while f1.tell() < size:
                data = f1.read(self.blocksize)
                segment_len = f2.write(data)
                if segment_len is None:
                    # some file-likes return None from write()
                    segment_len = len(data)
                callback.relative_update(segment_len)
1001
+
1002
def put(
    self,
    lpath,
    rpath,
    recursive=False,
    callback=DEFAULT_CALLBACK,
    maxdepth=None,
    **kwargs,
):
    """Copy file(s) from local.

    Copies a specific file or tree of files (if recursive=True). If rpath
    ends with a "/", it will be assumed to be a directory, and target files
    will go within.

    Calls put_file for each source.
    """
    if isinstance(lpath, list) and isinstance(rpath, list):
        # No need to expand paths when both source and destination
        # are provided as lists
        rpaths = rpath
        lpaths = lpath
    else:
        from .implementations.local import (
            LocalFileSystem,
            make_path_posix,
            trailing_sep,
        )

        source_is_str = isinstance(lpath, str)
        if source_is_str:
            lpath = make_path_posix(lpath)
        fs = LocalFileSystem()
        lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
        if source_is_str and (not recursive or maxdepth is not None):
            # Non-recursive glob does not copy directories
            lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
            if not lpaths:
                return

        source_is_file = len(lpaths) == 1
        dest_is_dir = isinstance(rpath, str) and (
            trailing_sep(rpath) or self.isdir(rpath)
        )

        rpath = (
            self._strip_protocol(rpath)
            if isinstance(rpath, str)
            else [self._strip_protocol(p) for p in rpath]
        )
        # whether destination names should be treated as already existing
        # directories when pairing sources to targets
        exists = source_is_str and (
            (has_magic(lpath) and source_is_file)
            or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
        )
        rpaths = other_paths(
            lpaths,
            rpath,
            exists=exists,
            flatten=not source_is_str,
        )

    callback.set_size(len(rpaths))
    for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
        # per-file progress is reported through a branched child callback
        with callback.branched(lpath, rpath) as child:
            self.put_file(lpath, rpath, callback=child, **kwargs)
1067
+
1068
def head(self, path, size=1024):
    """Get the first ``size`` bytes from file"""
    with self.open(path, "rb") as f:
        data = f.read(size)
    return data
1072
+
1073
def tail(self, path, size=1024):
    """Get the last ``size`` bytes from file"""
    with self.open(path, "rb") as f:
        # seek back from the end, but never before the start of the file
        f.seek(-min(size, f.size), 2)
        return f.read()
1078
+
1079
def cp_file(self, path1, path2, **kwargs):
    """Copy a single file within this filesystem (abstract)."""
    raise NotImplementedError
1081
+
1082
def copy(
    self, path1, path2, recursive=False, maxdepth=None, on_error=None, **kwargs
):
    """Copy within two locations in the filesystem

    on_error : "raise", "ignore"
        If raise, any not-found exceptions will be raised; if ignore any
        not-found exceptions will cause the path to be skipped; defaults to
        raise unless recursive is true, where the default is ignore
    """
    if on_error is None and recursive:
        on_error = "ignore"
    elif on_error is None:
        on_error = "raise"

    if isinstance(path1, list) and isinstance(path2, list):
        # No need to expand paths when both source and destination
        # are provided as lists
        paths1 = path1
        paths2 = path2
    else:
        from .implementations.local import trailing_sep

        source_is_str = isinstance(path1, str)
        paths1 = self.expand_path(path1, recursive=recursive, maxdepth=maxdepth)
        if source_is_str and (not recursive or maxdepth is not None):
            # Non-recursive glob does not copy directories
            paths1 = [p for p in paths1 if not (trailing_sep(p) or self.isdir(p))]
            if not paths1:
                return

        source_is_file = len(paths1) == 1
        dest_is_dir = isinstance(path2, str) and (
            trailing_sep(path2) or self.isdir(path2)
        )

        # whether destination names should be treated as already existing
        # directories when pairing sources to targets
        exists = source_is_str and (
            (has_magic(path1) and source_is_file)
            or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
        )
        paths2 = other_paths(
            paths1,
            path2,
            exists=exists,
            flatten=not source_is_str,
        )

    for p1, p2 in zip(paths1, paths2):
        try:
            self.cp_file(p1, p2, **kwargs)
        except FileNotFoundError:
            if on_error == "raise":
                raise
            # "ignore": skip sources that vanished during the copy
1135
+
1136
def expand_path(self, path, recursive=False, maxdepth=None, **kwargs):
    """Turn one or more globs or directories into a list of all matching paths
    to files or directories.

    kwargs are passed to ``glob`` or ``find``, which may in turn call ``ls``
    """

    if maxdepth is not None and maxdepth < 1:
        raise ValueError("maxdepth must be at least 1")

    if isinstance(path, (str, os.PathLike)):
        # normalize the scalar case to the list case
        out = self.expand_path([path], recursive, maxdepth)
    else:
        out = set()
        path = [self._strip_protocol(p) for p in path]
        for p in path:
            if has_magic(p):
                bit = set(self.glob(p, maxdepth=maxdepth, **kwargs))
                out |= bit
                if recursive:
                    # glob call above expanded one depth so if maxdepth is defined
                    # then decrement it in expand_path call below. If it is zero
                    # after decrementing then avoid expand_path call.
                    if maxdepth is not None and maxdepth <= 1:
                        continue
                    out |= set(
                        self.expand_path(
                            list(bit),
                            recursive=recursive,
                            maxdepth=maxdepth - 1 if maxdepth is not None else None,
                            **kwargs,
                        )
                    )
                continue
            elif recursive:
                rec = set(
                    self.find(
                        p, maxdepth=maxdepth, withdirs=True, detail=False, **kwargs
                    )
                )
                out |= rec
            if p not in out and (recursive is False or self.exists(p)):
                # should only check once, for the root
                out.add(p)
    if not out:
        raise FileNotFoundError(path)
    return sorted(out)
1183
+
1184
def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
    """Move file(s) from one location to another

    Implemented as copy-then-delete; the copy is forced to raise on any
    not-found error so the source is never removed after a partial copy.

    Fix: this previously passed ``onerror="raise"``, but ``copy``'s
    parameter is named ``on_error`` — the typo was swallowed by
    ``**kwargs`` and forwarded to ``cp_file``, so recursive moves silently
    ran with the default ``on_error="ignore"``, defeating the intent.
    """
    if path1 == path2:
        logger.debug("%s mv: The paths are the same, so no files were moved.", self)
    else:
        # explicitly raise exception to prevent data corruption
        self.copy(
            path1, path2, recursive=recursive, maxdepth=maxdepth, on_error="raise"
        )
        self.rm(path1, recursive=recursive)
1194
+
1195
+ def rm_file(self, path):
1196
+ """Delete a file"""
1197
+ self._rm(path)
1198
+
1199
+ def _rm(self, path):
1200
+ """Delete one file"""
1201
+ # this is the old name for the method, prefer rm_file
1202
+ raise NotImplementedError
1203
+
1204
+ def rm(self, path, recursive=False, maxdepth=None):
1205
+ """Delete files.
1206
+
1207
+ Parameters
1208
+ ----------
1209
+ path: str or list of str
1210
+ File(s) to delete.
1211
+ recursive: bool
1212
+ If file(s) are directories, recursively delete contents and then
1213
+ also remove the directory
1214
+ maxdepth: int or None
1215
+ Depth to pass to walk for finding files to delete, if recursive.
1216
+ If None, there will be no limit and infinite recursion may be
1217
+ possible.
1218
+ """
1219
+ path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
1220
+ for p in reversed(path):
1221
+ self.rm_file(p)
1222
+
1223
+ @classmethod
1224
+ def _parent(cls, path):
1225
+ path = cls._strip_protocol(path)
1226
+ if "/" in path:
1227
+ parent = path.rsplit("/", 1)[0].lstrip(cls.root_marker)
1228
+ return cls.root_marker + parent
1229
+ else:
1230
+ return cls.root_marker
1231
+
1232
    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Return raw bytes-mode file-like from the file-system

        Default implementation builds an ``AbstractBufferedFile`` over this
        filesystem; backends usually override this with a native file object.
        """
        return AbstractBufferedFile(
            self,
            path,
            mode,
            block_size,
            autocommit,
            cache_options=cache_options,
            **kwargs,
        )
1251
+
1252
    def open(
        self,
        path,
        mode="rb",
        block_size=None,
        cache_options=None,
        compression=None,
        **kwargs,
    ):
        """
        Return a file-like object from the filesystem

        The resultant instance must function correctly in a context ``with``
        block.

        Parameters
        ----------
        path: str
            Target file
        mode: str like 'rb', 'w'
            See builtin ``open()``
            Mode "x" (exclusive write) may be implemented by the backend. Even if
            it is, whether it is checked up front or on commit, and whether it is
            atomic is implementation-dependent.
        block_size: int
            Some indication of buffering - this is a value in bytes
        cache_options : dict, optional
            Extra arguments to pass through to the cache.
        compression: string or None
            If given, open file using compression codec. Can either be a compression
            name (a key in ``fsspec.compression.compr``) or "infer" to guess the
            compression from the filename suffix.
        encoding, errors, newline: passed on to TextIOWrapper for text mode
        """
        import io

        path = self._strip_protocol(path)
        if "b" not in mode:
            # text mode: open the equivalent binary file and wrap it in a
            # TextIOWrapper carrying the text-specific keyword arguments
            mode = mode.replace("t", "") + "b"

            text_kwargs = {
                k: kwargs.pop(k)
                for k in ["encoding", "errors", "newline"]
                if k in kwargs
            }
            return io.TextIOWrapper(
                self.open(
                    path,
                    mode,
                    block_size=block_size,
                    cache_options=cache_options,
                    compression=compression,
                    **kwargs,
                ),
                **text_kwargs,
            )
        else:
            # inside a transaction, files default to deferred commit
            ac = kwargs.pop("autocommit", not self._intrans)
            f = self._open(
                path,
                mode=mode,
                block_size=block_size,
                autocommit=ac,
                cache_options=cache_options,
                **kwargs,
            )
            if compression is not None:
                from fsspec.compression import compr
                from fsspec.core import get_compression

                # "infer" resolves the codec from the filename suffix
                compression = get_compression(path, compression)
                compress = compr[compression]
                f = compress(f, mode=mode[0])

            if not ac and "r" not in mode:
                # register write handle so the transaction can commit/discard it
                self.transaction.files.append(f)
            return f
1329
+
1330
+ def touch(self, path, truncate=True, **kwargs):
1331
+ """Create empty file, or update timestamp
1332
+
1333
+ Parameters
1334
+ ----------
1335
+ path: str
1336
+ file location
1337
+ truncate: bool
1338
+ If True, always set file size to 0; if False, update timestamp and
1339
+ leave file unchanged, if backend allows this
1340
+ """
1341
+ if truncate or not self.exists(path):
1342
+ with self.open(path, "wb", **kwargs):
1343
+ pass
1344
+ else:
1345
+ raise NotImplementedError # update timestamp, if possible
1346
+
1347
+ def ukey(self, path):
1348
+ """Hash of file properties, to tell if it has changed"""
1349
+ return sha256(str(self.info(path)).encode()).hexdigest()
1350
+
1351
    def read_block(self, fn, offset, length, delimiter=None):
        """Read a block of bytes from

        Starting at ``offset`` of the file, read ``length`` bytes. If
        ``delimiter`` is set then we ensure that the read starts and stops at
        delimiter boundaries that follow the locations ``offset`` and ``offset
        + length``. If ``offset`` is zero then we start at zero. The
        bytestring returned WILL include the end delimiter string.

        If offset+length is beyond the eof, reads to eof.

        Parameters
        ----------
        fn: string
            Path to filename
        offset: int
            Byte offset to start read
        length: int
            Number of bytes to read. If None, read to end.
        delimiter: bytes (optional)
            Ensure reading starts and stops at delimiter bytestring

        Examples
        --------
        >>> fs.read_block('data/file.csv', 0, 13)  # doctest: +SKIP
        b'Alice, 100\\nBo'
        >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\n'

        Use ``length=None`` to read to the end of the file.
        >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\nCharlie, 300'

        See Also
        --------
        :func:`fsspec.utils.read_block`
        """
        with self.open(fn, "rb") as f:
            size = f.size
            if length is None:
                length = size
            if size is not None and offset + length > size:
                # clamp so we never request bytes past end-of-file
                length = size - offset
            # delimiter-aware logic lives in the fsspec.utils.read_block helper
            return read_block(f, offset, length, delimiter)
1395
+
1396
    def to_json(self, *, include_password: bool = True) -> str:
        """
        JSON representation of this filesystem instance.

        Parameters
        ----------
        include_password: bool, default True
            Whether to include the password (if any) in the output.

        Returns
        -------
        JSON string with keys ``cls`` (the python location of this class),
        protocol (text name of this class's protocol, first one in case of
        multiple), ``args`` (positional args, usually empty), and all other
        keyword arguments as their own keys.

        Warnings
        --------
        Serialized filesystems may contain sensitive information which have been
        passed to the constructor, such as passwords and tokens. Make sure you
        store and send them in a secure environment!
        """
        from .json import FilesystemJSONEncoder

        # build a throwaway encoder subclass so ``include_password`` can be
        # threaded through json.dumps' class-based ``cls`` interface
        return json.dumps(
            self,
            cls=type(
                "_FilesystemJSONEncoder",
                (FilesystemJSONEncoder,),
                {"include_password": include_password},
            ),
        )
1428
+
1429
+ @staticmethod
1430
+ def from_json(blob: str) -> AbstractFileSystem:
1431
+ """
1432
+ Recreate a filesystem instance from JSON representation.
1433
+
1434
+ See ``.to_json()`` for the expected structure of the input.
1435
+
1436
+ Parameters
1437
+ ----------
1438
+ blob: str
1439
+
1440
+ Returns
1441
+ -------
1442
+ file system instance, not necessarily of this particular class.
1443
+
1444
+ Warnings
1445
+ --------
1446
+ This can import arbitrary modules (as determined by the ``cls`` key).
1447
+ Make sure you haven't installed any modules that may execute malicious code
1448
+ at import time.
1449
+ """
1450
+ from .json import FilesystemJSONDecoder
1451
+
1452
+ return json.loads(blob, cls=FilesystemJSONDecoder)
1453
+
1454
    def to_dict(self, *, include_password: bool = True) -> dict[str, Any]:
        """
        JSON-serializable dictionary representation of this filesystem instance.

        Parameters
        ----------
        include_password: bool, default True
            Whether to include the password (if any) in the output.

        Returns
        -------
        Dictionary with keys ``cls`` (the python location of this class),
        protocol (text name of this class's protocol, first one in case of
        multiple), ``args`` (positional args, usually empty), and all other
        keyword arguments as their own keys.

        Warnings
        --------
        Serialized filesystems may contain sensitive information which have been
        passed to the constructor, such as passwords and tokens. Make sure you
        store and send them in a secure environment!
        """
        from .json import FilesystemJSONEncoder

        json_encoder = FilesystemJSONEncoder()

        cls = type(self)
        proto = self.protocol

        # copy so the instance's own storage_options are never mutated
        storage_options = dict(self.storage_options)
        if not include_password:
            storage_options.pop("password", None)

        return dict(
            cls=f"{cls.__module__}:{cls.__name__}",
            protocol=proto[0] if isinstance(proto, (tuple, list)) else proto,
            args=json_encoder.make_serializable(self.storage_args),
            **json_encoder.make_serializable(storage_options),
        )
1493
+
1494
    @staticmethod
    def from_dict(dct: dict[str, Any]) -> AbstractFileSystem:
        """
        Recreate a filesystem instance from dictionary representation.

        See ``.to_dict()`` for the expected structure of the input.

        Parameters
        ----------
        dct: Dict[str, Any]

        Returns
        -------
        file system instance, not necessarily of this particular class.

        Warnings
        --------
        This can import arbitrary modules (as determined by the ``cls`` key).
        Make sure you haven't installed any modules that may execute malicious code
        at import time.
        """
        from .json import FilesystemJSONDecoder

        json_decoder = FilesystemJSONDecoder()

        dct = dict(dct)  # Defensive copy

        # resolve the target class from the "cls" key (may import modules)
        cls = FilesystemJSONDecoder.try_resolve_fs_cls(dct)
        if cls is None:
            raise ValueError("Not a serialized AbstractFileSystem")

        # remaining keys become constructor kwargs
        dct.pop("cls", None)
        dct.pop("protocol", None)

        return cls(
            *json_decoder.unmake_serializable(dct.pop("args", ())),
            **json_decoder.unmake_serializable(dct),
        )
1532
+
1533
+ def _get_pyarrow_filesystem(self):
1534
+ """
1535
+ Make a version of the FS instance which will be acceptable to pyarrow
1536
+ """
1537
+ # all instances already also derive from pyarrow
1538
+ return self
1539
+
1540
+ def get_mapper(self, root="", check=False, create=False, missing_exceptions=None):
1541
+ """Create key/value store based on this file-system
1542
+
1543
+ Makes a MutableMapping interface to the FS at the given root path.
1544
+ See ``fsspec.mapping.FSMap`` for further details.
1545
+ """
1546
+ from .mapping import FSMap
1547
+
1548
+ return FSMap(
1549
+ root,
1550
+ self,
1551
+ check=check,
1552
+ create=create,
1553
+ missing_exceptions=missing_exceptions,
1554
+ )
1555
+
1556
    @classmethod
    def clear_instance_cache(cls):
        """
        Clear the cache of filesystem instances.

        Notes
        -----
        Unless overridden by setting the ``cachable`` class attribute to False,
        the filesystem class stores a reference to newly created instances. This
        prevents Python's normal rules around garbage collection from working,
        since the instances refcount will not drop to zero until
        ``clear_instance_cache`` is called.
        """
        # drop all cached instances so they can be garbage collected
        cls._cache.clear()
1570
+
1571
+ def created(self, path):
1572
+ """Return the created timestamp of a file as a datetime.datetime"""
1573
+ raise NotImplementedError
1574
+
1575
+ def modified(self, path):
1576
+ """Return the modified timestamp of a file as a datetime.datetime"""
1577
+ raise NotImplementedError
1578
+
1579
    def tree(
        self,
        path: str = "/",
        recursion_limit: int = 2,
        max_display: int = 25,
        display_size: bool = False,
        prefix: str = "",
        is_last: bool = True,
        first: bool = True,
        indent_size: int = 4,
    ) -> str:
        """
        Return a tree-like structure of the filesystem starting from the given path as a string.

        Parameters
        ----------
        path: Root path to start traversal from
        recursion_limit: Maximum depth of directory traversal
        max_display: Maximum number of items to display per directory
        display_size: Whether to display file sizes
        prefix: Current line prefix for visual tree structure
        is_last: Whether current item is last in its level
        first: Whether this is the first call (displays root path)
        indent_size: Number of spaces by indent

        Returns
        -------
        str: A string representing the tree structure.

        Example
        -------
        >>> from fsspec import filesystem

        >>> fs = filesystem('ftp', host='test.rebex.net', user='demo', password='password')
        >>> tree = fs.tree(display_size=True, recursion_limit=3, indent_size=8, max_display=10)
        >>> print(tree)
        """

        def format_bytes(n: int) -> str:
            """Format bytes as text."""
            # NOTE: loop variable deliberately local; it shadows the outer
            # ``prefix`` parameter only inside this helper
            for prefix, k in (
                ("P", 2**50),
                ("T", 2**40),
                ("G", 2**30),
                ("M", 2**20),
                ("k", 2**10),
            ):
                if n >= 0.9 * k:
                    return f"{n / k:.2f} {prefix}b"
            return f"{n}B"

        result = []

        if first:
            # only the outermost call prints the root path itself
            result.append(path)

        if recursion_limit:
            indent = " " * indent_size
            contents = self.ls(path, detail=True)
            # directories first, then alphabetical within each group
            contents.sort(
                key=lambda x: (x.get("type") != "directory", x.get("name", ""))
            )

            if max_display is not None and len(contents) > max_display:
                displayed_contents = contents[:max_display]
                remaining_count = len(contents) - max_display
            else:
                displayed_contents = contents
                remaining_count = 0

            for i, item in enumerate(displayed_contents):
                # the truncation notice (if any) counts as a trailing sibling
                is_last_item = (i == len(displayed_contents) - 1) and (
                    remaining_count == 0
                )

                branch = (
                    "└" + ("─" * (indent_size - 2))
                    if is_last_item
                    else "├" + ("─" * (indent_size - 2))
                )
                branch += " "
                new_prefix = prefix + (
                    indent if is_last_item else "│" + " " * (indent_size - 1)
                )

                name = os.path.basename(item.get("name", ""))

                if display_size and item.get("type") == "directory":
                    # summarise a directory as counts of files/subfolders
                    sub_contents = self.ls(item.get("name", ""), detail=True)
                    num_files = sum(
                        1 for sub_item in sub_contents if sub_item.get("type") == "file"
                    )
                    num_folders = sum(
                        1
                        for sub_item in sub_contents
                        if sub_item.get("type") == "directory"
                    )

                    if num_files == 0 and num_folders == 0:
                        size = " (empty folder)"
                    elif num_files == 0:
                        size = f" ({num_folders} subfolder{'s' if num_folders > 1 else ''})"
                    elif num_folders == 0:
                        size = f" ({num_files} file{'s' if num_files > 1 else ''})"
                    else:
                        size = f" ({num_files} file{'s' if num_files > 1 else ''}, {num_folders} subfolder{'s' if num_folders > 1 else ''})"
                elif display_size and item.get("type") == "file":
                    size = f" ({format_bytes(item.get('size', 0))})"
                else:
                    size = ""

                result.append(f"{prefix}{branch}{name}{size}")

                if item.get("type") == "directory" and recursion_limit > 0:
                    # recurse with one less level of depth allowed
                    result.append(
                        self.tree(
                            path=item.get("name", ""),
                            recursion_limit=recursion_limit - 1,
                            max_display=max_display,
                            display_size=display_size,
                            prefix=new_prefix,
                            is_last=is_last_item,
                            first=False,
                            indent_size=indent_size,
                        )
                    )

            if remaining_count > 0:
                more_message = f"{remaining_count} more item(s) not displayed."
                result.append(
                    f"{prefix}{'└' + ('─' * (indent_size - 2))} {more_message}"
                )

        # drop empty strings produced by empty sub-trees
        return "\n".join(_ for _ in result if _)
1713
+
1714
    # ------------------------------------------------------------------------
    # Aliases
    #
    # Convenience names mirroring the primary API above; each simply forwards
    # to the canonical method of the same behavior.

    def read_bytes(self, path, start=None, end=None, **kwargs):
        """Alias of `AbstractFileSystem.cat_file`."""
        return self.cat_file(path, start=start, end=end, **kwargs)

    def write_bytes(self, path, value, **kwargs):
        """Alias of `AbstractFileSystem.pipe_file`."""
        self.pipe_file(path, value, **kwargs)

    def makedir(self, path, create_parents=True, **kwargs):
        """Alias of `AbstractFileSystem.mkdir`."""
        return self.mkdir(path, create_parents=create_parents, **kwargs)

    def mkdirs(self, path, exist_ok=False):
        """Alias of `AbstractFileSystem.makedirs`."""
        return self.makedirs(path, exist_ok=exist_ok)

    def listdir(self, path, detail=True, **kwargs):
        """Alias of `AbstractFileSystem.ls`."""
        return self.ls(path, detail=detail, **kwargs)

    def cp(self, path1, path2, **kwargs):
        """Alias of `AbstractFileSystem.copy`."""
        return self.copy(path1, path2, **kwargs)

    def move(self, path1, path2, **kwargs):
        """Alias of `AbstractFileSystem.mv`."""
        return self.mv(path1, path2, **kwargs)

    def stat(self, path, **kwargs):
        """Alias of `AbstractFileSystem.info`."""
        return self.info(path, **kwargs)

    def disk_usage(self, path, total=True, maxdepth=None, **kwargs):
        """Alias of `AbstractFileSystem.du`."""
        return self.du(path, total=total, maxdepth=maxdepth, **kwargs)

    def rename(self, path1, path2, **kwargs):
        """Alias of `AbstractFileSystem.mv`."""
        return self.mv(path1, path2, **kwargs)

    def delete(self, path, recursive=False, maxdepth=None):
        """Alias of `AbstractFileSystem.rm`."""
        return self.rm(path, recursive=recursive, maxdepth=maxdepth)

    def upload(self, lpath, rpath, recursive=False, **kwargs):
        """Alias of `AbstractFileSystem.put`."""
        return self.put(lpath, rpath, recursive=recursive, **kwargs)

    def download(self, rpath, lpath, recursive=False, **kwargs):
        """Alias of `AbstractFileSystem.get`."""
        return self.get(rpath, lpath, recursive=recursive, **kwargs)

    def sign(self, path, expiration=100, **kwargs):
        """Create a signed URL representing the given path

        Some implementations allow temporary URLs to be generated, as a
        way of delegating credentials.

        Parameters
        ----------
        path : str
            The path on the filesystem
        expiration : int
            Number of seconds to enable the URL for (if supported)

        Returns
        -------
        URL : str
            The signed URL

        Raises
        ------
        NotImplementedError : if method is not implemented for a filesystem
        """
        raise NotImplementedError("Sign is not implemented for this filesystem")

    def _isfilestore(self):
        # Originally inherited from pyarrow DaskFileSystem. Keeping this
        # here for backwards compatibility as long as pyarrow uses its
        # legacy fsspec-compatible filesystems and thus accepts fsspec
        # filesystems as well
        return False
1799
+
1800
+
1801
class AbstractBufferedFile(io.IOBase):
    """Convenient class to derive from to provide buffering

    In the case that the backend does not provide a pythonic file-like object
    already, this class contains much of the logic to build one. The only
    methods that need to be overridden are ``_upload_chunk``,
    ``_initiate_upload`` and ``_fetch_range``.
    """

    DEFAULT_BLOCK_SIZE = 5 * 2**20  # 5 MiB read/write buffer
    _details = None  # lazily-populated ``fs.info`` result; see ``details``

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        size=None,
        **kwargs,
    ):
        """
        Template for files with buffered reading and writing

        Parameters
        ----------
        fs: instance of FileSystem
        path: str
            location in file-system
        mode: str
            Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file
            systems may be read-only, and some may not support append.
        block_size: int
            Buffer size for reading or writing, 'default' for class default
        autocommit: bool
            Whether to write to final destination; may only impact what
            happens when file is being closed.
        cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead"
            Caching policy in read mode. See the definitions in ``core``.
        cache_options : dict
            Additional options passed to the constructor for the cache specified
            by `cache_type`.
        size: int
            If given and in read mode, suppressed having to look up the file size
        kwargs:
            Gets stored as self.kwargs
        """
        from .core import caches

        self.path = path
        self.fs = fs
        self.mode = mode
        self.blocksize = (
            self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size
        )
        self.loc = 0
        self.autocommit = autocommit
        self.end = None
        self.start = None
        self.closed = False

        if cache_options is None:
            cache_options = {}

        if "trim" in kwargs:
            warnings.warn(
                "Passing 'trim' to control the cache behavior has been deprecated. "
                "Specify it within the 'cache_options' argument instead.",
                FutureWarning,
            )
            cache_options["trim"] = kwargs.pop("trim")

        self.kwargs = kwargs

        if mode not in {"ab", "rb", "wb", "xb"}:
            raise NotImplementedError("File mode not supported")
        if mode == "rb":
            # read mode: need the file size and a block cache
            if size is not None:
                self.size = size
            else:
                self.size = self.details["size"]
            self.cache = caches[cache_type](
                self.blocksize, self._fetch_range, self.size, **cache_options
            )
        else:
            # write/append mode: accumulate into an in-memory buffer
            self.buffer = io.BytesIO()
            self.offset = None
            self.forced = False
            self.location = None

    @property
    def details(self):
        # lazily fetch and memoise the info dict for this path
        if self._details is None:
            self._details = self.fs.info(self.path)
        return self._details

    @details.setter
    def details(self, value):
        self._details = value
        self.size = value["size"]

    @property
    def full_name(self):
        # path re-qualified with the filesystem's protocol prefix
        return _unstrip_protocol(self.path, self.fs)

    @property
    def closed(self):
        # get around this attr being read-only in IOBase
        # use getattr here, since this can be called during del
        return getattr(self, "_closed", True)

    @closed.setter
    def closed(self, c):
        self._closed = c

    def __hash__(self):
        if "w" in self.mode:
            # write handles are only identical to themselves
            return id(self)
        else:
            return int(tokenize(self.details), 16)

    def __eq__(self, other):
        """Files are equal if they have the same checksum, only in read mode"""
        if self is other:
            return True
        return (
            isinstance(other, type(self))
            and self.mode == "rb"
            and other.mode == "rb"
            and hash(self) == hash(other)
        )

    def commit(self):
        """Move from temp to final destination"""

    def discard(self):
        """Throw away temporary file"""

    def info(self):
        """File information about this path"""
        if self.readable():
            return self.details
        else:
            raise ValueError("Info not available while writing")

    def tell(self):
        """Current file location"""
        return self.loc

    def seek(self, loc, whence=0):
        """Set current file location

        Parameters
        ----------
        loc: int
            byte location
        whence: {0, 1, 2}
            from start of file, current location or end of file, resp.
        """
        loc = int(loc)
        if not self.mode == "rb":
            raise OSError(ESPIPE, "Seek only available in read mode")
        if whence == 0:
            nloc = loc
        elif whence == 1:
            nloc = self.loc + loc
        elif whence == 2:
            nloc = self.size + loc
        else:
            raise ValueError(f"invalid whence ({whence}, should be 0, 1 or 2)")
        if nloc < 0:
            raise ValueError("Seek before start of file")
        self.loc = nloc
        return self.loc

    def write(self, data):
        """
        Write data to buffer.

        Buffer only sent on flush() or if buffer is greater than
        or equal to blocksize.

        Parameters
        ----------
        data: bytes
            Set of bytes to be written.
        """
        if not self.writable():
            raise ValueError("File not in write mode")
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if self.forced:
            raise ValueError("This file has been force-flushed, can only close")
        out = self.buffer.write(data)
        self.loc += out
        if self.buffer.tell() >= self.blocksize:
            # buffer is full enough to ship a chunk upstream
            self.flush()
        return out

    def flush(self, force=False):
        """
        Write buffered data to backend store.

        Writes the current buffer, if it is larger than the block-size, or if
        the file is being closed.

        Parameters
        ----------
        force: bool
            When closing, write the last block even if it is smaller than
            blocks are allowed to be. Disallows further writing to this file.
        """

        if self.closed:
            raise ValueError("Flush on closed file")
        if force and self.forced:
            raise ValueError("Force flush cannot be called more than once")
        if force:
            self.forced = True

        if self.readable():
            # no-op to flush on read-mode
            return

        if not force and self.buffer.tell() < self.blocksize:
            # Defer write on small block
            return

        if self.offset is None:
            # Initialize a multipart upload
            self.offset = 0
            try:
                self._initiate_upload()
            except:
                self.closed = True
                raise

        if self._upload_chunk(final=force) is not False:
            # chunk accepted: advance offset and start a fresh buffer
            self.offset += self.buffer.seek(0, 2)
            self.buffer = io.BytesIO()

    def _upload_chunk(self, final=False):
        """Write one part of a multi-block file upload

        Parameters
        ==========
        final: bool
            This is the last block, so should complete file, if
            self.autocommit is True.
        """
        # may not yet have been initialized, may need to call _initialize_upload

    def _initiate_upload(self):
        """Create remote file/upload"""
        pass

    def _fetch_range(self, start, end):
        """Get the specified set of bytes from remote"""
        return self.fs.cat_file(self.path, start=start, end=end)

    def read(self, length=-1):
        """
        Return data from cache, or fetch pieces as necessary

        Parameters
        ----------
        length: int (-1)
            Number of bytes to read; if <0, all remaining bytes.
        """
        length = -1 if length is None else int(length)
        if self.mode != "rb":
            raise ValueError("File not in read mode")
        if length < 0:
            length = self.size - self.loc
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if length == 0:
            # don't even bother calling fetch
            return b""
        out = self.cache._fetch(self.loc, self.loc + length)

        logger.debug(
            "%s read: %i - %i %s",
            self,
            self.loc,
            self.loc + length,
            self.cache._log_stats(),
        )
        self.loc += len(out)
        return out

    def readinto(self, b):
        """mirrors builtin file's readinto method

        https://docs.python.org/3/library/io.html#io.RawIOBase.readinto
        """
        out = memoryview(b).cast("B")
        data = self.read(out.nbytes)
        out[: len(data)] = data
        return len(data)

    def readuntil(self, char=b"\n", blocks=None):
        """Return data between current position and first occurrence of char

        char is included in the output, except if the end of the tile is
        encountered first.

        Parameters
        ----------
        char: bytes
            Thing to find
        blocks: None or int
            How much to read in each go. Defaults to file blocksize - which may
            mean a new read on every call.
        """
        out = []
        while True:
            start = self.tell()
            part = self.read(blocks or self.blocksize)
            if len(part) == 0:
                break
            found = part.find(char)
            if found > -1:
                # rewind so the file position sits just past the delimiter
                out.append(part[: found + len(char)])
                self.seek(start + found + len(char))
                break
            out.append(part)
        return b"".join(out)

    def readline(self):
        """Read until and including the first occurrence of newline character

        Note that, because of character encoding, this is not necessarily a
        true line ending.
        """
        return self.readuntil(b"\n")

    def __next__(self):
        out = self.readline()
        if out:
            return out
        raise StopIteration

    def __iter__(self):
        return self

    def readlines(self):
        """Return all data, split by the newline character, including the newline character"""
        data = self.read()
        lines = data.split(b"\n")
        out = [l + b"\n" for l in lines[:-1]]
        if data.endswith(b"\n"):
            return out
        else:
            return out + [lines[-1]]
        # return list(self) ???

    def readinto1(self, b):
        return self.readinto(b)

    def close(self):
        """Close file

        Finalizes writes, discards cache
        """
        if getattr(self, "_unclosable", False):
            return
        if self.closed:
            return
        try:
            if self.mode == "rb":
                self.cache = None
            else:
                if not self.forced:
                    # push out any remaining buffered bytes
                    self.flush(force=True)

                if self.fs is not None:
                    # listing/info caches are now stale for this path
                    self.fs.invalidate_cache(self.path)
                    self.fs.invalidate_cache(self.fs._parent(self.path))
        finally:
            self.closed = True

    def readable(self):
        """Whether opened for reading"""
        return "r" in self.mode and not self.closed

    def seekable(self):
        """Whether is seekable (only in read mode)"""
        return self.readable()

    def writable(self):
        """Whether opened for writing"""
        return self.mode in {"wb", "ab", "xb"} and not self.closed

    def __reduce__(self):
        if self.mode != "rb":
            raise RuntimeError("Pickling a writeable file is not supported")

        # reconstruct via module-level ``reopen`` so unpickling re-opens the
        # backend file and restores the read position
        return reopen, (
            self.fs,
            self.path,
            self.mode,
            self.blocksize,
            self.loc,
            self.size,
            self.autocommit,
            self.cache.name if self.cache else "none",
            self.kwargs,
        )

    def __del__(self):
        if not self.closed:
            self.close()

    def __str__(self):
        return f"<File-like object {type(self.fs).__name__}, {self.path}>"

    __repr__ = __str__

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
2228
+
2229
+
2230
def reopen(fs, path, mode, blocksize, loc, size, autocommit, cache_type, kwargs):
    """Re-open a file on ``fs`` (unpickling helper for AbstractBufferedFile).

    Opens ``path`` with the recorded settings and restores the previous
    read position ``loc``.
    """
    handle = fs.open(
        path,
        mode=mode,
        block_size=blocksize,
        autocommit=autocommit,
        cache_type=cache_type,
        size=size,
        **kwargs,
    )
    if loc > 0:
        # restore where the pickled file had read up to
        handle.seek(loc)
    return handle
.venv/lib/python3.11/site-packages/fsspec/transaction.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import deque
2
+
3
+
4
class Transaction:
    """Filesystem transaction write context

    Gathers files for deferred commit or discard, so that several write
    operations can be finalized semi-atomically. This works by having this
    instance as the ``.transaction`` attribute of the given filesystem
    """

    def __init__(self, fs, **kwargs):
        """
        Parameters
        ----------
        fs: FileSystem instance
        """
        self.fs = fs
        # Files are committed/discarded in FIFO order.
        self.files = deque()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """End transaction and commit, if exit is not due to exception"""
        # Commit only on a clean exit; any exception triggers a discard.
        self.complete(commit=exc_type is None)
        if self.fs:
            self.fs._intrans = False
            self.fs._transaction = None
            self.fs = None

    def start(self):
        """Start a transaction on this FileSystem"""
        # Drop anything left over from a previously failed completion.
        self.files = deque()
        self.fs._intrans = True

    def complete(self, commit=True):
        """Finish transaction: commit or discard all deferred files"""
        while self.files:
            pending = self.files.popleft()
            action = pending.commit if commit else pending.discard
            action()
        self.fs._intrans = False
        self.fs._transaction = None
        self.fs = None
50
+
51
+
52
class FileActor:
    """Holds a list of deferred files and commits/discards them in bulk.

    Plain container used as a remote (Dask) actor so that all workers share
    one set of pending files.
    """

    def __init__(self):
        self.files = []

    def commit(self):
        """Commit every tracked file, then forget them all."""
        for pending in self.files:
            pending.commit()
        self.files.clear()

    def discard(self):
        """Discard every tracked file, then forget them all."""
        for pending in self.files:
            pending.discard()
        self.files.clear()

    def append(self, f):
        """Track one more file for later commit/discard."""
        self.files.append(f)
68
+
69
+
70
class DaskTransaction(Transaction):
    """Transaction whose file bookkeeping lives in a remote Dask actor."""

    def __init__(self, fs):
        """
        Parameters
        ----------
        fs: FileSystem instance
        """
        import distributed

        super().__init__(fs)
        client = distributed.default_client()
        # Replace the local deque with a FileActor living on the cluster,
        # so pending files are shared across workers.
        self.files = client.submit(FileActor, actor=True).result()

    def complete(self, commit=True):
        """Finish transaction: commit or discard all deferred files"""
        # The commit/discard runs remotely on the actor; block until done.
        pending = self.files.commit() if commit else self.files.discard()
        pending.result()
        self.fs._intrans = False
        self.fs = None
.venv/lib/python3.11/site-packages/fsspec/utils.py ADDED
@@ -0,0 +1,739 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import logging
5
+ import math
6
+ import os
7
+ import re
8
+ import sys
9
+ import tempfile
10
+ from functools import partial
11
+ from hashlib import md5
12
+ from importlib.metadata import version
13
+ from typing import (
14
+ IO,
15
+ TYPE_CHECKING,
16
+ Any,
17
+ Callable,
18
+ Iterable,
19
+ Iterator,
20
+ Sequence,
21
+ TypeVar,
22
+ )
23
+ from urllib.parse import urlsplit
24
+
25
+ if TYPE_CHECKING:
26
+ import pathlib
27
+
28
+ from typing_extensions import TypeGuard
29
+
30
+ from fsspec.spec import AbstractFileSystem
31
+
32
+
33
+ DEFAULT_BLOCK_SIZE = 5 * 2**20
34
+
35
+ T = TypeVar("T")
36
+
37
+
38
def infer_storage_options(
    urlpath: str, inherit_storage_options: dict[str, Any] | None = None
) -> dict[str, Any]:
    """Infer storage options from URL path and merge it with existing storage
    options.

    Parameters
    ----------
    urlpath: str or unicode
        Either local absolute file path or URL (hdfs://namenode:8020/file.csv)
    inherit_storage_options: dict (optional)
        Its contents will get merged with the inferred information from the
        given path

    Returns
    -------
    Storage options dict.

    Examples
    --------
    >>> infer_storage_options('/mnt/datasets/test.csv')  # doctest: +SKIP
    {"protocol": "file", "path", "/mnt/datasets/test.csv"}
    >>> infer_storage_options(
    ...     'hdfs://username:pwd@node:123/mnt/datasets/test.csv?q=1',
    ...     inherit_storage_options={'extra': 'value'},
    ... )  # doctest: +SKIP
    {"protocol": "hdfs", "username": "username", "password": "pwd",
    "host": "node", "port": 123, "path": "/mnt/datasets/test.csv",
    "url_query": "q=1", "extra": "value"}
    """
    # Handle Windows paths including disk name in this special case
    # (e.g. "C:\\data\\file.csv"); anything without a scheme is local too.
    if (
        re.match(r"^[a-zA-Z]:[\\/]", urlpath)
        or re.match(r"^[a-zA-Z0-9]+://", urlpath) is None
    ):
        return {"protocol": "file", "path": urlpath}

    parsed_path = urlsplit(urlpath)
    protocol = parsed_path.scheme or "file"
    if parsed_path.fragment:
        # Keep the fragment as part of the path (urlsplit strips it).
        path = "#".join([parsed_path.path, parsed_path.fragment])
    else:
        path = parsed_path.path
    if protocol == "file":
        # Special case parsing file protocol URL on Windows according to:
        # https://msdn.microsoft.com/en-us/library/jj710207.aspx
        windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path)
        if windows_path:
            drive, path = windows_path.groups()
            path = f"{drive}:{path}"

    if protocol in ["http", "https"]:
        # for HTTP, we don't want to parse, as requests will anyway
        return {"protocol": protocol, "path": urlpath}

    options: dict[str, Any] = {"protocol": protocol, "path": path}

    if parsed_path.netloc:
        # Parse `hostname` from netloc manually because `parsed_path.hostname`
        # lowercases the hostname which is not always desirable (e.g. in S3):
        # https://github.com/dask/dask/issues/1417
        options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0]

        if protocol in ("s3", "s3a", "gcs", "gs"):
            # Bucket-style protocols keep the netloc as part of the path.
            # (The original code had a dead no-op ``options["host"] =
            # options["host"]`` else-branch here; removed.)
            options["path"] = options["host"] + options["path"]
        if parsed_path.port:
            options["port"] = parsed_path.port
        if parsed_path.username:
            options["username"] = parsed_path.username
        if parsed_path.password:
            options["password"] = parsed_path.password

    if parsed_path.query:
        options["url_query"] = parsed_path.query
    if parsed_path.fragment:
        options["url_fragment"] = parsed_path.fragment

    if inherit_storage_options:
        update_storage_options(options, inherit_storage_options)

    return options
121
+
122
+
123
+ def update_storage_options(
124
+ options: dict[str, Any], inherited: dict[str, Any] | None = None
125
+ ) -> None:
126
+ if not inherited:
127
+ inherited = {}
128
+ collisions = set(options) & set(inherited)
129
+ if collisions:
130
+ for collision in collisions:
131
+ if options.get(collision) != inherited.get(collision):
132
+ raise KeyError(
133
+ f"Collision between inferred and specified storage "
134
+ f"option:\n{collision}"
135
+ )
136
+ options.update(inherited)
137
+
138
+
139
+ # Compression extensions registered via fsspec.compression.register_compression
140
+ compressions: dict[str, str] = {}
141
+
142
+
143
def infer_compression(filename: str) -> str | None:
    """Infer compression, if available, from filename.

    Infer a named compression type, if registered and available, from filename
    extension. This includes builtin (gz, bz2, zip) compressions, as well as
    optional compressions. See fsspec.compression.register_compression.
    """
    suffix = os.path.splitext(filename)[-1].strip(".").lower()
    # Unregistered extensions map to None (no compression).
    return compressions.get(suffix)
154
+
155
+
156
def build_name_function(max_int: float) -> Callable[[int], str]:
    """Returns a function that receives a single integer
    and returns it as a string padded by enough zero characters
    to align with maximum possible integer

    >>> name_f = build_name_function(57)

    >>> name_f(7)
    '07'
    >>> name_f(31)
    '31'
    >>> build_name_function(1000)(42)
    '0042'
    >>> build_name_function(999)(42)
    '042'
    >>> build_name_function(0)(0)
    '0'
    """
    # Nudge above exact powers of 10 (and above 0) so log10 yields the
    # intended number of digits in the corner cases.
    width = int(math.ceil(math.log10(max_int + 1e-8)))

    def name_function(i: int) -> str:
        return str(i).zfill(width)

    return name_function
183
+
184
+
185
def seek_delimiter(file: IO[bytes], delimiter: bytes, blocksize: int) -> bool:
    r"""Seek current file to file start, file end, or byte after delimiter seq.

    Seeks file to next chunk delimiter, where chunks are defined on file start,
    a delimiting sequence, and file end. Use file.tell() to see location afterwards.
    Note that file start is a valid split, so must be at offset > 0 to seek for
    delimiter.

    Parameters
    ----------
    file: a file
    delimiter: bytes
        a delimiter like ``b'\n'`` or message sentinel, matching file .read() type
    blocksize: int
        Number of bytes to read from the file at once.


    Returns
    -------
    Returns True if a delimiter was found, False if at file start or end.

    """

    if file.tell() == 0:
        # beginning-of-file, return without seek
        return False

    # Interface is for binary IO, with delimiter as bytes, but initialize last
    # with result of file.read to preserve compatibility with text IO.
    last: bytes | None = None
    while True:
        current = file.read(blocksize)
        if not current:
            # end-of-file without delimiter
            return False
        # Prepend the tail of the previous chunk so a delimiter spanning two
        # reads is still found.
        full = last + current if last else current
        try:
            if delimiter in full:
                i = full.index(delimiter)
                # Rewind so the file position lands just past the delimiter.
                file.seek(file.tell() - (len(full) - i) + len(delimiter))
                return True
            elif len(current) < blocksize:
                # end-of-file without delimiter
                return False
        except (OSError, ValueError):
            # NOTE(review): best-effort guard — presumably covers exotic
            # file-likes where ``in``/``seek`` can fail; falls through to
            # keep scanning.
            pass
        # Keep just enough of this chunk to match a delimiter that straddles
        # the next read boundary.
        last = full[-len(delimiter) :]
232
+
233
+
234
def read_block(
    f: IO[bytes],
    offset: int,
    length: int | None,
    delimiter: bytes | None = None,
    split_before: bool = False,
) -> bytes:
    """Read a block of bytes from a file

    Parameters
    ----------
    f: File
        Open file
    offset: int
        Byte offset to start read
    length: int
        Number of bytes to read, read through end of file if None
    delimiter: bytes (optional)
        Ensure reading starts and stops at delimiter bytestring
    split_before: bool (optional)
        Start/stop read *before* delimiter bytestring.


    If using the ``delimiter=`` keyword argument we ensure that the read
    starts and stops at delimiter boundaries that follow the locations
    ``offset`` and ``offset + length``. If ``offset`` is zero then we
    start at zero, regardless of delimiter. The bytestring returned WILL
    include the terminating delimiter string.

    Examples
    --------

    >>> from io import BytesIO  # doctest: +SKIP
    >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300')  # doctest: +SKIP
    >>> read_block(f, 0, 13)  # doctest: +SKIP
    b'Alice, 100\\nBo'

    >>> read_block(f, 0, 13, delimiter=b'\\n')  # doctest: +SKIP
    b'Alice, 100\\nBob, 200\\n'

    >>> read_block(f, 10, 10, delimiter=b'\\n')  # doctest: +SKIP
    b'Bob, 200\\nCharlie, 300'
    """
    if delimiter:
        # Advance to the first delimiter at or after ``offset`` (no-op at
        # file start, since position 0 is always a valid split).
        f.seek(offset)
        found_start_delim = seek_delimiter(f, delimiter, 2**16)
        if length is None:
            return f.read()
        start = f.tell()
        # Shrink the requested length by however far the start moved.
        length -= start - offset

        f.seek(start + length)
        found_end_delim = seek_delimiter(f, delimiter, 2**16)
        end = f.tell()

        # Adjust split location to before delimiter if seek found the
        # delimiter sequence, not start or end of file.
        if found_start_delim and split_before:
            start -= len(delimiter)

        if found_end_delim and split_before:
            end -= len(delimiter)

        offset = start
        length = end - start

    f.seek(offset)

    # TODO: allow length to be None and read to the end of the file?
    assert length is not None
    b = f.read(length)
    return b
306
+
307
+
308
def tokenize(*args: Any, **kwargs: Any) -> str:
    """Deterministic token

    (modified from dask.base)

    >>> tokenize([1, 2, '3'])
    '9d71491b50023b06fc76928e6eddb952'

    >>> tokenize('Hello') == tokenize('Hello')
    True
    """
    if kwargs:
        # Fold keyword arguments into the positional tuple so they
        # participate in the hash.
        args += (kwargs,)
    payload = str(args).encode()
    try:
        hasher = md5(payload)
    except ValueError:
        # FIPS systems: https://github.com/fsspec/filesystem_spec/issues/380
        hasher = md5(payload, usedforsecurity=False)
    return hasher.hexdigest()
327
+
328
+
329
+ def stringify_path(filepath: str | os.PathLike[str] | pathlib.Path) -> str:
330
+ """Attempt to convert a path-like object to a string.
331
+
332
+ Parameters
333
+ ----------
334
+ filepath: object to be converted
335
+
336
+ Returns
337
+ -------
338
+ filepath_str: maybe a string version of the object
339
+
340
+ Notes
341
+ -----
342
+ Objects supporting the fspath protocol are coerced according to its
343
+ __fspath__ method.
344
+
345
+ For backwards compatibility with older Python version, pathlib.Path
346
+ objects are specially coerced.
347
+
348
+ Any other object is passed through unchanged, which includes bytes,
349
+ strings, buffers, or anything else that's not even path-like.
350
+ """
351
+ if isinstance(filepath, str):
352
+ return filepath
353
+ elif hasattr(filepath, "__fspath__"):
354
+ return filepath.__fspath__()
355
+ elif hasattr(filepath, "path"):
356
+ return filepath.path
357
+ else:
358
+ return filepath # type: ignore[return-value]
359
+
360
+
361
def make_instance(
    cls: Callable[..., T], args: Sequence[Any], kwargs: dict[str, Any]
) -> T:
    """Instantiate ``cls`` and let it determine its worker context.

    Used when re-creating filesystem instances from serialized state.
    """
    instance = cls(*args, **kwargs)
    # ``_determine_worker`` is defined on the target class, not here —
    # presumably re-binds the object to the current worker; confirm at caller.
    instance._determine_worker()  # type: ignore[attr-defined]
    return instance
367
+
368
+
369
def common_prefix(paths: Iterable[str]) -> str:
    """For a list of paths, find the shortest prefix common to all"""
    parts = [p.split("/") for p in paths]
    # Never compare beyond the shortest path's depth.
    depth = min(len(p) for p in parts)
    matched = 0
    while matched < depth and all(
        p[matched] == parts[0][matched] for p in parts
    ):
        matched += 1
    return "/".join(parts[0][:matched])
380
+
381
+
382
def other_paths(
    paths: list[str],
    path2: str | list[str],
    exists: bool = False,
    flatten: bool = False,
) -> list[str]:
    """In bulk file operations, construct a new file tree from a list of files

    Parameters
    ----------
    paths: list of str
        The input file tree
    path2: str or list of str
        Root to construct the new list in. If this is already a list of str, we just
        assert it has the right number of elements.
    exists: bool (optional)
        For a str destination, it is already exists (and is a dir), files should
        end up inside.
    flatten: bool (optional)
        Whether to flatten the input directory tree structure so that the output files
        are in the same directory.

    Returns
    -------
    list of str
    """

    if isinstance(path2, str):
        path2 = path2.rstrip("/")

        if flatten:
            # All outputs go directly under path2, keyed by basename only.
            path2 = ["/".join((path2, p.split("/")[-1])) for p in paths]
        else:
            cp = common_prefix(paths)
            if exists:
                # Destination dir already exists: keep the last level of the
                # common source prefix so files land inside it.
                cp = cp.rsplit("/", 1)[0]
            if not cp and all(not s.startswith("/") for s in paths):
                # Relative inputs with no shared prefix: join wholesale.
                path2 = ["/".join([path2, p]) for p in paths]
            else:
                # Swap the common source prefix for the destination root
                # (first occurrence only).
                path2 = [p.replace(cp, path2, 1) for p in paths]
    else:
        assert len(paths) == len(path2)
    return path2
425
+
426
+
427
def is_exception(obj: Any) -> bool:
    # True for any raisable instance, including BaseException subclasses
    # such as KeyboardInterrupt/SystemExit.
    return isinstance(obj, BaseException)
429
+
430
+
431
+ def isfilelike(f: Any) -> TypeGuard[IO[bytes]]:
432
+ return all(hasattr(f, attr) for attr in ["read", "close", "tell"])
433
+
434
+
435
def get_protocol(url: str) -> str:
    """Return the protocol prefix of ``url``, defaulting to ``file``."""
    url = stringify_path(url)
    # Split on the first "::" (chained filesystems) or "://" (plain URL);
    # the capture group keeps the separator, so a match yields 3 parts.
    head, *rest = re.split(r"(\:\:|\://)", url, maxsplit=1)
    return head if rest else "file"
441
+
442
+
443
def can_be_local(path: str) -> bool:
    """Can the given URL be used with open_local?"""
    from fsspec import get_filesystem_class

    try:
        fs_cls = get_filesystem_class(get_protocol(path))
    except (ValueError, ImportError):
        # not in registry or import failed
        return False
    # Filesystems advertising ``local_file`` expose real local paths.
    return getattr(fs_cls, "local_file", False)
452
+
453
+
454
+ def get_package_version_without_import(name: str) -> str | None:
455
+ """For given package name, try to find the version without importing it
456
+
457
+ Import and package.__version__ is still the backup here, so an import
458
+ *might* happen.
459
+
460
+ Returns either the version string, or None if the package
461
+ or the version was not readily found.
462
+ """
463
+ if name in sys.modules:
464
+ mod = sys.modules[name]
465
+ if hasattr(mod, "__version__"):
466
+ return mod.__version__
467
+ try:
468
+ return version(name)
469
+ except: # noqa: E722
470
+ pass
471
+ try:
472
+ import importlib
473
+
474
+ mod = importlib.import_module(name)
475
+ return mod.__version__
476
+ except (ImportError, AttributeError):
477
+ return None
478
+
479
+
480
+ def setup_logging(
481
+ logger: logging.Logger | None = None,
482
+ logger_name: str | None = None,
483
+ level: str = "DEBUG",
484
+ clear: bool = True,
485
+ ) -> logging.Logger:
486
+ if logger is None and logger_name is None:
487
+ raise ValueError("Provide either logger object or logger name")
488
+ logger = logger or logging.getLogger(logger_name)
489
+ handle = logging.StreamHandler()
490
+ formatter = logging.Formatter(
491
+ "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s -- %(message)s"
492
+ )
493
+ handle.setFormatter(formatter)
494
+ if clear:
495
+ logger.handlers.clear()
496
+ logger.addHandler(handle)
497
+ logger.setLevel(level)
498
+ return logger
499
+
500
+
501
def _unstrip_protocol(name: str, fs: AbstractFileSystem) -> str:
    # Module-level (picklable) wrapper around the filesystem's own method.
    return fs.unstrip_protocol(name)
503
+
504
+
505
def mirror_from(
    origin_name: str, methods: Iterable[str]
) -> Callable[[type[T]], type[T]]:
    """Mirror attributes and methods from the given
    origin_name attribute of the instance to the
    decorated class"""

    def _proxy(attr: str, self: Any) -> Any:
        # Look up the delegate object freshly on every access.
        delegate = getattr(self, origin_name)
        return getattr(delegate, attr)

    def wrapper(cls: type[T]) -> type[T]:
        # Each mirrored name becomes a read-only property forwarding to
        # the delegate.
        for attr in methods:
            setattr(cls, attr, property(partial(_proxy, attr)))
        return cls

    return wrapper
523
+
524
+
525
@contextlib.contextmanager
def nullcontext(obj: T) -> Iterator[T]:
    # No-op context manager: yields ``obj`` unchanged, no setup/teardown.
    yield obj
528
+
529
+
530
def merge_offset_ranges(
    paths: list[str],
    starts: list[int] | int,
    ends: list[int] | int,
    max_gap: int = 0,
    max_block: int | None = None,
    sort: bool = True,
) -> tuple[list[str], list[int], list[int]]:
    """Merge adjacent byte-offset ranges when the inter-range
    gap is <= `max_gap`, and when the merged byte range does not
    exceed `max_block` (if specified). By default, this function
    will re-order the input paths and byte ranges to ensure sorted
    order. If the user can guarantee that the inputs are already
    sorted, passing `sort=False` will skip the re-ordering.
    """
    # Check input
    if not isinstance(paths, list):
        raise TypeError
    # Scalar starts/ends are broadcast to every path.
    if not isinstance(starts, list):
        starts = [starts] * len(paths)
    if not isinstance(ends, list):
        ends = [ends] * len(paths)
    if len(starts) != len(paths) or len(ends) != len(paths):
        raise ValueError

    # Early Return
    if len(starts) <= 1:
        return paths, starts, ends

    # Normalize None starts to 0 so gap arithmetic below is well-defined.
    starts = [s or 0 for s in starts]
    # Sort by paths and then ranges if `sort=True`
    if sort:
        paths, starts, ends = (
            list(v)
            for v in zip(
                *sorted(
                    zip(paths, starts, ends),
                )
            )
        )

    if paths:
        # Loop through the coupled `paths`, `starts`, and
        # `ends`, and merge adjacent blocks when appropriate
        new_paths = paths[:1]
        new_starts = starts[:1]
        new_ends = ends[:1]
        for i in range(1, len(paths)):
            # A None end means "to end of file" — anything after it on the
            # same path is already covered.
            if paths[i] == paths[i - 1] and new_ends[-1] is None:
                continue
            elif (
                paths[i] != paths[i - 1]
                or ((starts[i] - new_ends[-1]) > max_gap)
                or (max_block is not None and (ends[i] - new_starts[-1]) > max_block)
            ):
                # Cannot merge with previous block.
                # Add new `paths`, `starts`, and `ends` elements
                new_paths.append(paths[i])
                new_starts.append(starts[i])
                new_ends.append(ends[i])
            else:
                # Merge with previous block by updating the
                # last element of `ends`
                new_ends[-1] = ends[i]
        return new_paths, new_starts, new_ends

    # `paths` is empty. Just return input lists
    return paths, starts, ends
598
+
599
+
600
def file_size(filelike: IO[bytes]) -> int:
    """Find length of any open read-mode file-like"""
    original_position = filelike.tell()
    try:
        # seek(0, 2) returns the new absolute offset, i.e. the total length.
        return filelike.seek(0, 2)
    finally:
        # Always restore the caller's position.
        filelike.seek(original_position)
607
+
608
+
609
@contextlib.contextmanager
def atomic_write(path: str, mode: str = "wb"):
    """
    A context manager that opens a temporary file next to `path` and, on exit,
    replaces `path` with the temporary file, thereby updating `path`
    atomically.
    """
    # The temp file must live in the same directory as ``path`` so the final
    # os.replace is a same-filesystem (atomic) rename.
    fd, tmp_name = tempfile.mkstemp(
        dir=os.path.dirname(path), prefix=os.path.basename(path) + "-"
    )
    try:
        with open(fd, mode) as fp:
            yield fp
    except BaseException:
        # Failed mid-write: drop the partial temp file, then propagate.
        with contextlib.suppress(FileNotFoundError):
            os.unlink(tmp_name)
        raise
    else:
        os.replace(tmp_name, path)
628
+
629
+
630
def _translate(pat, STAR, QUESTION_MARK):
    # Copied from: https://github.com/python/cpython/pull/106703.
    # Translate one glob path segment into a list of regex fragments, with
    # ``*`` and ``?`` replaced by the caller-supplied STAR / QUESTION_MARK
    # regex pieces (compared by identity for the `*`-run compression below).
    res: list[str] = []
    add = res.append
    i, n = 0, len(pat)
    while i < n:
        c = pat[i]
        i = i + 1
        if c == "*":
            # compress consecutive `*` into one
            if (not res) or res[-1] is not STAR:
                add(STAR)
        elif c == "?":
            add(QUESTION_MARK)
        elif c == "[":
            # Character class: find the closing "]" (an initial "!" or "]"
            # is part of the class, not a terminator).
            j = i
            if j < n and pat[j] == "!":
                j = j + 1
            if j < n and pat[j] == "]":
                j = j + 1
            while j < n and pat[j] != "]":
                j = j + 1
            if j >= n:
                # Unterminated class: treat the "[" literally.
                add("\\[")
            else:
                stuff = pat[i:j]
                if "-" not in stuff:
                    stuff = stuff.replace("\\", r"\\")
                else:
                    # Split the class into range chunks around "-".
                    chunks = []
                    k = i + 2 if pat[i] == "!" else i + 1
                    while True:
                        k = pat.find("-", k, j)
                        if k < 0:
                            break
                        chunks.append(pat[i:k])
                        i = k + 1
                        k = k + 3
                    chunk = pat[i:j]
                    if chunk:
                        chunks.append(chunk)
                    else:
                        chunks[-1] += "-"
                    # Remove empty ranges -- invalid in RE.
                    for k in range(len(chunks) - 1, 0, -1):
                        if chunks[k - 1][-1] > chunks[k][0]:
                            chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:]
                            del chunks[k]
                    # Escape backslashes and hyphens for set difference (--).
                    # Hyphens that create ranges shouldn't be escaped.
                    stuff = "-".join(
                        s.replace("\\", r"\\").replace("-", r"\-") for s in chunks
                    )
                # Escape set operations (&&, ~~ and ||).
                stuff = re.sub(r"([&~|])", r"\\\1", stuff)
                i = j + 1
                if not stuff:
                    # Empty range: never match.
                    add("(?!)")
                elif stuff == "!":
                    # Negated empty range: match any character.
                    add(".")
                else:
                    if stuff[0] == "!":
                        stuff = "^" + stuff[1:]
                    elif stuff[0] in ("^", "["):
                        stuff = "\\" + stuff
                    add(f"[{stuff}]")
        else:
            add(re.escape(c))
    assert i == n
    return res
702
+
703
+
704
def glob_translate(pat):
    # Copied from: https://github.com/python/cpython/pull/106703.
    # The keyword parameters' values are fixed to:
    # recursive=True, include_hidden=True, seps=None
    """Translate a pathname with shell wildcards to a regular expression."""
    # Build separator classes from the platform separators (both on Windows).
    if os.path.altsep:
        seps = os.path.sep + os.path.altsep
    else:
        seps = os.path.sep
    escaped_seps = "".join(map(re.escape, seps))
    any_sep = f"[{escaped_seps}]" if len(seps) > 1 else escaped_seps
    not_sep = f"[^{escaped_seps}]"
    # Regex building blocks for one segment / trailing segment / "**".
    one_last_segment = f"{not_sep}+"
    one_segment = f"{one_last_segment}{any_sep}"
    any_segments = f"(?:.+{any_sep})?"
    any_last_segments = ".*"
    results = []
    parts = re.split(any_sep, pat)
    last_part_idx = len(parts) - 1
    for idx, part in enumerate(parts):
        if part == "*":
            results.append(one_segment if idx < last_part_idx else one_last_segment)
            continue
        if part == "**":
            results.append(any_segments if idx < last_part_idx else any_last_segments)
            continue
        elif "**" in part:
            raise ValueError(
                "Invalid pattern: '**' can only be an entire path component"
            )
        if part:
            # Ordinary segment: per-character translation, with "*"
            # restricted to non-separator characters.
            results.extend(_translate(part, f"{not_sep}*", not_sep))
        if idx < last_part_idx:
            results.append(any_sep)
    res = "".join(results)
    # DOTALL group + \Z anchors the match to the whole string.
    return rf"(?s:{res})\Z"
.venv/lib/python3.11/site-packages/functorch/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (907 Bytes). View file
 
.venv/lib/python3.11/site-packages/functorch/_src/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/functorch/_src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (187 Bytes). View file
 
.venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # This file has moved to under torch/_functorch. It is not public API.
2
+ # If you are not a PyTorch developer and you are relying on the following
3
+ # imports, please file an issue.
4
+ from torch._functorch.aot_autograd import (
5
+ aot_autograd_decompositions,
6
+ KNOWN_TYPES,
7
+ PytreeThunk,
8
+ )
.venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (375 Bytes). View file
 
.venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # This file has moved to under torch/_functorch. It is not public API.
2
+ # If you are not a PyTorch developer and you are relying on the following
3
+ # imports, please file an issue.
4
+ from torch._functorch.eager_transforms import (
5
+ _assert_wrapped_functional,
6
+ _unwrap_functional_tensor,
7
+ )
.venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (364 Bytes). View file
 
.venv/lib/python3.11/site-packages/functorch/_src/make_functional/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # This file has moved to under torch/_functorch. It is not public API.
2
+ # If you are not a PyTorch developer and you are relying on the following
3
+ # imports, please file an issue.
4
+ from torch._functorch.make_functional import _swap_state
.venv/lib/python3.11/site-packages/functorch/_src/make_functional/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (287 Bytes). View file
 
.venv/lib/python3.11/site-packages/functorch/_src/vmap/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file has moved to under torch/_functorch. It is not public API.
2
+ # If you are not a PyTorch developer and you are relying on the following
3
+ # imports, please file an issue.
4
+ from torch._functorch.vmap import (
5
+ _add_batch_dim,
6
+ _broadcast_to_and_flatten,
7
+ _create_batched_inputs,
8
+ _get_name,
9
+ _process_batched_inputs,
10
+ _remove_batch_dim,
11
+ _unwrap_batched,
12
+ _validate_and_get_batch_size,
13
+ Tensor,
14
+ tree_flatten,
15
+ tree_unflatten,
16
+ )
.venv/lib/python3.11/site-packages/functorch/_src/vmap/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (663 Bytes). View file
 
.venv/lib/python3.11/site-packages/functorch/compile/__init__.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch._functorch import config
2
+ from torch._functorch.aot_autograd import (
3
+ aot_function,
4
+ aot_module,
5
+ aot_module_simplified,
6
+ compiled_function,
7
+ compiled_module,
8
+ get_aot_compilation_context,
9
+ get_aot_graph_name,
10
+ get_graph_being_compiled,
11
+ make_boxed_compiler,
12
+ make_boxed_func,
13
+ )
14
+ from torch._functorch.compilers import (
15
+ debug_compile,
16
+ default_decompositions,
17
+ draw_graph_compile,
18
+ memory_efficient_fusion,
19
+ nnc_jit,
20
+ nop,
21
+ print_compile,
22
+ ts_compile,
23
+ )
24
+ from torch._functorch.fx_minifier import minifier
25
+ from torch._functorch.partitioners import (
26
+ default_partition,
27
+ draw_graph,
28
+ min_cut_rematerialization_partition,
29
+ )
30
+ from torch._functorch.python_key import pythonkey_decompose
.venv/lib/python3.11/site-packages/functorch/compile/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.39 kB). View file
 
.venv/lib/python3.11/site-packages/functorch/dim/__init__.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dis
2
+ import inspect
3
+ from typing import Sequence, Union
4
+
5
+ import functorch._C
6
+ import torch
7
+ from functorch._C import dim as _C
8
+
9
+ from .tree_map import tree_flatten, tree_map
10
+ from .wrap_type import wrap_type
11
+
12
+
13
+ _C._patch_tensor_class()
14
+ dims, DimList, dimlists = _C.dims, _C.DimList, _C.dimlists
15
+
16
+
17
+ class DimensionMismatchError(Exception):
18
+ pass
19
+
20
+
21
+ class DimensionBindError(Exception):
22
+ pass
23
+
24
+
25
+ from . import op_properties
26
+
27
+
28
+ # use dict to avoid writing C++ bindings for set
29
+ pointwise = dict.fromkeys(op_properties.pointwise, True)
30
+
31
+ use_c = True
32
+ if not use_c:
33
+ from . import reference
34
+
35
+
36
+ class _Tensor:
37
+ # fast path around slow wrapping/unwrapping logic for simply queries used
38
+ # by the implementation...
39
+
40
+ @property
41
+ def dims(self):
42
+ return tuple(d for d in self._levels if isinstance(d, Dim))
43
+
44
+ def dim(self):
45
+ return self.ndim
46
+
47
+ if use_c:
48
+ __torch_function__ = classmethod(_C.__torch_function__)
49
+ expand = _C._instancemethod(_C.expand)
50
+ else:
51
+ __torch_function__ = reference.__torch_function__
52
+ expand = reference.expand
53
+
54
+ index = _C._instancemethod(_C.index)
55
+
56
+ def __repr__(self):
57
+ tensor, levels, ndim = self._tensor, self._levels, self.ndim
58
+ return f"{tensor}\nwith dims={tuple(l + ndim if isinstance(l, int) else l for l in levels)} sizes={tuple(tensor.size())}"
59
+
60
+
61
+ TensorLike = (_Tensor, torch.Tensor)
62
+
63
+
64
+ class Dim(_C.Dim, _Tensor):
65
+ # note that _C.Dim comes before tensor because we want the Dim API for things like size to take precendence.
66
+ # Tensor defines format, but we want to print Dims with special formatting
67
+ __format__ = object.__format__
68
+
69
+
70
+ class Tensor(_Tensor, _C.Tensor):
71
+ if not use_c:
72
+ from_batched = staticmethod(_C.Tensor_from_batched)
73
+ from_positional = staticmethod(_C.Tensor_from_positional)
74
+ sum = _C._instancemethod(_C.Tensor_sum)
75
+
76
+
77
+ def cat(tensors, dim, new_dim):
78
+ n = dims()
79
+ return stack(tensors, n, dim).index([n, dim], new_dim)
80
+
81
+
82
+ if use_c:
83
+ _wrap = _C._wrap
84
+
85
+ def _def(name, *args, **kwargs):
86
+ orig = getattr(torch.Tensor, name)
87
+ setattr(_Tensor, name, _C._instancemethod(_wrap(orig, *args, **kwargs)))
88
+
89
+ t__getitem__ = _C._instancemethod(_C.__getitem__)
90
+ stack = _C.stack
91
+ split = _C._instancemethod(_C.split)
92
+ else:
93
+ _wrap, _def = reference._wrap, reference._def
94
+ t__getitem__ = reference.t__getitem__
95
+ stack = reference.stack
96
+ split = reference.split
97
+
98
+ # note: there is no python reference
99
+ t__setitem__ = _C._instancemethod(_C.__setitem__)
100
+ # this is patched in the C API because otherwise torch.Tensor will
101
+ # no longer be considered a sequence and things will break
102
+ # torch.Tensor.__getitem__ = t__getitem__
103
+
104
+ _Tensor.__getitem__ = t__getitem__
105
+ # torch.Tensor.__setitem__ = t__setitem__
106
+ _Tensor.__setitem__ = t__setitem__
107
+
108
+ torch.Tensor.split = split
109
+ _Tensor.split = split
110
+ torch.Tensor.expand = _C._instancemethod(_C.expand)
111
+ torch.Tensor.index = _C._instancemethod(_C.index)
112
+ wrap_type(use_c, _Tensor, torch.Tensor, _Tensor.__torch_function__)
113
+ del _Tensor.ndim
114
+
115
+ if use_c:
116
+ _Tensor.order = _C._instancemethod(_C.order)
117
+ else:
118
+ _Tensor.order = reference.positional
119
+
120
+ _def("mean")
121
+ _def("sum")
122
+ _def("all")
123
+ _def("amax")
124
+ _def("amin")
125
+ _def("aminmax")
126
+ _def("any")
127
+ _def("count_nonzero")
128
+ _def("logsumexp")
129
+ _def("nanmean")
130
+ _def("nansum")
131
+ _def("prod")
132
+ _def("std", keepdim_offset=2)
133
+ _def("var", keepdim_offset=2)
134
+ _def("max", single_dim=True)
135
+ _def("min", single_dim=True)
136
+ _def("argmax", single_dim=True)
137
+ _def("argmin", single_dim=True)
138
+ _def("kthvalue", single_dim=True)
139
+ _def("median", single_dim=True)
140
+ _def("nanmedian", single_dim=True)
141
+ _def("mode", single_dim=True)
142
+ _def("sort", reduce=False)
143
+ _def("argsort", reduce=False)
144
+ _def("unbind", single_dim=True)
145
+ _def("chunk", dim_offset=1, reduce=False)
146
+ _def("cummax", single_dim=True, reduce=False)
147
+ _def("cummin", single_dim=True, reduce=False)
148
+ _def("cumprod", single_dim=True, reduce=False)
149
+ _def("cumprod_", single_dim=True, reduce=False)
150
+ _def("cumsum", single_dim=True, reduce=False)
151
+ _def("cumsum_", single_dim=True, reduce=False)
152
+ _def("logcumsumexp", single_dim=True, reduce=False)
153
+ _def("renorm", dim_offset=1, single_dim=True, reduce=False)
154
+ _def("softmax", single_dim=True, reduce=False)
155
+ softmax = _wrap(torch.nn.functional.softmax, single_dim=True, reduce=False)
156
+
157
+ # stuff to handle in the future, because they require special
158
+ # binding logic for dims
159
+ # cross
160
+ # diag_embed
161
+ # diagonal
162
+ # diagonal_scatter
163
+ # diff
164
+ # nanquantile
165
+ # quantile
166
+ # roll
167
+ # rot90
168
+ # topk (new dimes on output)
169
+ # should these all be subsumed by inplace indexing?
170
+ # index_add_
171
+ # index_add
172
+ # index_copy
173
+ # index_copy_
174
+ # index_fill
175
+ # index_fill_
176
+ # index_select
177
+ # scatter
178
+ # scatter_
179
+ # scatter_add
180
+ # scatter_add_
181
+ # scatter_reduce
.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (7.86 kB). View file
 
.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/batch_tensor.cpython-311.pyc ADDED
Binary file (1.25 kB). View file
 
.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/delayed_mul_tensor.cpython-311.pyc ADDED
Binary file (5.57 kB). View file
 
.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/dim.cpython-311.pyc ADDED
Binary file (7.06 kB). View file