Add files using upload-large-folder tool
Browse files — This view is limited to 50 files because it contains too many changes.
See raw diff
- .venv/lib/python3.11/site-packages/fsspec/__init__.py +69 -0
- .venv/lib/python3.11/site-packages/fsspec/_version.py +16 -0
- .venv/lib/python3.11/site-packages/fsspec/archive.py +75 -0
- .venv/lib/python3.11/site-packages/fsspec/asyn.py +1098 -0
- .venv/lib/python3.11/site-packages/fsspec/caching.py +966 -0
- .venv/lib/python3.11/site-packages/fsspec/callbacks.py +324 -0
- .venv/lib/python3.11/site-packages/fsspec/compression.py +175 -0
- .venv/lib/python3.11/site-packages/fsspec/config.py +131 -0
- .venv/lib/python3.11/site-packages/fsspec/conftest.py +55 -0
- .venv/lib/python3.11/site-packages/fsspec/core.py +743 -0
- .venv/lib/python3.11/site-packages/fsspec/dircache.py +98 -0
- .venv/lib/python3.11/site-packages/fsspec/exceptions.py +18 -0
- .venv/lib/python3.11/site-packages/fsspec/fuse.py +324 -0
- .venv/lib/python3.11/site-packages/fsspec/generic.py +411 -0
- .venv/lib/python3.11/site-packages/fsspec/gui.py +416 -0
- .venv/lib/python3.11/site-packages/fsspec/implementations/arrow.py +304 -0
- .venv/lib/python3.11/site-packages/fsspec/implementations/dask.py +152 -0
- .venv/lib/python3.11/site-packages/fsspec/implementations/dbfs.py +467 -0
- .venv/lib/python3.11/site-packages/fsspec/implementations/dirfs.py +384 -0
- .venv/lib/python3.11/site-packages/fsspec/implementations/jupyter.py +124 -0
- .venv/lib/python3.11/site-packages/fsspec/implementations/local.py +476 -0
- .venv/lib/python3.11/site-packages/fsspec/implementations/reference.py +1306 -0
- .venv/lib/python3.11/site-packages/fsspec/implementations/sftp.py +180 -0
- .venv/lib/python3.11/site-packages/fsspec/implementations/tar.py +124 -0
- .venv/lib/python3.11/site-packages/fsspec/implementations/webhdfs.py +485 -0
- .venv/lib/python3.11/site-packages/fsspec/json.py +121 -0
- .venv/lib/python3.11/site-packages/fsspec/mapping.py +251 -0
- .venv/lib/python3.11/site-packages/fsspec/parquet.py +541 -0
- .venv/lib/python3.11/site-packages/fsspec/registry.py +315 -0
- .venv/lib/python3.11/site-packages/fsspec/spec.py +2242 -0
- .venv/lib/python3.11/site-packages/fsspec/transaction.py +90 -0
- .venv/lib/python3.11/site-packages/fsspec/utils.py +739 -0
- .venv/lib/python3.11/site-packages/functorch/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/functorch/_src/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/functorch/_src/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__init__.py +8 -0
- .venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__init__.py +7 -0
- .venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/functorch/_src/make_functional/__init__.py +4 -0
- .venv/lib/python3.11/site-packages/functorch/_src/make_functional/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/functorch/_src/vmap/__init__.py +16 -0
- .venv/lib/python3.11/site-packages/functorch/_src/vmap/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/functorch/compile/__init__.py +30 -0
- .venv/lib/python3.11/site-packages/functorch/compile/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/functorch/dim/__init__.py +181 -0
- .venv/lib/python3.11/site-packages/functorch/dim/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/functorch/dim/__pycache__/batch_tensor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/functorch/dim/__pycache__/delayed_mul_tensor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/functorch/dim/__pycache__/dim.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/fsspec/__init__.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from importlib.metadata import entry_points
|
| 2 |
+
|
| 3 |
+
from . import caching
|
| 4 |
+
from ._version import __version__ # noqa: F401
|
| 5 |
+
from .callbacks import Callback
|
| 6 |
+
from .compression import available_compressions
|
| 7 |
+
from .core import get_fs_token_paths, open, open_files, open_local, url_to_fs
|
| 8 |
+
from .exceptions import FSTimeoutError
|
| 9 |
+
from .mapping import FSMap, get_mapper
|
| 10 |
+
from .registry import (
|
| 11 |
+
available_protocols,
|
| 12 |
+
filesystem,
|
| 13 |
+
get_filesystem_class,
|
| 14 |
+
register_implementation,
|
| 15 |
+
registry,
|
| 16 |
+
)
|
| 17 |
+
from .spec import AbstractFileSystem
|
| 18 |
+
|
| 19 |
+
__all__ = [
|
| 20 |
+
"AbstractFileSystem",
|
| 21 |
+
"FSTimeoutError",
|
| 22 |
+
"FSMap",
|
| 23 |
+
"filesystem",
|
| 24 |
+
"register_implementation",
|
| 25 |
+
"get_filesystem_class",
|
| 26 |
+
"get_fs_token_paths",
|
| 27 |
+
"get_mapper",
|
| 28 |
+
"open",
|
| 29 |
+
"open_files",
|
| 30 |
+
"open_local",
|
| 31 |
+
"registry",
|
| 32 |
+
"caching",
|
| 33 |
+
"Callback",
|
| 34 |
+
"available_protocols",
|
| 35 |
+
"available_compressions",
|
| 36 |
+
"url_to_fs",
|
| 37 |
+
]
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def process_entries():
    """Register filesystem implementations advertised via the
    ``fsspec.specs`` entry-point group.

    Executed once at import time. Duplicate entry names are skipped;
    our registrations clobber any pre-existing ones.
    """
    if entry_points is None:
        return
    try:
        eps = entry_points()
    except TypeError:  # importlib-metadata < 0.8
        return
    if hasattr(eps, "select"):  # Python 3.10+ / importlib_metadata >= 3.9.0
        specs = eps.select(group="fsspec.specs")
    else:
        specs = eps.get("fsspec.specs", [])
    seen = set()
    for spec in specs:
        proto = spec.name
        if proto in seen:
            continue
        seen.add(proto)
        register_implementation(
            proto,
            spec.value.replace(":", "."),
            errtxt=f"Unable to load filesystem from {spec}",
            # We take our implementations as the ones to overload with if
            # for some reason we encounter some, may be the same, already
            # registered
            clobber=True,
        )


process_entries()
|
.venv/lib/python3.11/site-packages/fsspec/_version.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# file generated by setuptools_scm
# don't change, don't track in version control
# A local False constant (not typing.TYPE_CHECKING) so that static type
# checkers see the precise alias while the runtime never imports typing.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple, Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
else:
    # At runtime the annotation target below only needs to be *something*.
    VERSION_TUPLE = object

# Annotations for the module's public version attributes.
version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE

__version__ = version = '2025.2.0'
__version_tuple__ = version_tuple = (2025, 2, 0)
|
.venv/lib/python3.11/site-packages/fsspec/archive.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import operator
|
| 2 |
+
|
| 3 |
+
from fsspec import AbstractFileSystem
|
| 4 |
+
from fsspec.utils import tokenize
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class AbstractArchiveFileSystem(AbstractFileSystem):
    """
    A generic superclass for implementing Archive-based filesystems.

    Currently, it is shared amongst
    :class:`~fsspec.implementations.zip.ZipFileSystem`,
    :class:`~fsspec.implementations.libarchive.LibArchiveFileSystem` and
    :class:`~fsspec.implementations.tar.TarFileSystem`.

    NOTE(review): methods below read ``self.dir_cache``, ``self.fo`` and
    call ``self._get_dirs()`` — presumably provided by the concrete
    subclasses, which are outside this view; confirm there.
    """

    def __str__(self):
        return f"<Archive-like object {type(self).__name__} at {id(self)}>"

    # repr is intentionally the same as str for these objects
    __repr__ = __str__

    def ukey(self, path):
        """Unique token for *path* within this archive.

        Combines the path, the archive source object (``self.fo``) and the
        protocol, so the same path in different archives hashes differently.
        """
        return tokenize(path, self.fo, self.protocol)

    def _all_dirnames(self, paths):
        """Returns *all* directory names for each path in paths, including intermediate
        ones.

        Parameters
        ----------
        paths: Iterable of path strings

        Returns
        -------
        set of ancestor directory names, excluding the root marker.
        """
        if len(paths) == 0:
            return set()

        # Direct parents first, then recurse upward to collect every ancestor.
        dirnames = {self._parent(path) for path in paths} - {self.root_marker}
        return dirnames | self._all_dirnames(dirnames)

    def info(self, path, **kwargs):
        """Return the cached entry for *path*, or raise FileNotFoundError."""
        # Make sure the directory cache is populated before looking up.
        self._get_dirs()
        path = self._strip_protocol(path)
        if path in {"", "/"} and self.dir_cache:
            # Synthesize an entry for the archive root itself.
            return {"name": "", "type": "directory", "size": 0}
        if path in self.dir_cache:
            return self.dir_cache[path]
        elif path + "/" in self.dir_cache:
            # Some archive formats store directory keys with a trailing slash.
            return self.dir_cache[path + "/"]
        else:
            raise FileNotFoundError(path)

    def ls(self, path, detail=True, **kwargs):
        """List entries directly under *path*.

        Returns dicts sorted by name when ``detail`` is true, else sorted
        names only.
        """
        self._get_dirs()
        paths = {}
        for p, f in self.dir_cache.items():
            p = p.rstrip("/")
            if "/" in p:
                root = p.rsplit("/", 1)[0]
            else:
                root = ""
            if root == path.rstrip("/"):
                # Direct child of the requested directory: keep its entry.
                paths[p] = f
            elif all(
                (a == b)
                for a, b in zip(path.split("/"), [""] + p.strip("/").split("/"))
            ):
                # root directory entry
                # p lies somewhere below path; synthesize an entry for the
                # implied top-level directory if we have not seen it yet.
                ppath = p.rstrip("/").split("/", 1)[0]
                if ppath not in paths:
                    out = {"name": ppath, "size": 0, "type": "directory"}
                    paths[ppath] = out
        if detail:
            out = sorted(paths.values(), key=operator.itemgetter("name"))
            return out
        else:
            return sorted(paths)
|
.venv/lib/python3.11/site-packages/fsspec/asyn.py
ADDED
|
@@ -0,0 +1,1098 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import asyncio.events
|
| 3 |
+
import functools
|
| 4 |
+
import inspect
|
| 5 |
+
import io
|
| 6 |
+
import numbers
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
import threading
|
| 10 |
+
from contextlib import contextmanager
|
| 11 |
+
from glob import has_magic
|
| 12 |
+
from typing import TYPE_CHECKING, Iterable
|
| 13 |
+
|
| 14 |
+
from .callbacks import DEFAULT_CALLBACK
|
| 15 |
+
from .exceptions import FSTimeoutError
|
| 16 |
+
from .implementations.local import LocalFileSystem, make_path_posix, trailing_sep
|
| 17 |
+
from .spec import AbstractBufferedFile, AbstractFileSystem
|
| 18 |
+
from .utils import glob_translate, is_exception, other_paths
|
| 19 |
+
|
| 20 |
+
private = re.compile("_[^_]")
|
| 21 |
+
iothread = [None] # dedicated fsspec IO thread
|
| 22 |
+
loop = [None] # global event loop for any non-async instance
|
| 23 |
+
_lock = None # global lock placeholder
|
| 24 |
+
get_running_loop = asyncio.get_running_loop
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_lock():
    """Return the module-wide threading lock, creating it on first use.

    Lazy allocation lets each forked process start from a clean lock
    (see ``reset_lock``).
    """
    global _lock
    _lock = _lock or threading.Lock()
    return _lock
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def reset_lock():
    """Clear the global lock plus the cached IO loop and thread.

    Intended to be called only when initialising a forked process, so the
    child allocates its own lock instead of inheriting the parent's.
    """
    global _lock
    _lock = None
    loop[0] = None
    iothread[0] = None
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
async def _runner(event, coro, result, timeout=None):
|
| 52 |
+
timeout = timeout if timeout else None # convert 0 or 0.0 to None
|
| 53 |
+
if timeout is not None:
|
| 54 |
+
coro = asyncio.wait_for(coro, timeout=timeout)
|
| 55 |
+
try:
|
| 56 |
+
result[0] = await coro
|
| 57 |
+
except Exception as ex:
|
| 58 |
+
result[0] = ex
|
| 59 |
+
finally:
|
| 60 |
+
event.set()
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def sync(loop, func, *args, timeout=None, **kwargs):
    """
    Make loop run coroutine until it returns. Runs in other thread

    Examples
    --------
    >>> fsspec.asyn.sync(fsspec.asyn.get_loop(), func, *args,
                         timeout=timeout, **kwargs)
    """
    timeout = timeout if timeout else None  # convert 0 or 0.0 to None
    # NB: if the loop is not running *yet*, it is OK to submit work
    # and we will wait for it
    if loop is None or loop.is_closed():
        raise RuntimeError("Loop is not running")
    # Deadlock guard: calling sync() on the very loop we are currently
    # running inside could never complete, so fail fast instead.
    try:
        loop0 = asyncio.events.get_running_loop()
        if loop0 is loop:
            raise NotImplementedError("Calling sync() from within a running loop")
    except NotImplementedError:
        raise
    except RuntimeError:
        # No loop running in this thread — the normal, allowed case.
        pass
    coro = func(*args, **kwargs)
    result = [None]
    event = threading.Event()
    # _runner stores the outcome in result[0] and sets the event when done.
    asyncio.run_coroutine_threadsafe(_runner(event, coro, result, timeout), loop)
    while True:
        # this loops allows thread to get interrupted
        # (wait in 1s slices so KeyboardInterrupt can be delivered)
        if event.wait(1):
            break
        if timeout is not None:
            timeout -= 1
            if timeout < 0:
                raise FSTimeoutError

    return_result = result[0]
    if isinstance(return_result, asyncio.TimeoutError):
        # suppress asyncio.TimeoutError, raise FSTimeoutError
        raise FSTimeoutError from return_result
    elif isinstance(return_result, BaseException):
        # Re-raise any exception captured by _runner in the caller's thread.
        raise return_result
    else:
        return return_result
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def sync_wrapper(func, obj=None):
    """Wrap an async *func* so it can be called from blocking code.

    Leave ``obj=None`` when wrapping inside a class body (the instance is
    then taken from the first positional argument); pass the instance when
    attaching the wrapper as an attribute of that instance.
    """

    @functools.wraps(func)
    def blocking_call(*args, **kwargs):
        instance = obj or args[0]
        return sync(instance.loop, func, *args, **kwargs)

    return blocking_call
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
@contextmanager
|
| 124 |
+
def _selector_policy():
|
| 125 |
+
original_policy = asyncio.get_event_loop_policy()
|
| 126 |
+
try:
|
| 127 |
+
if os.name == "nt" and hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
|
| 128 |
+
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
| 129 |
+
|
| 130 |
+
yield
|
| 131 |
+
finally:
|
| 132 |
+
asyncio.set_event_loop_policy(original_policy)
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def get_loop():
    """Create or return the default fsspec IO loop

    The loop will be running on a separate thread.
    """
    # Double-checked locking: cheap unlocked test first, then re-test under
    # the lock in case another thread created the loop in between.
    if loop[0] is None:
        with get_lock():
            # repeat the check just in case the loop got filled between the
            # previous two calls from another thread
            if loop[0] is None:
                with _selector_policy():
                    loop[0] = asyncio.new_event_loop()
                th = threading.Thread(target=loop[0].run_forever, name="fsspecIO")
                # Daemon thread so the IO loop never blocks interpreter exit.
                th.daemon = True
                th.start()
                iothread[0] = th
    return loop[0]
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
# ``resource`` is POSIX-only; fall back gracefully on platforms (Windows)
# where it cannot be imported. Type checkers always see the real module.
if TYPE_CHECKING:
    import resource

    ResourceError = resource.error
else:
    try:
        import resource
    except ImportError:
        resource = None
        ResourceError = OSError
    else:
        # Older/newer versions may not expose ``error``; OSError is the base.
        ResourceError = getattr(resource, "error", OSError)
|
| 166 |
+
|
| 167 |
+
_DEFAULT_BATCH_SIZE = 128
|
| 168 |
+
_NOFILES_DEFAULT_BATCH_SIZE = 1280
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def _get_batch_size(nofiles=False):
    """Pick a default concurrency batch size.

    Configuration (``fsspec.config.conf``) wins; otherwise operations that
    touch local files are throttled to 1/8 of the soft open-file limit,
    while no-file operations use a fixed larger default. ``-1`` means
    unthrottled (RLIMIT_NOFILE reported as infinite).
    """
    from fsspec.config import conf

    conf_key = "nofiles_gather_batch_size" if nofiles else "gather_batch_size"
    if conf_key in conf:
        return conf[conf_key]

    if nofiles:
        return _NOFILES_DEFAULT_BATCH_SIZE
    if resource is None:
        return _DEFAULT_BATCH_SIZE

    try:
        soft_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
    except (ImportError, ValueError, ResourceError):
        return _DEFAULT_BATCH_SIZE

    if soft_limit == resource.RLIM_INFINITY:
        return -1
    return soft_limit // 8
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def running_async() -> bool:
    """Return True when called from code executing inside an event loop."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        return False
    return True
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
async def _run_coros_in_chunks(
    coros,
    batch_size=None,
    callback=DEFAULT_CALLBACK,
    timeout=None,
    return_exceptions=False,
    nofiles=False,
):
    """Run the given coroutines in chunks.

    Parameters
    ----------
    coros: list of coroutines to run
    batch_size: int or None
        Number of coroutines to submit/wait on simultaneously.
        If -1, then it will not be any throttling. If
        None, it will be inferred from _get_batch_size()
    callback: fsspec.callbacks.Callback instance
        Gets a relative_update when each coroutine completes
    timeout: number or None
        If given, each coroutine times out after this time. Note that, since
        there are multiple batches, the total run time of this function will in
        general be longer
    return_exceptions: bool
        Same meaning as in asyncio.gather
    nofiles: bool
        If inferring the batch_size, does this operation involve local files?
        If yes, you normally expect smaller batches.

    Returns
    -------
    list of results in the same order as ``coros`` (exceptions included
    when ``return_exceptions`` is true).
    """

    if batch_size is None:
        batch_size = _get_batch_size(nofiles=nofiles)

    if batch_size == -1:
        # -1 disables throttling: run everything at once.
        batch_size = len(coros)

    assert batch_size > 0

    async def _run_coro(coro, i):
        # Wrap each coroutine so it carries its original index; that lets
        # results be stored in input order even though completion order
        # is arbitrary.
        try:
            return await asyncio.wait_for(coro, timeout=timeout), i
        except Exception as e:
            if not return_exceptions:
                raise
            return e, i
        finally:
            # Progress update fires whether the coroutine succeeded or not.
            callback.relative_update(1)

    i = 0
    n = len(coros)
    results = [None] * n
    pending = set()

    # Sliding window: keep up to batch_size tasks in flight, topping the
    # window up as tasks complete.
    while pending or i < n:
        while len(pending) < batch_size and i < n:
            pending.add(asyncio.ensure_future(_run_coro(coros[i], i)))
            i += 1

        if not pending:
            break

        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        while done:
            # Tasks are already finished, so awaiting them just unwraps
            # the (result, index) pair.
            result, k = await done.pop()
            results[k] = result

    return results
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
# these methods should be implemented as async by any async-able backend
|
| 275 |
+
async_methods = [
|
| 276 |
+
"_ls",
|
| 277 |
+
"_cat_file",
|
| 278 |
+
"_get_file",
|
| 279 |
+
"_put_file",
|
| 280 |
+
"_rm_file",
|
| 281 |
+
"_cp_file",
|
| 282 |
+
"_pipe_file",
|
| 283 |
+
"_expand_path",
|
| 284 |
+
"_info",
|
| 285 |
+
"_isfile",
|
| 286 |
+
"_isdir",
|
| 287 |
+
"_exists",
|
| 288 |
+
"_walk",
|
| 289 |
+
"_glob",
|
| 290 |
+
"_find",
|
| 291 |
+
"_du",
|
| 292 |
+
"_size",
|
| 293 |
+
"_mkdir",
|
| 294 |
+
"_makedirs",
|
| 295 |
+
]
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
class AsyncFileSystem(AbstractFileSystem):
|
| 299 |
+
"""Async file operations, default implementations
|
| 300 |
+
|
| 301 |
+
Passes bulk operations to asyncio.gather for concurrent operation.
|
| 302 |
+
|
| 303 |
+
Implementations that have concurrent batch operations and/or async methods
|
| 304 |
+
should inherit from this class instead of AbstractFileSystem. Docstrings are
|
| 305 |
+
copied from the un-underscored method in AbstractFileSystem, if not given.
|
| 306 |
+
"""
|
| 307 |
+
|
| 308 |
+
# note that methods do not have docstring here; they will be copied
|
| 309 |
+
# for _* methods and inferred for overridden methods.
|
| 310 |
+
|
| 311 |
+
async_impl = True
|
| 312 |
+
mirror_sync_methods = True
|
| 313 |
+
disable_throttling = False
|
| 314 |
+
|
| 315 |
+
    def __init__(self, *args, asynchronous=False, loop=None, batch_size=None, **kwargs):
        # asynchronous=True means the caller runs their own event loop and
        # will call the async (underscore-prefixed) methods directly; in
        # that case no dedicated IO loop is attached.
        self.asynchronous = asynchronous
        # Remember the creating process so the `loop` property can refuse
        # use after a fork.
        self._pid = os.getpid()
        if not asynchronous:
            self._loop = loop or get_loop()
        else:
            self._loop = None
        # Max number of coroutines in flight per bulk operation; None means
        # it is inferred later (see _get_batch_size()).
        self.batch_size = batch_size
        super().__init__(*args, **kwargs)
|
| 324 |
+
|
| 325 |
+
    @property
    def loop(self):
        # Event loops and their threads do not survive fork; fail loudly
        # here instead of deadlocking later in the child process.
        if self._pid != os.getpid():
            raise RuntimeError("This class is not fork-safe")
        return self._loop
|
| 330 |
+
|
| 331 |
+
    async def _rm_file(self, path, **kwargs):
        # Abstract: concrete async backends must override this.
        raise NotImplementedError
|
| 333 |
+
|
| 334 |
+
    async def _rm(self, path, recursive=False, batch_size=None, **kwargs):
        """Delete path(s), expanding globs/directories when recursive."""
        # TODO: implement on_error
        batch_size = batch_size or self.batch_size
        path = await self._expand_path(path, recursive=recursive)
        # reversed(): presumably the expanded list is ordered parent-first,
        # so deleting in reverse empties directories before their parents
        # — TODO confirm _expand_path's ordering guarantee.
        return await _run_coros_in_chunks(
            [self._rm_file(p, **kwargs) for p in reversed(path)],
            batch_size=batch_size,
            nofiles=True,
        )
|
| 343 |
+
|
| 344 |
+
    async def _cp_file(self, path1, path2, **kwargs):
        # Abstract: concrete async backends must override this.
        raise NotImplementedError
|
| 346 |
+
|
| 347 |
+
    async def _mv_file(self, path1, path2):
        # Move implemented as copy-then-delete; note this is not atomic.
        await self._cp_file(path1, path2)
        await self._rm_file(path1)
|
| 350 |
+
|
| 351 |
+
    async def _copy(
        self,
        path1,
        path2,
        recursive=False,
        on_error=None,
        maxdepth=None,
        batch_size=None,
        **kwargs,
    ):
        """Copy file(s) from ``path1`` to ``path2`` within this filesystem.

        ``on_error`` defaults to "ignore" for recursive copies (missing
        sources are skipped) and "raise" otherwise.
        """
        if on_error is None and recursive:
            on_error = "ignore"
        elif on_error is None:
            on_error = "raise"

        if isinstance(path1, list) and isinstance(path2, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            paths1 = path1
            paths2 = path2
        else:
            source_is_str = isinstance(path1, str)
            paths1 = await self._expand_path(
                path1, maxdepth=maxdepth, recursive=recursive
            )
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                paths1 = [
                    p for p in paths1 if not (trailing_sep(p) or await self._isdir(p))
                ]
                if not paths1:
                    return

            source_is_file = len(paths1) == 1
            dest_is_dir = isinstance(path2, str) and (
                trailing_sep(path2) or await self._isdir(path2)
            )

            # Decide whether destination names should be nested under
            # path2 (treated as an existing directory) or mirror it.
            exists = source_is_str and (
                (has_magic(path1) and source_is_file)
                or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
            )
            paths2 = other_paths(
                paths1,
                path2,
                exists=exists,
                flatten=not source_is_str,
            )

        batch_size = batch_size or self.batch_size
        coros = [self._cp_file(p1, p2, **kwargs) for p1, p2 in zip(paths1, paths2)]
        result = await _run_coros_in_chunks(
            coros, batch_size=batch_size, return_exceptions=True, nofiles=True
        )

        # With return_exceptions=True all copies are attempted; re-raise
        # the first failure unless it is an ignorable FileNotFoundError.
        for ex in filter(is_exception, result):
            if on_error == "ignore" and isinstance(ex, FileNotFoundError):
                continue
            raise ex
|
| 410 |
+
|
| 411 |
+
    async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
        # Abstract: write *value* (bytes) to *path*; backends must override.
        raise NotImplementedError
|
| 413 |
+
|
| 414 |
+
async def _pipe(self, path, value=None, batch_size=None, **kwargs):
    """Upload one or many bytes payloads concurrently.

    ``path`` may be a single path string (with ``value`` as its contents)
    or a mapping of path -> bytes; each entry is written via
    ``_pipe_file``, batched through ``_run_coros_in_chunks``.
    """
    # Normalise the single-path form into the mapping form.
    targets = {path: value} if isinstance(path, str) else path
    coros = []
    for dest, payload in targets.items():
        coros.append(self._pipe_file(dest, payload, **kwargs))
    return await _run_coros_in_chunks(
        coros,
        batch_size=batch_size or self.batch_size,
        nofiles=True,
    )
|
| 423 |
+
|
| 424 |
+
async def _process_limits(self, url, start, end):
    """Helper for "Range"-based _cat_file

    Converts Python-style ``start``/``end`` offsets (either may be None
    or negative) into an HTTP ``Range`` header value. The file size is
    fetched via ``_info`` only when needed to resolve a negative offset.
    """
    size = None
    suff = False  # True when emitting a suffix-length range ("bytes=-N")
    if start is not None and start < 0:
        # if start is negative and end None, end is the "suffix length"
        if end is None:
            end = -start
            start = ""
            suff = True
        else:
            # negative start with explicit end: resolve against file size
            size = size or (await self._info(url))["size"]
            start = size + start
    elif start is None:
        start = 0
    if not suff:
        if end is not None and end < 0:
            if start is not None:
                size = size or (await self._info(url))["size"]
                end = size + end
        elif end is None:
            # open-ended range: "bytes=start-"
            end = ""
        if isinstance(end, numbers.Integral):
            end -= 1  # bytes range is inclusive
    return f"bytes={start}-{end}"
|
| 449 |
+
|
| 450 |
+
async def _cat_file(self, path, start=None, end=None, **kwargs):
    """Return the bytes of ``path``, optionally limited to [start, end).

    Abstract hook: concrete async filesystems must implement this.
    """
    raise NotImplementedError
|
| 452 |
+
|
| 453 |
+
async def _cat(
    self, path, recursive=False, on_error="raise", batch_size=None, **kwargs
):
    """Fetch contents of one or more paths concurrently.

    Returns raw bytes for a single concrete path, or a dict of
    ``{path: bytes}`` when the input expands to multiple paths (list
    input, glob, or recursive expansion). ``on_error`` controls whether
    per-file exceptions are raised, returned in place, or omitted.
    """
    paths = await self._expand_path(path, recursive=recursive)
    coros = [self._cat_file(path, **kwargs) for path in paths]
    batch_size = batch_size or self.batch_size
    # return_exceptions so a single failure does not abort the batch;
    # failures are inspected below according to on_error
    out = await _run_coros_in_chunks(
        coros, batch_size=batch_size, nofiles=True, return_exceptions=True
    )
    if on_error == "raise":
        ex = next(filter(is_exception, out), False)
        if ex:
            raise ex
    # dict result when input was plural (or expanded to something other
    # than the single literal path); otherwise the single bytes object
    if (
        len(paths) > 1
        or isinstance(path, list)
        or paths[0] != self._strip_protocol(path)
    ):
        return {
            k: v
            for k, v in zip(paths, out)
            if on_error != "omit" or not is_exception(v)
        }
    else:
        return out[0]
|
| 478 |
+
|
| 479 |
+
async def _cat_ranges(
    self,
    paths,
    starts,
    ends,
    max_gap=None,
    batch_size=None,
    on_error="return",
    **kwargs,
):
    """Get the contents of byte ranges from one or more files

    Parameters
    ----------
    paths: list
        A list of filepaths on this filesystems
    starts, ends: int or list
        Bytes limits of the read. If using a single int, the same value will be
        used to read all the specified files.

    Returns
    -------
    List of bytes (or exception instances, since results are gathered
    with ``return_exceptions=True``), one per requested range.
    """
    # TODO: on_error
    if max_gap is not None:
        # use utils.merge_offset_ranges
        raise NotImplementedError
    if not isinstance(paths, list):
        # explicit message instead of a bare TypeError
        raise TypeError(f"paths must be a list, got {type(paths).__name__}")
    # broadcast scalar start/end values across all paths
    if not isinstance(starts, Iterable):
        starts = [starts] * len(paths)
    if not isinstance(ends, Iterable):
        ends = [ends] * len(paths)
    if len(starts) != len(paths) or len(ends) != len(paths):
        raise ValueError(
            "starts and ends must have the same length as paths"
        )
    coros = [
        self._cat_file(p, start=s, end=e, **kwargs)
        for p, s, e in zip(paths, starts, ends)
    ]
    batch_size = batch_size or self.batch_size
    return await _run_coros_in_chunks(
        coros, batch_size=batch_size, nofiles=True, return_exceptions=True
    )
|
| 519 |
+
|
| 520 |
+
async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs):
    """Upload the single local file ``lpath`` to remote ``rpath``.

    Abstract hook: concrete async filesystems must implement this; the
    batch method ``_put`` fans out to it.
    """
    raise NotImplementedError
|
| 522 |
+
|
| 523 |
+
async def _put(
    self,
    lpath,
    rpath,
    recursive=False,
    callback=DEFAULT_CALLBACK,
    batch_size=None,
    maxdepth=None,
    **kwargs,
):
    """Copy file(s) from local.

    Copies a specific file or tree of files (if recursive=True). If rpath
    ends with a "/", it will be assumed to be a directory, and target files
    will go within.

    The put_file method will be called concurrently on a batch of files. The
    batch_size option can configure the amount of futures that can be executed
    at the same time. If it is -1, then all the files will be uploaded concurrently.
    The default can be set for this instance by passing "batch_size" in the
    constructor, or for all instances by setting the "gather_batch_size" key
    in ``fsspec.config.conf``, falling back to 1/8th of the system limit .
    """
    if isinstance(lpath, list) and isinstance(rpath, list):
        # No need to expand paths when both source and destination
        # are provided as lists
        rpaths = rpath
        lpaths = lpath
    else:
        source_is_str = isinstance(lpath, str)
        if source_is_str:
            lpath = make_path_posix(lpath)
        # local side expansion (globs, recursion) is delegated to the
        # local filesystem implementation
        fs = LocalFileSystem()
        lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
        if source_is_str and (not recursive or maxdepth is not None):
            # Non-recursive glob does not copy directories
            lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
            if not lpaths:
                return

        source_is_file = len(lpaths) == 1
        dest_is_dir = isinstance(rpath, str) and (
            trailing_sep(rpath) or await self._isdir(rpath)
        )

        rpath = self._strip_protocol(rpath)
        # whether the destination should be treated as an existing
        # directory when computing target names
        exists = source_is_str and (
            (has_magic(lpath) and source_is_file)
            or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
        )
        rpaths = other_paths(
            lpaths,
            rpath,
            exists=exists,
            flatten=not source_is_str,
        )

    # split into directories (created remotely) and real file transfers
    is_dir = {l: os.path.isdir(l) for l in lpaths}
    rdirs = [r for l, r in zip(lpaths, rpaths) if is_dir[l]]
    file_pairs = [(l, r) for l, r in zip(lpaths, rpaths) if not is_dir[l]]

    await asyncio.gather(*[self._makedirs(d, exist_ok=True) for d in rdirs])
    batch_size = batch_size or self.batch_size

    coros = []
    callback.set_size(len(file_pairs))
    for lfile, rfile in file_pairs:
        # branch_coro wires per-file progress into the parent callback
        put_file = callback.branch_coro(self._put_file)
        coros.append(put_file(lfile, rfile, **kwargs))

    return await _run_coros_in_chunks(
        coros, batch_size=batch_size, callback=callback
    )
|
| 596 |
+
|
| 597 |
+
async def _get_file(self, rpath, lpath, **kwargs):
    """Download the single remote file ``rpath`` to local ``lpath``.

    Abstract hook: concrete async filesystems must implement this; the
    batch method ``_get`` fans out to it.
    """
    raise NotImplementedError
|
| 599 |
+
|
| 600 |
+
async def _get(
    self,
    rpath,
    lpath,
    recursive=False,
    callback=DEFAULT_CALLBACK,
    maxdepth=None,
    **kwargs,
):
    """Copy file(s) to local.

    Copies a specific file or tree of files (if recursive=True). If lpath
    ends with a "/", it will be assumed to be a directory, and target files
    will go within. Can submit a list of paths, which may be glob-patterns
    and will be expanded.

    The get_file method will be called concurrently on a batch of files. The
    batch_size option can configure the amount of futures that can be executed
    at the same time. If it is -1, then all the files will be downloaded concurrently.
    The default can be set for this instance by passing "batch_size" in the
    constructor, or for all instances by setting the "gather_batch_size" key
    in ``fsspec.config.conf``, falling back to 1/8th of the system limit .
    """
    if isinstance(lpath, list) and isinstance(rpath, list):
        # No need to expand paths when both source and destination
        # are provided as lists
        rpaths = rpath
        lpaths = lpath
    else:
        source_is_str = isinstance(rpath, str)
        # First check for rpath trailing slash as _strip_protocol removes it.
        source_not_trailing_sep = source_is_str and not trailing_sep(rpath)
        rpath = self._strip_protocol(rpath)
        rpaths = await self._expand_path(
            rpath, recursive=recursive, maxdepth=maxdepth
        )
        if source_is_str and (not recursive or maxdepth is not None):
            # Non-recursive glob does not copy directories
            rpaths = [
                p for p in rpaths if not (trailing_sep(p) or await self._isdir(p))
            ]
            if not rpaths:
                return

        lpath = make_path_posix(lpath)
        source_is_file = len(rpaths) == 1
        dest_is_dir = isinstance(lpath, str) and (
            trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
        )

        # whether the local destination should be treated as an existing
        # directory when computing target names
        exists = source_is_str and (
            (has_magic(rpath) and source_is_file)
            or (not has_magic(rpath) and dest_is_dir and source_not_trailing_sep)
        )
        lpaths = other_paths(
            rpaths,
            lpath,
            exists=exists,
            flatten=not source_is_str,
        )

    # ensure local parent directories exist before transfers start
    [os.makedirs(os.path.dirname(lp), exist_ok=True) for lp in lpaths]
    batch_size = kwargs.pop("batch_size", self.batch_size)

    coros = []
    callback.set_size(len(lpaths))
    for lpath, rpath in zip(lpaths, rpaths):
        # branch_coro wires per-file progress into the parent callback
        get_file = callback.branch_coro(self._get_file)
        coros.append(get_file(rpath, lpath, **kwargs))
    return await _run_coros_in_chunks(
        coros, batch_size=batch_size, callback=callback
    )
|
| 672 |
+
|
| 673 |
+
async def _isfile(self, path):
    """Whether ``path`` exists and refers to a regular file.

    Any failure to stat the path (missing file, permission error, etc.)
    is reported as False rather than propagating.
    """
    try:
        return (await self._info(path))["type"] == "file"
    except Exception:
        # Narrowed from a bare ``except:`` so BaseException subclasses
        # such as asyncio.CancelledError still propagate and task
        # cancellation is not silently swallowed inside a coroutine.
        return False
|
| 678 |
+
|
| 679 |
+
async def _isdir(self, path):
    """Whether ``path`` exists and refers to a directory."""
    try:
        info = await self._info(path)
    except OSError:
        # missing or inaccessible path counts as "not a directory"
        return False
    return info["type"] == "directory"
|
| 684 |
+
|
| 685 |
+
async def _size(self, path):
    """Size of ``path`` in bytes, or None if the backend reports none."""
    info = await self._info(path)
    return info.get("size", None)
|
| 687 |
+
|
| 688 |
+
async def _sizes(self, paths, batch_size=None):
    """Sizes of many paths, fetched concurrently in batches."""
    coros = [self._size(p) for p in paths]
    return await _run_coros_in_chunks(
        coros, batch_size=batch_size or self.batch_size
    )
|
| 693 |
+
|
| 694 |
+
async def _exists(self, path, **kwargs):
    """Whether ``path`` exists on this filesystem."""
    try:
        # success of _info is the existence test
        await self._info(path, **kwargs)
    except FileNotFoundError:
        return False
    return True
|
| 700 |
+
|
| 701 |
+
async def _info(self, path, **kwargs):
    """Return a details dict (at least "name", "size", "type") for ``path``.

    Abstract hook: concrete async filesystems must implement this; many
    derived helpers (_exists, _isdir, _size, ...) are built on it.
    """
    raise NotImplementedError
|
| 703 |
+
|
| 704 |
+
async def _ls(self, path, detail=True, **kwargs):
    """List entries under ``path`` (detail dicts or plain names).

    Abstract hook: concrete async filesystems must implement this.
    """
    raise NotImplementedError
|
| 706 |
+
|
| 707 |
+
async def _walk(self, path, maxdepth=None, on_error="omit", **kwargs):
    """Async generator yielding ``(path, dirs, files)`` per directory.

    Like ``os.walk``. ``on_error`` may be "omit" (yield empty results),
    "raise", or a callable invoked with the listing exception. With
    ``detail=True`` in kwargs, dirs/files are dicts of info; otherwise
    lists of names.
    """
    if maxdepth is not None and maxdepth < 1:
        raise ValueError("maxdepth must be at least 1")

    path = self._strip_protocol(path)
    full_dirs = {}  # short name -> full path, for recursion below
    dirs = {}
    files = {}

    detail = kwargs.pop("detail", False)
    try:
        listing = await self._ls(path, detail=True, **kwargs)
    except (FileNotFoundError, OSError) as e:
        if on_error == "raise":
            raise
        elif callable(on_error):
            on_error(e)
        # report the path with empty contents, then stop
        if detail:
            yield path, {}, {}
        else:
            yield path, [], []
        return

    for info in listing:
        # each info name must be at least [path]/part , but here
        # we check also for names like [path]/part/
        pathname = info["name"].rstrip("/")
        name = pathname.rsplit("/", 1)[-1]
        if info["type"] == "directory" and pathname != path:
            # do not include "self" path
            full_dirs[name] = pathname
            dirs[name] = info
        elif pathname == path:
            # file-like with same name as give path
            files[""] = info
        else:
            files[name] = info

    if detail:
        yield path, dirs, files
    else:
        yield path, list(dirs), list(files)

    if maxdepth is not None:
        maxdepth -= 1
        if maxdepth < 1:
            # depth budget exhausted; do not recurse further
            return

    for d in dirs:
        async for _ in self._walk(
            full_dirs[d], maxdepth=maxdepth, detail=detail, **kwargs
        ):
            yield _
|
| 760 |
+
|
| 761 |
+
async def _glob(self, path, maxdepth=None, **kwargs):
    """Find paths matching a glob pattern.

    Supports ``*``, ``?``, ``[...]`` and ``**``. Returns a sorted list
    of paths, or (with ``detail=True`` in kwargs) a dict of
    ``{path: info}``. A pattern with no magic is just an existence check.
    """
    if maxdepth is not None and maxdepth < 1:
        raise ValueError("maxdepth must be at least 1")

    import re

    seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
    ends_with_sep = path.endswith(seps)  # _strip_protocol strips trailing slash
    path = self._strip_protocol(path)
    # directories get a trailing "/" appended before matching when the
    # pattern targets directories (trailing sep or ".../**")
    append_slash_to_dirname = ends_with_sep or path.endswith(
        tuple(sep + "**" for sep in seps)
    )
    # position of the first glob special character (or end of string)
    idx_star = path.find("*") if path.find("*") >= 0 else len(path)
    idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
    idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

    min_idx = min(idx_star, idx_qmark, idx_brace)

    detail = kwargs.pop("detail", False)

    if not has_magic(path):
        # plain path: existence check only
        if await self._exists(path, **kwargs):
            if not detail:
                return [path]
            else:
                return {path: await self._info(path, **kwargs)}
        else:
            if not detail:
                return []  # glob of non-existent returns empty
            else:
                return {}
    elif "/" in path[:min_idx]:
        # split into the literal root before any magic and the depth of
        # the remaining pattern
        min_idx = path[:min_idx].rindex("/")
        root = path[: min_idx + 1]
        depth = path[min_idx + 1 :].count("/") + 1
    else:
        root = ""
        depth = path[min_idx + 1 :].count("/") + 1

    if "**" in path:
        if maxdepth is not None:
            # "**" consumes arbitrary depth; re-budget using maxdepth
            idx_double_stars = path.find("**")
            depth_double_stars = path[idx_double_stars:].count("/") + 1
            depth = depth - depth_double_stars + maxdepth
        else:
            depth = None

    allpaths = await self._find(
        root, maxdepth=depth, withdirs=True, detail=True, **kwargs
    )

    pattern = glob_translate(path + ("/" if ends_with_sep else ""))
    pattern = re.compile(pattern)

    out = {
        p: info
        for p, info in sorted(allpaths.items())
        if pattern.match(
            p + "/"
            if append_slash_to_dirname and info["type"] == "directory"
            else p
        )
    }

    if detail:
        return out
    else:
        return list(out)
|
| 829 |
+
|
| 830 |
+
async def _du(self, path, total=True, maxdepth=None, **kwargs):
    """Disk usage under ``path``.

    Returns the grand total in bytes when ``total`` is True, otherwise a
    mapping of each file's name to its size.
    """
    per_file = {}
    # async for?
    for name in await self._find(path, maxdepth=maxdepth, **kwargs):
        info = await self._info(name)
        per_file[info["name"]] = info["size"]
    return sum(per_file.values()) if total else per_file
|
| 840 |
+
|
| 841 |
+
async def _find(self, path, maxdepth=None, withdirs=False, **kwargs):
    """List all files below ``path``, sorted.

    With ``withdirs=True`` directories are included too; with
    ``detail=True`` in kwargs a ``{name: info}`` dict is returned
    instead of a list of names.
    """
    path = self._strip_protocol(path)
    out = {}
    detail = kwargs.pop("detail", False)

    # Add the root directory if withdirs is requested
    # This is needed for posix glob compliance
    if withdirs and path != "" and await self._isdir(path):
        out[path] = await self._info(path)

    # async for?
    async for _, dirs, files in self._walk(path, maxdepth, detail=True, **kwargs):
        if withdirs:
            files.update(dirs)
        out.update({info["name"]: info for name, info in files.items()})
    if not out and (await self._isfile(path)):
        # walk works on directories, but find should also return [path]
        # when path happens to be a file
        out[path] = {}
    names = sorted(out)
    if not detail:
        return names
    else:
        return {name: out[name] for name in names}
|
| 865 |
+
|
| 866 |
+
async def _expand_path(self, path, recursive=False, maxdepth=None):
    """Expand path(s) — globs and, optionally, recursion — to a sorted
    list of concrete paths.

    Raises FileNotFoundError when nothing matches.
    """
    if maxdepth is not None and maxdepth < 1:
        raise ValueError("maxdepth must be at least 1")

    if isinstance(path, str):
        # normalise to the list form and recurse once
        out = await self._expand_path([path], recursive, maxdepth)
    else:
        out = set()
        path = [self._strip_protocol(p) for p in path]
        for p in path:  # can gather here
            if has_magic(p):
                bit = set(await self._glob(p, maxdepth=maxdepth))
                out |= bit
                if recursive:
                    # glob call above expanded one depth so if maxdepth is defined
                    # then decrement it in expand_path call below. If it is zero
                    # after decrementing then avoid expand_path call.
                    if maxdepth is not None and maxdepth <= 1:
                        continue
                    out |= set(
                        await self._expand_path(
                            list(bit),
                            recursive=recursive,
                            maxdepth=maxdepth - 1 if maxdepth is not None else None,
                        )
                    )
                continue
            elif recursive:
                rec = set(await self._find(p, maxdepth=maxdepth, withdirs=True))
                out |= rec
            if p not in out and (recursive is False or (await self._exists(p))):
                # should only check once, for the root
                out.add(p)
    if not out:
        raise FileNotFoundError(path)
    return sorted(out)
|
| 902 |
+
|
| 903 |
+
async def _mkdir(self, path, create_parents=True, **kwargs):
    """Create a directory; default is a no-op for flat stores."""
    pass  # not necessary to implement, may not have directories
|
| 905 |
+
|
| 906 |
+
async def _makedirs(self, path, exist_ok=False):
    """Recursively create directories; default is a no-op for flat stores."""
    pass  # not necessary to implement, may not have directories
|
| 908 |
+
|
| 909 |
+
async def open_async(self, path, mode="rb", **kwargs):
    """Return an async file-like object for ``path``.

    Only raw binary mode is supported here; concrete implementations
    must override to return an AbstractAsyncStreamedFile.

    Raises
    ------
    ValueError
        If ``mode`` is not binary or compression was requested.
    NotImplementedError
        Always, in this base implementation.
    """
    if "b" not in mode or kwargs.get("compression"):
        # explicit message instead of a bare ValueError
        raise ValueError(
            "open_async only supports binary mode without compression"
        )
    raise NotImplementedError
|
| 913 |
+
|
| 914 |
+
|
| 915 |
+
def mirror_sync_methods(obj):
    """Populate sync and async methods for obj

    For each method will create a sync version if the name refers to an async method
    (coroutine) and there is no override in the child class; will create an async
    method for the corresponding sync method if there is no implementation.

    Uses the methods specified in
    - async_methods: the set that an implementation is expected to provide
    - default_async_methods: that can be derived from their sync version in
      AbstractFileSystem
    - AsyncFileSystem: async-specific default coroutines
    """
    # deferred import to avoid a circular dependency at module load
    from fsspec import AbstractFileSystem

    for method in async_methods + dir(AsyncFileSystem):
        if not method.startswith("_"):
            continue
        smethod = method[1:]  # sync name: "_cat" -> "cat"
        if private.match(method):
            isco = inspect.iscoroutinefunction(getattr(obj, method, None))
            # the sync method's underlying function, if it is bound
            unsync = getattr(getattr(obj, smethod, False), "__func__", None)
            # only mirror when the sync side is still the AbstractFileSystem
            # default, i.e. the subclass did not override it
            is_default = unsync is getattr(AbstractFileSystem, smethod, "")
            if isco and is_default:
                mth = sync_wrapper(getattr(obj, method), obj=obj)
                setattr(obj, smethod, mth)
                if not mth.__doc__:
                    # inherit the docstring from the sync default
                    mth.__doc__ = getattr(
                        getattr(AbstractFileSystem, smethod, None), "__doc__", ""
                    )
|
| 945 |
+
|
| 946 |
+
|
| 947 |
+
class FSSpecCoroutineCancel(Exception):
    """Exception thrown into running fsspec coroutines to force-cancel them.

    Used by ``_dump_running_tasks`` below.
    """

    pass
|
| 949 |
+
|
| 950 |
+
|
| 951 |
+
def _dump_running_tasks(
    printout=True, cancel=True, exc=FSSpecCoroutineCancel, with_task=False
):
    """Debug helper: describe (and optionally cancel) unfinished tasks on
    the dedicated fsspec event loop.

    NOTE(review): relies on private asyncio internals (``task._coro``,
    ``task._callbacks``) and on the module-level ``loop`` holder; for
    debugging only.

    Parameters
    ----------
    printout: bool
        Print each task's stack to stderr/stdout.
    cancel: bool
        Forcibly cancel each task, throwing ``exc`` into its coroutine.
    exc: Exception class
        The exception used for forced cancellation.
    with_task: bool
        Include the task object itself in each returned dict.
    """
    import traceback

    tasks = [t for t in asyncio.tasks.all_tasks(loop[0]) if not t.done()]
    if printout:
        [task.print_stack() for task in tasks]
    # snapshot frame info before any cancellation mutates the coroutines
    out = [
        {
            "locals": task._coro.cr_frame.f_locals,
            "file": task._coro.cr_frame.f_code.co_filename,
            "firstline": task._coro.cr_frame.f_code.co_firstlineno,
            "linelo": task._coro.cr_frame.f_lineno,
            "stack": traceback.format_stack(task._coro.cr_frame),
            "task": task if with_task else None,
        }
        for task in tasks
    ]
    if cancel:
        for t in tasks:
            cbs = t._callbacks
            t.cancel()
            asyncio.futures.Future.set_exception(t, exc)
            asyncio.futures.Future.cancel(t)
            [cb[0](t) for cb in cbs]  # cancels any dependent concurrent.futures
            try:
                t._coro.throw(exc)  # exits coro, unless explicitly handled
            except exc:
                pass
    return out
|
| 982 |
+
|
| 983 |
+
|
| 984 |
+
class AbstractAsyncStreamedFile(AbstractBufferedFile):
    """Async counterpart of AbstractBufferedFile: awaitable read/write/close.

    Subclasses implement ``_fetch_range``, ``_initiate_upload`` and
    ``_upload_chunk``.
    """

    # no read buffering, and always auto-commit
    # TODO: readahead might still be useful here, but needs async version

    async def read(self, length=-1):
        """
        Return data from cache, or fetch pieces as necessary

        Parameters
        ----------
        length: int (-1)
            Number of bytes to read; if <0, all remaining bytes.
        """
        length = -1 if length is None else int(length)
        if self.mode != "rb":
            raise ValueError("File not in read mode")
        if length < 0:
            length = self.size - self.loc
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if length == 0:
            # don't even bother calling fetch
            return b""
        out = await self._fetch_range(self.loc, self.loc + length)
        # advance by what was actually returned, which may be short
        self.loc += len(out)
        return out

    async def write(self, data):
        """
        Write data to buffer.

        Buffer only sent on flush() or if buffer is greater than
        or equal to blocksize.

        Parameters
        ----------
        data: bytes
            Set of bytes to be written.
        """
        if self.mode not in {"wb", "ab"}:
            raise ValueError("File not in write mode")
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if self.forced:
            raise ValueError("This file has been force-flushed, can only close")
        out = self.buffer.write(data)
        self.loc += out
        if self.buffer.tell() >= self.blocksize:
            # buffer full: push a chunk upstream
            await self.flush()
        return out

    async def close(self):
        """Close file

        Finalizes writes, discards cache
        """
        if getattr(self, "_unclosable", False):
            return
        if self.closed:
            return
        if self.mode == "rb":
            self.cache = None
        else:
            if not self.forced:
                # final flush commits any remaining buffered data
                await self.flush(force=True)

            if self.fs is not None:
                # listings for this file and its parent are now stale
                self.fs.invalidate_cache(self.path)
                self.fs.invalidate_cache(self.fs._parent(self.path))

        self.closed = True

    async def flush(self, force=False):
        """Push buffered data upstream.

        With ``force=True``, finalizes the upload (can only happen once);
        otherwise a no-op while the buffer is smaller than blocksize.
        """
        if self.closed:
            raise ValueError("Flush on closed file")
        if force and self.forced:
            raise ValueError("Force flush cannot be called more than once")
        if force:
            self.forced = True

        if self.mode not in {"wb", "ab"}:
            # no-op to flush on read-mode
            return

        if not force and self.buffer.tell() < self.blocksize:
            # Defer write on small block
            return

        if self.offset is None:
            # Initialize a multipart upload
            self.offset = 0
            try:
                await self._initiate_upload()
            except:
                self.closed = True
                raise

        # _upload_chunk may return False to signal "keep buffering"
        if await self._upload_chunk(final=force) is not False:
            self.offset += self.buffer.seek(0, 2)
            self.buffer = io.BytesIO()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def _fetch_range(self, start, end):
        """Fetch bytes [start, end) from the backend; subclasses implement."""
        raise NotImplementedError

    async def _initiate_upload(self):
        """Hook called once before the first chunk upload; default no-op."""
        pass

    async def _upload_chunk(self, final=False):
        """Upload the current buffer; subclasses implement."""
        raise NotImplementedError
|
.venv/lib/python3.11/site-packages/fsspec/caching.py
ADDED
|
@@ -0,0 +1,966 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import collections
|
| 4 |
+
import functools
|
| 5 |
+
import logging
|
| 6 |
+
import math
|
| 7 |
+
import os
|
| 8 |
+
import threading
|
| 9 |
+
import warnings
|
| 10 |
+
from concurrent.futures import Future, ThreadPoolExecutor
|
| 11 |
+
from itertools import groupby
|
| 12 |
+
from operator import itemgetter
|
| 13 |
+
from typing import (
|
| 14 |
+
TYPE_CHECKING,
|
| 15 |
+
Any,
|
| 16 |
+
Callable,
|
| 17 |
+
ClassVar,
|
| 18 |
+
Generic,
|
| 19 |
+
NamedTuple,
|
| 20 |
+
Optional,
|
| 21 |
+
OrderedDict,
|
| 22 |
+
TypeVar,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
if TYPE_CHECKING:
|
| 26 |
+
import mmap
|
| 27 |
+
|
| 28 |
+
from typing_extensions import ParamSpec
|
| 29 |
+
|
| 30 |
+
P = ParamSpec("P")
|
| 31 |
+
else:
|
| 32 |
+
P = TypeVar("P")
|
| 33 |
+
|
| 34 |
+
T = TypeVar("T")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
logger = logging.getLogger("fsspec")
|
| 38 |
+
|
| 39 |
+
Fetcher = Callable[[int, int], bytes] # Maps (start, end) to bytes
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class BaseCache:
    """Pass-through cache: keeps nothing, delegates every request.

    Serves as the base class for the other cache implementations.

    Parameters
    ----------
    blocksize: int
        How far to read ahead in numbers of bytes
    fetcher: func
        Function of the form f(start, end) which gets bytes from remote as
        specified
    size: int
        How big this file is
    """

    name: ClassVar[str] = "none"

    def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
        self.blocksize = blocksize
        self.nblocks = 0
        self.fetcher = fetcher
        self.size = size
        # hit/miss counters maintained by subclasses
        self.hit_count = 0
        self.miss_count = 0
        # the bytes that we actually requested
        self.total_requested_bytes = 0

    def _fetch(self, start: int | None, stop: int | None) -> bytes:
        """Return bytes [start, stop); ``None`` means start/end of file."""
        start = 0 if start is None else start
        stop = self.size if stop is None else stop
        # out-of-range or empty request: nothing to do
        if start >= self.size or start >= stop:
            return b""
        return self.fetcher(start, stop)

    def _reset_stats(self) -> None:
        """Reset hit and miss counts for a more granular report, e.g. by file."""
        self.hit_count = 0
        self.miss_count = 0
        self.total_requested_bytes = 0

    def _log_stats(self) -> str:
        """Return a formatted string of the cache statistics."""
        if self.hit_count == 0 and self.miss_count == 0:
            # a cache that does nothing, this is for logs only
            return ""
        return f" , {self.name}: {self.hit_count} hits, {self.miss_count} misses, {self.total_requested_bytes} total requested bytes"

    def __repr__(self) -> str:
        # TODO: use rich for better formatting
        return f"""
        <{self.__class__.__name__}:
            block size  :   {self.blocksize}
            block count :   {self.nblocks}
            file size   :   {self.size}
            cache hits  :   {self.hit_count}
            cache misses:   {self.miss_count}
            total requested bytes: {self.total_requested_bytes}>
        """
| 103 |
+
|
| 104 |
+
|
| 105 |
+
class MMapCache(BaseCache):
    """memory-mapped sparse file cache

    Opens temporary file, which is filled blocks-wise when data is requested.
    Ensure there is enough disc space in the temporary location.

    This cache method might only work on posix
    """

    name = "mmap"

    def __init__(
        self,
        blocksize: int,
        fetcher: Fetcher,
        size: int,
        location: str | None = None,
        blocks: set[int] | None = None,
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        # set of block numbers already written into the backing file
        self.blocks = set() if blocks is None else blocks
        # optional path of the backing file; None -> anonymous temp file
        self.location = location
        self.cache = self._makefile()

    def _makefile(self) -> mmap.mmap | bytearray:
        """Create (or reopen) the file-backed buffer and mmap it.

        Returns a plain ``bytearray`` for zero-sized files, since mmap of
        length 0 is not useful here.
        """
        import mmap
        import tempfile

        if self.size == 0:
            return bytearray()

        # posix version
        if self.location is None or not os.path.exists(self.location):
            if self.location is None:
                # anonymous temp file: nothing persisted, so no blocks are valid
                fd = tempfile.TemporaryFile()
                self.blocks = set()
            else:
                fd = open(self.location, "wb+")
            # extend the file to full size by writing one byte at the end
            fd.seek(self.size - 1)
            fd.write(b"1")
            fd.flush()
        else:
            # reuse an existing backing file (blocks passed in tell us what's valid)
            fd = open(self.location, "r+b")

        # NOTE(review): ``fd`` is not stored; mmap appears to keep the mapping
        # alive after the file object is garbage-collected — confirm on all platforms
        return mmap.mmap(fd.fileno(), self.size)

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Return bytes [start, end), fetching any missing blocks into the mmap."""
        logger.debug(f"MMap cache fetching {start}-{end}")
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""
        start_block = start // self.blocksize
        end_block = end // self.blocksize
        block_range = range(start_block, end_block + 1)
        # Determine which blocks need to be fetched. This sequence is sorted by construction.
        need = (i for i in block_range if i not in self.blocks)
        # Count the number of blocks already cached
        self.hit_count += sum(1 for i in block_range if i in self.blocks)

        # Consolidate needed blocks.
        # Algorithm adapted from Python 2.x itertools documentation.
        # We are grouping an enumerated sequence of blocks. By comparing when the difference
        # between an ascending range (provided by enumerate) and the needed block numbers
        # we can detect when the block number skips values. The key computes this difference.
        # Whenever the difference changes, we know that we have previously cached block(s),
        # and a new group is started. In other words, this algorithm neatly groups
        # runs of consecutive block numbers so they can be fetched together.
        for _, _blocks in groupby(enumerate(need), key=lambda x: x[0] - x[1]):
            # Extract the blocks from the enumerated sequence
            _blocks = tuple(map(itemgetter(1), _blocks))
            # Compute start of first block
            sstart = _blocks[0] * self.blocksize
            # Compute the end of the last block. Last block may not be full size.
            send = min(_blocks[-1] * self.blocksize + self.blocksize, self.size)

            # Fetch bytes (could be multiple consecutive blocks)
            self.total_requested_bytes += send - sstart
            logger.debug(
                f"MMap get blocks {_blocks[0]}-{_blocks[-1]} ({sstart}-{send})"
            )
            self.cache[sstart:send] = self.fetcher(sstart, send)

            # Update set of cached blocks
            self.blocks.update(_blocks)
            # Update cache statistics with number of blocks we had to cache
            self.miss_count += len(_blocks)

        return self.cache[start:end]

    def __getstate__(self) -> dict[str, Any]:
        """Drop the mmap object for pickling; it is rebuilt on unpickle."""
        state = self.__dict__.copy()
        # Remove the unpicklable entries.
        del state["cache"]
        return state

    def __setstate__(self, state: dict[str, Any]) -> None:
        # Restore instance attributes
        self.__dict__.update(state)
        # Re-create the mmap from ``location``/``blocks`` carried in the state
        self.cache = self._makefile()
| 207 |
+
|
| 208 |
+
|
| 209 |
+
class ReadAheadCache(BaseCache):
    """Cache which reads only when we get beyond a block of data

    This is a much simpler version of BytesCache, and does not attempt to
    fill holes in the cache or keep fragments alive. It is best suited to
    many small reads in a sequential order (e.g., reading lines from a file).
    """

    name = "readahead"

    def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
        super().__init__(blocksize, fetcher, size)
        # the single buffered window and the file offsets it covers
        self.cache = b""
        self.start = 0
        self.end = 0

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Return bytes [start, end), reading ahead by one blocksize on a miss."""
        start = 0 if start is None else start
        if end is None or end > self.size:
            end = self.size
        if start >= self.size or start >= end:
            return b""

        remaining = end - start

        if self.start <= start and end <= self.end:
            # full hit: the buffered window covers the whole request
            self.hit_count += 1
            offset = start - self.start
            return self.cache[offset : offset + remaining]

        if self.start <= start < self.end:
            # partial hit: reuse the tail of the buffer, fetch the rest
            self.miss_count += 1
            prefix = self.cache[start - self.start :]
            remaining -= len(prefix)
            start = self.end
        else:
            # complete miss: the buffer is of no use
            self.miss_count += 1
            prefix = b""

        # fetch the requested span plus one blocksize of read-ahead
        end = min(self.size, end + self.blocksize)
        self.total_requested_bytes += end - start
        self.cache = self.fetcher(start, end)  # new block replaces old
        self.start = start
        self.end = self.start + len(self.cache)
        return prefix + self.cache[:remaining]
| 253 |
+
|
| 254 |
+
|
| 255 |
+
class FirstChunkCache(BaseCache):
    """Caches the first block of a file only

    This may be useful for file types where the metadata is stored in the header,
    but is randomly accessed.
    """

    name = "first"

    def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
        if blocksize > size:
            # this will buffer the whole thing
            blocksize = size
        super().__init__(blocksize, fetcher, size)
        # lazily-populated copy of bytes [0, blocksize)
        self.cache: bytes | None = None

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Return bytes [start, end).

        ``start``/``end`` of ``None`` mean "from the beginning" / "to the end
        of the file", matching the other cache implementations.
        """
        start = start or 0
        if start > self.size:
            logger.debug("FirstChunkCache: requested start > file size")
            return b""

        # BUGFIX: ``end`` may legitimately be None (read to EOF) as in every
        # other cache class; ``min(None, self.size)`` raised TypeError.
        # Substitute the file size before clamping.
        end = self.size if end is None else min(end, self.size)

        if start < self.blocksize:
            # request overlaps the cached header region
            if self.cache is None:
                self.miss_count += 1
                if end > self.blocksize:
                    # one fetch serves both the request and the header cache
                    self.total_requested_bytes += end
                    data = self.fetcher(0, end)
                    self.cache = data[: self.blocksize]
                    return data[start:]
                self.cache = self.fetcher(0, self.blocksize)
                self.total_requested_bytes += self.blocksize
            part = self.cache[start:end]
            if end > self.blocksize:
                # tail beyond the cached header is fetched fresh every time
                self.total_requested_bytes += end - self.blocksize
                part += self.fetcher(self.blocksize, end)
            self.hit_count += 1
            return part
        else:
            # entirely outside the cached header: pass through
            self.miss_count += 1
            self.total_requested_bytes += end - start
            return self.fetcher(start, end)
| 299 |
+
|
| 300 |
+
|
| 301 |
+
class BlockCache(BaseCache):
    """
    Cache holding memory as a set of blocks.

    Requests are only ever made ``blocksize`` at a time, and are
    stored in an LRU cache. The least recently accessed block is
    discarded when more than ``maxblocks`` are stored.

    Parameters
    ----------
    blocksize : int
        The number of bytes to store in each block.
        Requests are only ever made for ``blocksize``, so this
        should balance the overhead of making a request against
        the granularity of the blocks.
    fetcher : Callable
        Function of the form f(start, end) which gets bytes from remote as
        specified.
    size : int
        The total size of the file being cached.
    maxblocks : int
        The maximum number of blocks to cache for. The maximum memory
        use for this cache is then ``blocksize * maxblocks``.
    """

    name = "blockcache"

    def __init__(
        self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        self.nblocks = math.ceil(size / blocksize)
        self.maxblocks = maxblocks
        # per-instance LRU over block numbers; not picklable (see __getstate__)
        self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block)

    def cache_info(self):
        """
        The statistics on the block cache.

        Returns
        -------
        NamedTuple
            Returned directly from the LRU Cache used internally.
        """
        return self._fetch_block_cached.cache_info()

    def __getstate__(self) -> dict[str, Any]:
        # BUGFIX: copy before deleting the unpicklable lru_cache wrapper.
        # Previously this deleted straight from ``self.__dict__``, so merely
        # pickling the cache removed ``_fetch_block_cached`` from the *live*
        # instance and broke it. MMapCache.__getstate__ already copies; this
        # makes the two consistent.
        state = self.__dict__.copy()
        del state["_fetch_block_cached"]
        return state

    def __setstate__(self, state: dict[str, Any]) -> None:
        self.__dict__.update(state)
        # rebuild the lru_cache wrapper dropped by __getstate__
        self._fetch_block_cached = functools.lru_cache(state["maxblocks"])(
            self._fetch_block
        )

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Return bytes [start, end); ``None`` means start/end of file."""
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""

        # byte position -> block numbers
        start_block_number = start // self.blocksize
        end_block_number = end // self.blocksize

        # these are cached, so safe to do multiple calls for the same start and end.
        for block_number in range(start_block_number, end_block_number + 1):
            self._fetch_block_cached(block_number)

        return self._read_cache(
            start,
            end,
            start_block_number=start_block_number,
            end_block_number=end_block_number,
        )

    def _fetch_block(self, block_number: int) -> bytes:
        """
        Fetch the block of data for `block_number`.

        Raises
        ------
        ValueError
            If ``block_number`` exceeds the number of blocks in the file.
        """
        if block_number > self.nblocks:
            raise ValueError(
                f"'block_number={block_number}' is greater than "
                f"the number of blocks ({self.nblocks})"
            )

        start = block_number * self.blocksize
        end = start + self.blocksize
        self.total_requested_bytes += end - start
        self.miss_count += 1
        logger.info("BlockCache fetching block %d", block_number)
        block_contents = super()._fetch(start, end)
        return block_contents

    def _read_cache(
        self, start: int, end: int, start_block_number: int, end_block_number: int
    ) -> bytes:
        """
        Read from our block cache.

        Parameters
        ----------
        start, end : int
            The start and end byte positions.
        start_block_number, end_block_number : int
            The start and end block numbers.
        """
        start_pos = start % self.blocksize
        end_pos = end % self.blocksize

        self.hit_count += 1
        if start_block_number == end_block_number:
            # request falls within a single block
            block: bytes = self._fetch_block_cached(start_block_number)
            return block[start_pos:end_pos]

        else:
            # read from the initial
            out = [self._fetch_block_cached(start_block_number)[start_pos:]]

            # intermediate blocks
            # Note: it'd be nice to combine these into one big request. However
            # that doesn't play nicely with our LRU cache.
            out.extend(
                map(
                    self._fetch_block_cached,
                    range(start_block_number + 1, end_block_number),
                )
            )

            # final block (end_pos may be 0, giving an empty tail slice)
            out.append(self._fetch_block_cached(end_block_number)[:end_pos])

            return b"".join(out)
| 436 |
+
|
| 437 |
+
|
| 438 |
+
class BytesCache(BaseCache):
    """Cache which holds data in a in-memory bytes object

    Implements read-ahead by the block size, for semi-random reads progressing
    through the file.

    Parameters
    ----------
    trim: bool
        As we read more data, whether to discard the start of the buffer when
        we are more than a blocksize ahead of it.
    """

    name: ClassVar[str] = "bytes"

    def __init__(
        self, blocksize: int, fetcher: Fetcher, size: int, trim: bool = True
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        # the buffered bytes and the file offsets [start, end) they cover;
        # None start/end means nothing buffered yet
        self.cache = b""
        self.start: int | None = None
        self.end: int | None = None
        self.trim = trim

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Return bytes [start, end), extending the buffer as needed.

        The buffer may be extended at its front, at its back, or replaced
        wholesale, depending on how the request overlaps the current window.
        """
        # TODO: only set start/end after fetch, in case it fails?
        # is this where retry logic might go?
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""
        if (
            self.start is not None
            and start >= self.start
            and self.end is not None
            and end < self.end
        ):
            # cache hit: we have all the required data
            # NOTE(review): strict ``end < self.end`` means a read ending exactly
            # at the buffer edge is treated as a miss (triggering read-ahead) —
            # presumably intentional; confirm before changing
            offset = start - self.start
            self.hit_count += 1
            return self.cache[offset : offset + end - start]

        # read-ahead target: one blocksize past the requested end (clamped)
        if self.blocksize:
            bend = min(self.size, end + self.blocksize)
        else:
            bend = end

        if bend == start or start > self.size:
            return b""

        if (self.start is None or start < self.start) and (
            self.end is None or end > self.end
        ):
            # First read, or extending both before and after
            self.total_requested_bytes += bend - start
            self.miss_count += 1
            self.cache = self.fetcher(start, bend)
            self.start = start
        else:
            assert self.start is not None
            assert self.end is not None
            self.miss_count += 1

            if start < self.start:
                # request begins before the buffered window
                if self.end is None or self.end - end > self.blocksize:
                    # window is too far ahead to be worth keeping: replace it
                    self.total_requested_bytes += bend - start
                    self.cache = self.fetcher(start, bend)
                    self.start = start
                else:
                    # prepend the missing head to the existing buffer
                    self.total_requested_bytes += self.start - start
                    new = self.fetcher(start, self.start)
                    self.start = start
                    self.cache = new + self.cache
            elif self.end is not None and bend > self.end:
                # request (plus read-ahead) extends past the buffered window
                if self.end > self.size:
                    # already buffered to EOF; nothing more to fetch
                    pass
                elif end - self.end > self.blocksize:
                    # gap too large: replace the buffer outright
                    self.total_requested_bytes += bend - start
                    self.cache = self.fetcher(start, bend)
                    self.start = start
                else:
                    # append the missing tail to the existing buffer
                    self.total_requested_bytes += bend - self.end
                    new = self.fetcher(self.end, bend)
                    self.cache = self.cache + new

        # recompute the window end from the actual buffer length
        self.end = self.start + len(self.cache)
        offset = start - self.start
        out = self.cache[offset : offset + end - start]
        if self.trim:
            # drop whole blocks from the front once the buffer grows beyond
            # roughly two blocksizes
            num = (self.end - self.start) // (self.blocksize + 1)
            if num > 1:
                self.start += self.blocksize * num
                self.cache = self.cache[self.blocksize * num :]
        return out

    def __len__(self) -> int:
        # number of bytes currently buffered, not the file size
        return len(self.cache)
| 537 |
+
|
| 538 |
+
|
| 539 |
+
class AllBytes(BaseCache):
    """Cache entire contents of the file

    When ``data`` is not supplied, the full file is downloaded eagerly at
    construction time via ``fetcher``.
    """

    name: ClassVar[str] = "all"

    def __init__(
        self,
        blocksize: int | None = None,
        fetcher: Fetcher | None = None,
        size: int | None = None,
        data: bytes | None = None,
    ) -> None:
        super().__init__(blocksize, fetcher, size)  # type: ignore[arg-type]
        if data is None:
            # eager whole-file download in a single request
            self.miss_count += 1
            self.total_requested_bytes += self.size
            self.data = self.fetcher(0, self.size)
        else:
            self.data = data

    def _fetch(self, start: int | None, stop: int | None) -> bytes:
        """Slice the in-memory copy; every read is a hit."""
        self.hit_count += 1
        return self.data[start:stop]
| 561 |
+
|
| 562 |
+
|
| 563 |
+
class KnownPartsOfAFile(BaseCache):
    """
    Cache holding known file parts.

    Parameters
    ----------
    blocksize: int
        How far to read ahead in numbers of bytes
    fetcher: func
        Function of the form f(start, end) which gets bytes from remote as
        specified
    size: int
        How big this file is
    data: dict
        A dictionary mapping explicit `(start, stop)` file-offset tuples
        with known bytes.
    strict: bool, default True
        Whether to fetch reads that go beyond a known byte-range boundary.
        If `False`, any read that ends outside a known part will be zero
        padded. Note that zero padding will not be used for reads that
        begin outside a known byte-range.
    """

    name: ClassVar[str] = "parts"

    def __init__(
        self,
        blocksize: int,
        fetcher: Fetcher,
        size: int,
        data: Optional[dict[tuple[int, int], bytes]] = None,
        strict: bool = True,
        **_: Any,
    ):
        super().__init__(blocksize, fetcher, size)
        self.strict = strict

        # simple consolidation of contiguous blocks
        # (merges adjacent ranges so lookups in _fetch scan fewer entries;
        # note the input dict is consumed via pop)
        if data:
            old_offsets = sorted(data.keys())
            offsets = [old_offsets[0]]
            blocks = [data.pop(old_offsets[0])]
            for start, stop in old_offsets[1:]:
                start0, stop0 = offsets[-1]
                if start == stop0:
                    # this range starts exactly where the previous one ended:
                    # extend the previous entry in place
                    offsets[-1] = (start0, stop)
                    blocks[-1] += data.pop((start, stop))
                else:
                    offsets.append((start, stop))
                    blocks.append(data.pop((start, stop)))

            self.data = dict(zip(offsets, blocks))
        else:
            self.data = {}

    def _fetch(self, start: int | None, stop: int | None) -> bytes:
        """Return bytes [start, stop) from the known parts, fetching or
        zero-padding anything outside them according to ``strict``."""
        if start is None:
            start = 0
        if stop is None:
            stop = self.size

        out = b""
        for (loc0, loc1), data in self.data.items():
            # If self.strict=False, use zero-padded data
            # for reads beyond the end of a "known" buffer
            if loc0 <= start < loc1:
                off = start - loc0
                out = data[off : off + stop - start]
                if not self.strict or loc0 <= stop <= loc1:
                    # The request is within a known range, or
                    # it begins within a known range, and we
                    # are allowed to pad reads beyond the
                    # buffer with zero
                    out += b"\x00" * (stop - start - len(out))
                    self.hit_count += 1
                    return out
                else:
                    # The request ends outside a known range,
                    # and we are being "strict" about reads
                    # beyond the buffer
                    # (keep the known prefix in ``out``; fetch the remainder
                    # starting at the end of this known range below)
                    start = loc1
                    break

        # We only get here if there is a request outside the
        # known parts of the file. In an ideal world, this
        # should never happen
        if self.fetcher is None:
            # We cannot fetch the data, so raise an error
            raise ValueError(f"Read is outside the known file parts: {(start, stop)}. ")
        # We can fetch the data, but should warn the user
        # that this may be slow
        warnings.warn(
            f"Read is outside the known file parts: {(start, stop)}. "
            f"IO/caching performance may be poor!"
        )
        logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}")
        self.total_requested_bytes += stop - start
        self.miss_count += 1
        # prepend any known prefix gathered above to the freshly fetched tail
        return out + super()._fetch(start, stop)
| 662 |
+
|
| 663 |
+
|
| 664 |
+
class UpdatableLRU(Generic[P, T]):
    """
    Custom implementation of LRU cache that allows updating keys

    Used by BackgroundBlockCache
    """

    class CacheInfo(NamedTuple):
        hits: int
        misses: int
        maxsize: int
        currsize: int

    def __init__(self, func: Callable[P, T], max_size: int = 128) -> None:
        # OrderedDict gives us O(1) LRU bookkeeping via move_to_end/popitem
        self._cache: OrderedDict[Any, T] = collections.OrderedDict()
        self._func = func
        self._max_size = max_size
        self._hits = 0
        self._misses = 0
        self._lock = threading.Lock()

    def __call__(self, *args: P.args, **kwargs: P.kwargs) -> T:
        """Return ``func(*args)``, serving repeated calls from the cache."""
        if kwargs:
            raise TypeError(f"Got unexpected keyword argument {kwargs.keys()}")
        with self._lock:
            try:
                value = self._cache[args]
            except KeyError:
                pass
            else:
                self._cache.move_to_end(args)
                self._hits += 1
                return value

        # compute outside the lock so concurrent misses are not serialized
        computed = self._func(*args, **kwargs)

        with self._lock:
            self._cache[args] = computed
            self._misses += 1
            if len(self._cache) > self._max_size:
                # evict the least-recently-used entry
                self._cache.popitem(last=False)

        return computed

    def is_key_cached(self, *args: Any) -> bool:
        """Whether ``args`` is present, without disturbing the LRU order."""
        with self._lock:
            return args in self._cache

    def add_key(self, result: T, *args: Any) -> None:
        """Insert (or overwrite) the cached value for ``args`` directly."""
        with self._lock:
            self._cache[args] = result
            if len(self._cache) > self._max_size:
                self._cache.popitem(last=False)

    def cache_info(self) -> UpdatableLRU.CacheInfo:
        """Snapshot of hit/miss counts and current/maximum size."""
        with self._lock:
            return self.CacheInfo(
                hits=self._hits,
                misses=self._misses,
                maxsize=self._max_size,
                currsize=len(self._cache),
            )
| 722 |
+
|
| 723 |
+
|
| 724 |
+
class BackgroundBlockCache(BaseCache):
|
| 725 |
+
"""
|
| 726 |
+
Cache holding memory as a set of blocks with pre-loading of
|
| 727 |
+
the next block in the background.
|
| 728 |
+
|
| 729 |
+
Requests are only ever made ``blocksize`` at a time, and are
|
| 730 |
+
stored in an LRU cache. The least recently accessed block is
|
| 731 |
+
discarded when more than ``maxblocks`` are stored. If the
|
| 732 |
+
next block is not in cache, it is loaded in a separate thread
|
| 733 |
+
in non-blocking way.
|
| 734 |
+
|
| 735 |
+
Parameters
|
| 736 |
+
----------
|
| 737 |
+
blocksize : int
|
| 738 |
+
The number of bytes to store in each block.
|
| 739 |
+
Requests are only ever made for ``blocksize``, so this
|
| 740 |
+
should balance the overhead of making a request against
|
| 741 |
+
the granularity of the blocks.
|
| 742 |
+
fetcher : Callable
|
| 743 |
+
size : int
|
| 744 |
+
The total size of the file being cached.
|
| 745 |
+
maxblocks : int
|
| 746 |
+
The maximum number of blocks to cache for. The maximum memory
|
| 747 |
+
use for this cache is then ``blocksize * maxblocks``.
|
| 748 |
+
"""
|
| 749 |
+
|
| 750 |
+
name: ClassVar[str] = "background"
|
| 751 |
+
|
| 752 |
+
def __init__(
|
| 753 |
+
self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32
|
| 754 |
+
) -> None:
|
| 755 |
+
super().__init__(blocksize, fetcher, size)
|
| 756 |
+
self.nblocks = math.ceil(size / blocksize)
|
| 757 |
+
self.maxblocks = maxblocks
|
| 758 |
+
self._fetch_block_cached = UpdatableLRU(self._fetch_block, maxblocks)
|
| 759 |
+
|
| 760 |
+
self._thread_executor = ThreadPoolExecutor(max_workers=1)
|
| 761 |
+
self._fetch_future_block_number: int | None = None
|
| 762 |
+
self._fetch_future: Future[bytes] | None = None
|
| 763 |
+
self._fetch_future_lock = threading.Lock()
|
| 764 |
+
|
| 765 |
+
def cache_info(self) -> UpdatableLRU.CacheInfo:
|
| 766 |
+
"""
|
| 767 |
+
The statistics on the block cache.
|
| 768 |
+
|
| 769 |
+
Returns
|
| 770 |
+
-------
|
| 771 |
+
NamedTuple
|
| 772 |
+
Returned directly from the LRU Cache used internally.
|
| 773 |
+
"""
|
| 774 |
+
return self._fetch_block_cached.cache_info()
|
| 775 |
+
|
| 776 |
+
def __getstate__(self) -> dict[str, Any]:
|
| 777 |
+
state = self.__dict__
|
| 778 |
+
del state["_fetch_block_cached"]
|
| 779 |
+
del state["_thread_executor"]
|
| 780 |
+
del state["_fetch_future_block_number"]
|
| 781 |
+
del state["_fetch_future"]
|
| 782 |
+
del state["_fetch_future_lock"]
|
| 783 |
+
return state
|
| 784 |
+
|
| 785 |
+
def __setstate__(self, state) -> None:
|
| 786 |
+
self.__dict__.update(state)
|
| 787 |
+
self._fetch_block_cached = UpdatableLRU(self._fetch_block, state["maxblocks"])
|
| 788 |
+
self._thread_executor = ThreadPoolExecutor(max_workers=1)
|
| 789 |
+
self._fetch_future_block_number = None
|
| 790 |
+
self._fetch_future = None
|
| 791 |
+
self._fetch_future_lock = threading.Lock()
|
| 792 |
+
|
| 793 |
+
def _fetch(self, start: int | None, end: int | None) -> bytes:
|
| 794 |
+
if start is None:
|
| 795 |
+
start = 0
|
| 796 |
+
if end is None:
|
| 797 |
+
end = self.size
|
| 798 |
+
if start >= self.size or start >= end:
|
| 799 |
+
return b""
|
| 800 |
+
|
| 801 |
+
# byte position -> block numbers
|
| 802 |
+
start_block_number = start // self.blocksize
|
| 803 |
+
end_block_number = end // self.blocksize
|
| 804 |
+
|
| 805 |
+
fetch_future_block_number = None
|
| 806 |
+
fetch_future = None
|
| 807 |
+
with self._fetch_future_lock:
|
| 808 |
+
# Background thread is running. Check we we can or must join it.
|
| 809 |
+
if self._fetch_future is not None:
|
| 810 |
+
assert self._fetch_future_block_number is not None
|
| 811 |
+
if self._fetch_future.done():
|
| 812 |
+
logger.info("BlockCache joined background fetch without waiting.")
|
| 813 |
+
self._fetch_block_cached.add_key(
|
| 814 |
+
self._fetch_future.result(), self._fetch_future_block_number
|
| 815 |
+
)
|
| 816 |
+
# Cleanup the fetch variables. Done with fetching the block.
|
| 817 |
+
self._fetch_future_block_number = None
|
| 818 |
+
self._fetch_future = None
|
| 819 |
+
else:
|
| 820 |
+
# Must join if we need the block for the current fetch
|
| 821 |
+
must_join = bool(
|
| 822 |
+
start_block_number
|
| 823 |
+
<= self._fetch_future_block_number
|
| 824 |
+
<= end_block_number
|
| 825 |
+
)
|
| 826 |
+
if must_join:
|
| 827 |
+
# Copy to the local variables to release lock
|
| 828 |
+
# before waiting for result
|
| 829 |
+
fetch_future_block_number = self._fetch_future_block_number
|
| 830 |
+
fetch_future = self._fetch_future
|
| 831 |
+
|
| 832 |
+
# Cleanup the fetch variables. Have a local copy.
|
| 833 |
+
self._fetch_future_block_number = None
|
| 834 |
+
self._fetch_future = None
|
| 835 |
+
|
| 836 |
+
# Need to wait for the future for the current read
|
| 837 |
+
if fetch_future is not None:
|
| 838 |
+
logger.info("BlockCache waiting for background fetch.")
|
| 839 |
+
# Wait until result and put it in cache
|
| 840 |
+
self._fetch_block_cached.add_key(
|
| 841 |
+
fetch_future.result(), fetch_future_block_number
|
| 842 |
+
)
|
| 843 |
+
|
| 844 |
+
# these are cached, so safe to do multiple calls for the same start and end.
|
| 845 |
+
for block_number in range(start_block_number, end_block_number + 1):
|
| 846 |
+
self._fetch_block_cached(block_number)
|
| 847 |
+
|
| 848 |
+
# fetch next block in the background if nothing is running in the background,
|
| 849 |
+
# the block is within file and it is not already cached
|
| 850 |
+
end_block_plus_1 = end_block_number + 1
|
| 851 |
+
with self._fetch_future_lock:
|
| 852 |
+
if (
|
| 853 |
+
self._fetch_future is None
|
| 854 |
+
and end_block_plus_1 <= self.nblocks
|
| 855 |
+
and not self._fetch_block_cached.is_key_cached(end_block_plus_1)
|
| 856 |
+
):
|
| 857 |
+
self._fetch_future_block_number = end_block_plus_1
|
| 858 |
+
self._fetch_future = self._thread_executor.submit(
|
| 859 |
+
self._fetch_block, end_block_plus_1, "async"
|
| 860 |
+
)
|
| 861 |
+
|
| 862 |
+
return self._read_cache(
|
| 863 |
+
start,
|
| 864 |
+
end,
|
| 865 |
+
start_block_number=start_block_number,
|
| 866 |
+
end_block_number=end_block_number,
|
| 867 |
+
)
|
| 868 |
+
|
| 869 |
+
def _fetch_block(self, block_number: int, log_info: str = "sync") -> bytes:
|
| 870 |
+
"""
|
| 871 |
+
Fetch the block of data for `block_number`.
|
| 872 |
+
"""
|
| 873 |
+
if block_number > self.nblocks:
|
| 874 |
+
raise ValueError(
|
| 875 |
+
f"'block_number={block_number}' is greater than "
|
| 876 |
+
f"the number of blocks ({self.nblocks})"
|
| 877 |
+
)
|
| 878 |
+
|
| 879 |
+
start = block_number * self.blocksize
|
| 880 |
+
end = start + self.blocksize
|
| 881 |
+
logger.info("BlockCache fetching block (%s) %d", log_info, block_number)
|
| 882 |
+
self.total_requested_bytes += end - start
|
| 883 |
+
self.miss_count += 1
|
| 884 |
+
block_contents = super()._fetch(start, end)
|
| 885 |
+
return block_contents
|
| 886 |
+
|
| 887 |
+
def _read_cache(
|
| 888 |
+
self, start: int, end: int, start_block_number: int, end_block_number: int
|
| 889 |
+
) -> bytes:
|
| 890 |
+
"""
|
| 891 |
+
Read from our block cache.
|
| 892 |
+
|
| 893 |
+
Parameters
|
| 894 |
+
----------
|
| 895 |
+
start, end : int
|
| 896 |
+
The start and end byte positions.
|
| 897 |
+
start_block_number, end_block_number : int
|
| 898 |
+
The start and end block numbers.
|
| 899 |
+
"""
|
| 900 |
+
start_pos = start % self.blocksize
|
| 901 |
+
end_pos = end % self.blocksize
|
| 902 |
+
|
| 903 |
+
# kind of pointless to count this as a hit, but it is
|
| 904 |
+
self.hit_count += 1
|
| 905 |
+
|
| 906 |
+
if start_block_number == end_block_number:
|
| 907 |
+
block = self._fetch_block_cached(start_block_number)
|
| 908 |
+
return block[start_pos:end_pos]
|
| 909 |
+
|
| 910 |
+
else:
|
| 911 |
+
# read from the initial
|
| 912 |
+
out = [self._fetch_block_cached(start_block_number)[start_pos:]]
|
| 913 |
+
|
| 914 |
+
# intermediate blocks
|
| 915 |
+
# Note: it'd be nice to combine these into one big request. However
|
| 916 |
+
# that doesn't play nicely with our LRU cache.
|
| 917 |
+
out.extend(
|
| 918 |
+
map(
|
| 919 |
+
self._fetch_block_cached,
|
| 920 |
+
range(start_block_number + 1, end_block_number),
|
| 921 |
+
)
|
| 922 |
+
)
|
| 923 |
+
|
| 924 |
+
# final block
|
| 925 |
+
out.append(self._fetch_block_cached(end_block_number)[:end_pos])
|
| 926 |
+
|
| 927 |
+
return b"".join(out)
|
| 928 |
+
|
| 929 |
+
|
| 930 |
+
caches: dict[str | None, type[BaseCache]] = {
|
| 931 |
+
# one custom case
|
| 932 |
+
None: BaseCache,
|
| 933 |
+
}
|
| 934 |
+
|
| 935 |
+
|
| 936 |
+
def register_cache(cls: type[BaseCache], clobber: bool = False) -> None:
|
| 937 |
+
"""'Register' cache implementation.
|
| 938 |
+
|
| 939 |
+
Parameters
|
| 940 |
+
----------
|
| 941 |
+
clobber: bool, optional
|
| 942 |
+
If set to True (default is False) - allow to overwrite existing
|
| 943 |
+
entry.
|
| 944 |
+
|
| 945 |
+
Raises
|
| 946 |
+
------
|
| 947 |
+
ValueError
|
| 948 |
+
"""
|
| 949 |
+
name = cls.name
|
| 950 |
+
if not clobber and name in caches:
|
| 951 |
+
raise ValueError(f"Cache with name {name!r} is already known: {caches[name]}")
|
| 952 |
+
caches[name] = cls
|
| 953 |
+
|
| 954 |
+
|
| 955 |
+
for c in (
|
| 956 |
+
BaseCache,
|
| 957 |
+
MMapCache,
|
| 958 |
+
BytesCache,
|
| 959 |
+
ReadAheadCache,
|
| 960 |
+
BlockCache,
|
| 961 |
+
FirstChunkCache,
|
| 962 |
+
AllBytes,
|
| 963 |
+
KnownPartsOfAFile,
|
| 964 |
+
BackgroundBlockCache,
|
| 965 |
+
):
|
| 966 |
+
register_cache(c)
|
.venv/lib/python3.11/site-packages/fsspec/callbacks.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from functools import wraps
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class Callback:
|
| 5 |
+
"""
|
| 6 |
+
Base class and interface for callback mechanism
|
| 7 |
+
|
| 8 |
+
This class can be used directly for monitoring file transfers by
|
| 9 |
+
providing ``callback=Callback(hooks=...)`` (see the ``hooks`` argument,
|
| 10 |
+
below), or subclassed for more specialised behaviour.
|
| 11 |
+
|
| 12 |
+
Parameters
|
| 13 |
+
----------
|
| 14 |
+
size: int (optional)
|
| 15 |
+
Nominal quantity for the value that corresponds to a complete
|
| 16 |
+
transfer, e.g., total number of tiles or total number of
|
| 17 |
+
bytes
|
| 18 |
+
value: int (0)
|
| 19 |
+
Starting internal counter value
|
| 20 |
+
hooks: dict or None
|
| 21 |
+
A dict of named functions to be called on each update. The signature
|
| 22 |
+
of these must be ``f(size, value, **kwargs)``
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
def __init__(self, size=None, value=0, hooks=None, **kwargs):
|
| 26 |
+
self.size = size
|
| 27 |
+
self.value = value
|
| 28 |
+
self.hooks = hooks or {}
|
| 29 |
+
self.kw = kwargs
|
| 30 |
+
|
| 31 |
+
def __enter__(self):
|
| 32 |
+
return self
|
| 33 |
+
|
| 34 |
+
def __exit__(self, *exc_args):
|
| 35 |
+
self.close()
|
| 36 |
+
|
| 37 |
+
def close(self):
|
| 38 |
+
"""Close callback."""
|
| 39 |
+
|
| 40 |
+
def branched(self, path_1, path_2, **kwargs):
|
| 41 |
+
"""
|
| 42 |
+
Return callback for child transfers
|
| 43 |
+
|
| 44 |
+
If this callback is operating at a higher level, e.g., put, which may
|
| 45 |
+
trigger transfers that can also be monitored. The function returns a callback
|
| 46 |
+
that has to be passed to the child method, e.g., put_file,
|
| 47 |
+
as `callback=` argument.
|
| 48 |
+
|
| 49 |
+
The implementation uses `callback.branch` for compatibility.
|
| 50 |
+
When implementing callbacks, it is recommended to override this function instead
|
| 51 |
+
of `branch` and avoid calling `super().branched(...)`.
|
| 52 |
+
|
| 53 |
+
Prefer using this function over `branch`.
|
| 54 |
+
|
| 55 |
+
Parameters
|
| 56 |
+
----------
|
| 57 |
+
path_1: str
|
| 58 |
+
Child's source path
|
| 59 |
+
path_2: str
|
| 60 |
+
Child's destination path
|
| 61 |
+
**kwargs:
|
| 62 |
+
Arbitrary keyword arguments
|
| 63 |
+
|
| 64 |
+
Returns
|
| 65 |
+
-------
|
| 66 |
+
callback: Callback
|
| 67 |
+
A callback instance to be passed to the child method
|
| 68 |
+
"""
|
| 69 |
+
self.branch(path_1, path_2, kwargs)
|
| 70 |
+
# mutate kwargs so that we can force the caller to pass "callback=" explicitly
|
| 71 |
+
return kwargs.pop("callback", DEFAULT_CALLBACK)
|
| 72 |
+
|
| 73 |
+
def branch_coro(self, fn):
|
| 74 |
+
"""
|
| 75 |
+
Wraps a coroutine, and pass a new child callback to it.
|
| 76 |
+
"""
|
| 77 |
+
|
| 78 |
+
@wraps(fn)
|
| 79 |
+
async def func(path1, path2: str, **kwargs):
|
| 80 |
+
with self.branched(path1, path2, **kwargs) as child:
|
| 81 |
+
return await fn(path1, path2, callback=child, **kwargs)
|
| 82 |
+
|
| 83 |
+
return func
|
| 84 |
+
|
| 85 |
+
def set_size(self, size):
|
| 86 |
+
"""
|
| 87 |
+
Set the internal maximum size attribute
|
| 88 |
+
|
| 89 |
+
Usually called if not initially set at instantiation. Note that this
|
| 90 |
+
triggers a ``call()``.
|
| 91 |
+
|
| 92 |
+
Parameters
|
| 93 |
+
----------
|
| 94 |
+
size: int
|
| 95 |
+
"""
|
| 96 |
+
self.size = size
|
| 97 |
+
self.call()
|
| 98 |
+
|
| 99 |
+
def absolute_update(self, value):
|
| 100 |
+
"""
|
| 101 |
+
Set the internal value state
|
| 102 |
+
|
| 103 |
+
Triggers ``call()``
|
| 104 |
+
|
| 105 |
+
Parameters
|
| 106 |
+
----------
|
| 107 |
+
value: int
|
| 108 |
+
"""
|
| 109 |
+
self.value = value
|
| 110 |
+
self.call()
|
| 111 |
+
|
| 112 |
+
def relative_update(self, inc=1):
|
| 113 |
+
"""
|
| 114 |
+
Delta increment the internal counter
|
| 115 |
+
|
| 116 |
+
Triggers ``call()``
|
| 117 |
+
|
| 118 |
+
Parameters
|
| 119 |
+
----------
|
| 120 |
+
inc: int
|
| 121 |
+
"""
|
| 122 |
+
self.value += inc
|
| 123 |
+
self.call()
|
| 124 |
+
|
| 125 |
+
def call(self, hook_name=None, **kwargs):
|
| 126 |
+
"""
|
| 127 |
+
Execute hook(s) with current state
|
| 128 |
+
|
| 129 |
+
Each function is passed the internal size and current value
|
| 130 |
+
|
| 131 |
+
Parameters
|
| 132 |
+
----------
|
| 133 |
+
hook_name: str or None
|
| 134 |
+
If given, execute on this hook
|
| 135 |
+
kwargs: passed on to (all) hook(s)
|
| 136 |
+
"""
|
| 137 |
+
if not self.hooks:
|
| 138 |
+
return
|
| 139 |
+
kw = self.kw.copy()
|
| 140 |
+
kw.update(kwargs)
|
| 141 |
+
if hook_name:
|
| 142 |
+
if hook_name not in self.hooks:
|
| 143 |
+
return
|
| 144 |
+
return self.hooks[hook_name](self.size, self.value, **kw)
|
| 145 |
+
for hook in self.hooks.values() or []:
|
| 146 |
+
hook(self.size, self.value, **kw)
|
| 147 |
+
|
| 148 |
+
def wrap(self, iterable):
|
| 149 |
+
"""
|
| 150 |
+
Wrap an iterable to call ``relative_update`` on each iterations
|
| 151 |
+
|
| 152 |
+
Parameters
|
| 153 |
+
----------
|
| 154 |
+
iterable: Iterable
|
| 155 |
+
The iterable that is being wrapped
|
| 156 |
+
"""
|
| 157 |
+
for item in iterable:
|
| 158 |
+
self.relative_update()
|
| 159 |
+
yield item
|
| 160 |
+
|
| 161 |
+
def branch(self, path_1, path_2, kwargs):
|
| 162 |
+
"""
|
| 163 |
+
Set callbacks for child transfers
|
| 164 |
+
|
| 165 |
+
If this callback is operating at a higher level, e.g., put, which may
|
| 166 |
+
trigger transfers that can also be monitored. The passed kwargs are
|
| 167 |
+
to be *mutated* to add ``callback=``, if this class supports branching
|
| 168 |
+
to children.
|
| 169 |
+
|
| 170 |
+
Parameters
|
| 171 |
+
----------
|
| 172 |
+
path_1: str
|
| 173 |
+
Child's source path
|
| 174 |
+
path_2: str
|
| 175 |
+
Child's destination path
|
| 176 |
+
kwargs: dict
|
| 177 |
+
arguments passed to child method, e.g., put_file.
|
| 178 |
+
|
| 179 |
+
Returns
|
| 180 |
+
-------
|
| 181 |
+
|
| 182 |
+
"""
|
| 183 |
+
return None
|
| 184 |
+
|
| 185 |
+
def no_op(self, *_, **__):
|
| 186 |
+
pass
|
| 187 |
+
|
| 188 |
+
def __getattr__(self, item):
|
| 189 |
+
"""
|
| 190 |
+
If undefined methods are called on this class, nothing happens
|
| 191 |
+
"""
|
| 192 |
+
return self.no_op
|
| 193 |
+
|
| 194 |
+
@classmethod
|
| 195 |
+
def as_callback(cls, maybe_callback=None):
|
| 196 |
+
"""Transform callback=... into Callback instance
|
| 197 |
+
|
| 198 |
+
For the special value of ``None``, return the global instance of
|
| 199 |
+
``NoOpCallback``. This is an alternative to including
|
| 200 |
+
``callback=DEFAULT_CALLBACK`` directly in a method signature.
|
| 201 |
+
"""
|
| 202 |
+
if maybe_callback is None:
|
| 203 |
+
return DEFAULT_CALLBACK
|
| 204 |
+
return maybe_callback
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
class NoOpCallback(Callback):
|
| 208 |
+
"""
|
| 209 |
+
This implementation of Callback does exactly nothing
|
| 210 |
+
"""
|
| 211 |
+
|
| 212 |
+
def call(self, *args, **kwargs):
|
| 213 |
+
return None
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
class DotPrinterCallback(Callback):
|
| 217 |
+
"""
|
| 218 |
+
Simple example Callback implementation
|
| 219 |
+
|
| 220 |
+
Almost identical to Callback with a hook that prints a char; here we
|
| 221 |
+
demonstrate how the outer layer may print "#" and the inner layer "."
|
| 222 |
+
"""
|
| 223 |
+
|
| 224 |
+
def __init__(self, chr_to_print="#", **kwargs):
|
| 225 |
+
self.chr = chr_to_print
|
| 226 |
+
super().__init__(**kwargs)
|
| 227 |
+
|
| 228 |
+
def branch(self, path_1, path_2, kwargs):
|
| 229 |
+
"""Mutate kwargs to add new instance with different print char"""
|
| 230 |
+
kwargs["callback"] = DotPrinterCallback(".")
|
| 231 |
+
|
| 232 |
+
def call(self, **kwargs):
|
| 233 |
+
"""Just outputs a character"""
|
| 234 |
+
print(self.chr, end="")
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
class TqdmCallback(Callback):
|
| 238 |
+
"""
|
| 239 |
+
A callback to display a progress bar using tqdm
|
| 240 |
+
|
| 241 |
+
Parameters
|
| 242 |
+
----------
|
| 243 |
+
tqdm_kwargs : dict, (optional)
|
| 244 |
+
Any argument accepted by the tqdm constructor.
|
| 245 |
+
See the `tqdm doc <https://tqdm.github.io/docs/tqdm/#__init__>`_.
|
| 246 |
+
Will be forwarded to `tqdm_cls`.
|
| 247 |
+
tqdm_cls: (optional)
|
| 248 |
+
subclass of `tqdm.tqdm`. If not passed, it will default to `tqdm.tqdm`.
|
| 249 |
+
|
| 250 |
+
Examples
|
| 251 |
+
--------
|
| 252 |
+
>>> import fsspec
|
| 253 |
+
>>> from fsspec.callbacks import TqdmCallback
|
| 254 |
+
>>> fs = fsspec.filesystem("memory")
|
| 255 |
+
>>> path2distant_data = "/your-path"
|
| 256 |
+
>>> fs.upload(
|
| 257 |
+
".",
|
| 258 |
+
path2distant_data,
|
| 259 |
+
recursive=True,
|
| 260 |
+
callback=TqdmCallback(),
|
| 261 |
+
)
|
| 262 |
+
|
| 263 |
+
You can forward args to tqdm using the ``tqdm_kwargs`` parameter.
|
| 264 |
+
|
| 265 |
+
>>> fs.upload(
|
| 266 |
+
".",
|
| 267 |
+
path2distant_data,
|
| 268 |
+
recursive=True,
|
| 269 |
+
callback=TqdmCallback(tqdm_kwargs={"desc": "Your tqdm description"}),
|
| 270 |
+
)
|
| 271 |
+
|
| 272 |
+
You can also customize the progress bar by passing a subclass of `tqdm`.
|
| 273 |
+
|
| 274 |
+
.. code-block:: python
|
| 275 |
+
|
| 276 |
+
class TqdmFormat(tqdm):
|
| 277 |
+
'''Provides a `total_time` format parameter'''
|
| 278 |
+
@property
|
| 279 |
+
def format_dict(self):
|
| 280 |
+
d = super().format_dict
|
| 281 |
+
total_time = d["elapsed"] * (d["total"] or 0) / max(d["n"], 1)
|
| 282 |
+
d.update(total_time=self.format_interval(total_time) + " in total")
|
| 283 |
+
return d
|
| 284 |
+
|
| 285 |
+
>>> with TqdmCallback(
|
| 286 |
+
tqdm_kwargs={
|
| 287 |
+
"desc": "desc",
|
| 288 |
+
"bar_format": "{total_time}: {percentage:.0f}%|{bar}{r_bar}",
|
| 289 |
+
},
|
| 290 |
+
tqdm_cls=TqdmFormat,
|
| 291 |
+
) as callback:
|
| 292 |
+
fs.upload(".", path2distant_data, recursive=True, callback=callback)
|
| 293 |
+
"""
|
| 294 |
+
|
| 295 |
+
def __init__(self, tqdm_kwargs=None, *args, **kwargs):
|
| 296 |
+
try:
|
| 297 |
+
from tqdm import tqdm
|
| 298 |
+
|
| 299 |
+
except ImportError as exce:
|
| 300 |
+
raise ImportError(
|
| 301 |
+
"Using TqdmCallback requires tqdm to be installed"
|
| 302 |
+
) from exce
|
| 303 |
+
|
| 304 |
+
self._tqdm_cls = kwargs.pop("tqdm_cls", tqdm)
|
| 305 |
+
self._tqdm_kwargs = tqdm_kwargs or {}
|
| 306 |
+
self.tqdm = None
|
| 307 |
+
super().__init__(*args, **kwargs)
|
| 308 |
+
|
| 309 |
+
def call(self, *args, **kwargs):
|
| 310 |
+
if self.tqdm is None:
|
| 311 |
+
self.tqdm = self._tqdm_cls(total=self.size, **self._tqdm_kwargs)
|
| 312 |
+
self.tqdm.total = self.size
|
| 313 |
+
self.tqdm.update(self.value - self.tqdm.n)
|
| 314 |
+
|
| 315 |
+
def close(self):
|
| 316 |
+
if self.tqdm is not None:
|
| 317 |
+
self.tqdm.close()
|
| 318 |
+
self.tqdm = None
|
| 319 |
+
|
| 320 |
+
def __del__(self):
|
| 321 |
+
return self.close()
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
DEFAULT_CALLBACK = _DEFAULT_CALLBACK = NoOpCallback()
|
.venv/lib/python3.11/site-packages/fsspec/compression.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Helper functions for a standard streaming compression API"""
|
| 2 |
+
|
| 3 |
+
from zipfile import ZipFile
|
| 4 |
+
|
| 5 |
+
import fsspec.utils
|
| 6 |
+
from fsspec.spec import AbstractBufferedFile
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def noop_file(file, mode, **kwargs):
|
| 10 |
+
return file
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# TODO: files should also be available as contexts
|
| 14 |
+
# should be functions of the form func(infile, mode=, **kwargs) -> file-like
|
| 15 |
+
compr = {None: noop_file}
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def register_compression(name, callback, extensions, force=False):
|
| 19 |
+
"""Register an "inferable" file compression type.
|
| 20 |
+
|
| 21 |
+
Registers transparent file compression type for use with fsspec.open.
|
| 22 |
+
Compression can be specified by name in open, or "infer"-ed for any files
|
| 23 |
+
ending with the given extensions.
|
| 24 |
+
|
| 25 |
+
Args:
|
| 26 |
+
name: (str) The compression type name. Eg. "gzip".
|
| 27 |
+
callback: A callable of form (infile, mode, **kwargs) -> file-like.
|
| 28 |
+
Accepts an input file-like object, the target mode and kwargs.
|
| 29 |
+
Returns a wrapped file-like object.
|
| 30 |
+
extensions: (str, Iterable[str]) A file extension, or list of file
|
| 31 |
+
extensions for which to infer this compression scheme. Eg. "gz".
|
| 32 |
+
force: (bool) Force re-registration of compression type or extensions.
|
| 33 |
+
|
| 34 |
+
Raises:
|
| 35 |
+
ValueError: If name or extensions already registered, and not force.
|
| 36 |
+
|
| 37 |
+
"""
|
| 38 |
+
if isinstance(extensions, str):
|
| 39 |
+
extensions = [extensions]
|
| 40 |
+
|
| 41 |
+
# Validate registration
|
| 42 |
+
if name in compr and not force:
|
| 43 |
+
raise ValueError(f"Duplicate compression registration: {name}")
|
| 44 |
+
|
| 45 |
+
for ext in extensions:
|
| 46 |
+
if ext in fsspec.utils.compressions and not force:
|
| 47 |
+
raise ValueError(f"Duplicate compression file extension: {ext} ({name})")
|
| 48 |
+
|
| 49 |
+
compr[name] = callback
|
| 50 |
+
|
| 51 |
+
for ext in extensions:
|
| 52 |
+
fsspec.utils.compressions[ext] = name
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def unzip(infile, mode="rb", filename=None, **kwargs):
|
| 56 |
+
if "r" not in mode:
|
| 57 |
+
filename = filename or "file"
|
| 58 |
+
z = ZipFile(infile, mode="w", **kwargs)
|
| 59 |
+
fo = z.open(filename, mode="w")
|
| 60 |
+
fo.close = lambda closer=fo.close: closer() or z.close()
|
| 61 |
+
return fo
|
| 62 |
+
z = ZipFile(infile)
|
| 63 |
+
if filename is None:
|
| 64 |
+
filename = z.namelist()[0]
|
| 65 |
+
return z.open(filename, mode="r", **kwargs)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
register_compression("zip", unzip, "zip")
|
| 69 |
+
|
| 70 |
+
try:
|
| 71 |
+
from bz2 import BZ2File
|
| 72 |
+
except ImportError:
|
| 73 |
+
pass
|
| 74 |
+
else:
|
| 75 |
+
register_compression("bz2", BZ2File, "bz2")
|
| 76 |
+
|
| 77 |
+
try: # pragma: no cover
|
| 78 |
+
from isal import igzip
|
| 79 |
+
|
| 80 |
+
def isal(infile, mode="rb", **kwargs):
|
| 81 |
+
return igzip.IGzipFile(fileobj=infile, mode=mode, **kwargs)
|
| 82 |
+
|
| 83 |
+
register_compression("gzip", isal, "gz")
|
| 84 |
+
except ImportError:
|
| 85 |
+
from gzip import GzipFile
|
| 86 |
+
|
| 87 |
+
register_compression(
|
| 88 |
+
"gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz"
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
try:
|
| 92 |
+
from lzma import LZMAFile
|
| 93 |
+
|
| 94 |
+
register_compression("lzma", LZMAFile, "lzma")
|
| 95 |
+
register_compression("xz", LZMAFile, "xz")
|
| 96 |
+
except ImportError:
|
| 97 |
+
pass
|
| 98 |
+
|
| 99 |
+
try:
|
| 100 |
+
import lzmaffi
|
| 101 |
+
|
| 102 |
+
register_compression("lzma", lzmaffi.LZMAFile, "lzma", force=True)
|
| 103 |
+
register_compression("xz", lzmaffi.LZMAFile, "xz", force=True)
|
| 104 |
+
except ImportError:
|
| 105 |
+
pass
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
class SnappyFile(AbstractBufferedFile):
|
| 109 |
+
def __init__(self, infile, mode, **kwargs):
|
| 110 |
+
import snappy
|
| 111 |
+
|
| 112 |
+
super().__init__(
|
| 113 |
+
fs=None, path="snappy", mode=mode.strip("b") + "b", size=999999999, **kwargs
|
| 114 |
+
)
|
| 115 |
+
self.infile = infile
|
| 116 |
+
if "r" in mode:
|
| 117 |
+
self.codec = snappy.StreamDecompressor()
|
| 118 |
+
else:
|
| 119 |
+
self.codec = snappy.StreamCompressor()
|
| 120 |
+
|
| 121 |
+
def _upload_chunk(self, final=False):
|
| 122 |
+
self.buffer.seek(0)
|
| 123 |
+
out = self.codec.add_chunk(self.buffer.read())
|
| 124 |
+
self.infile.write(out)
|
| 125 |
+
return True
|
| 126 |
+
|
| 127 |
+
def seek(self, loc, whence=0):
|
| 128 |
+
raise NotImplementedError("SnappyFile is not seekable")
|
| 129 |
+
|
| 130 |
+
def seekable(self):
|
| 131 |
+
return False
|
| 132 |
+
|
| 133 |
+
def _fetch_range(self, start, end):
|
| 134 |
+
"""Get the specified set of bytes from remote"""
|
| 135 |
+
data = self.infile.read(end - start)
|
| 136 |
+
return self.codec.decompress(data)
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
try:
|
| 140 |
+
import snappy
|
| 141 |
+
|
| 142 |
+
snappy.compress(b"")
|
| 143 |
+
# Snappy may use the .sz file extension, but this is not part of the
|
| 144 |
+
# standard implementation.
|
| 145 |
+
register_compression("snappy", SnappyFile, [])
|
| 146 |
+
|
| 147 |
+
except (ImportError, NameError, AttributeError):
|
| 148 |
+
pass
|
| 149 |
+
|
| 150 |
+
try:
|
| 151 |
+
import lz4.frame
|
| 152 |
+
|
| 153 |
+
register_compression("lz4", lz4.frame.open, "lz4")
|
| 154 |
+
except ImportError:
|
| 155 |
+
pass
|
| 156 |
+
|
| 157 |
+
try:
|
| 158 |
+
import zstandard as zstd
|
| 159 |
+
|
| 160 |
+
def zstandard_file(infile, mode="rb"):
|
| 161 |
+
if "r" in mode:
|
| 162 |
+
cctx = zstd.ZstdDecompressor()
|
| 163 |
+
return cctx.stream_reader(infile)
|
| 164 |
+
else:
|
| 165 |
+
cctx = zstd.ZstdCompressor(level=10)
|
| 166 |
+
return cctx.stream_writer(infile)
|
| 167 |
+
|
| 168 |
+
register_compression("zstd", zstandard_file, "zst")
|
| 169 |
+
except ImportError:
|
| 170 |
+
pass
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def available_compressions():
|
| 174 |
+
"""Return a list of the implemented compressions."""
|
| 175 |
+
return list(compr)
|
.venv/lib/python3.11/site-packages/fsspec/config.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import configparser
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
import warnings
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
conf: dict[str, dict[str, Any]] = {}
|
| 10 |
+
default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/fsspec")
|
| 11 |
+
conf_dir = os.environ.get("FSSPEC_CONFIG_DIR", default_conf_dir)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def set_conf_env(conf_dict, envdict=os.environ):
|
| 15 |
+
"""Set config values from environment variables
|
| 16 |
+
|
| 17 |
+
Looks for variables of the form ``FSSPEC_<protocol>`` and
|
| 18 |
+
``FSSPEC_<protocol>_<kwarg>``. For ``FSSPEC_<protocol>`` the value is parsed
|
| 19 |
+
as a json dictionary and used to ``update`` the config of the
|
| 20 |
+
corresponding protocol. For ``FSSPEC_<protocol>_<kwarg>`` there is no
|
| 21 |
+
attempt to convert the string value, but the kwarg keys will be lower-cased.
|
| 22 |
+
|
| 23 |
+
The ``FSSPEC_<protocol>_<kwarg>`` variables are applied after the
|
| 24 |
+
``FSSPEC_<protocol>`` ones.
|
| 25 |
+
|
| 26 |
+
Parameters
|
| 27 |
+
----------
|
| 28 |
+
conf_dict : dict(str, dict)
|
| 29 |
+
This dict will be mutated
|
| 30 |
+
envdict : dict-like(str, str)
|
| 31 |
+
Source for the values - usually the real environment
|
| 32 |
+
"""
|
| 33 |
+
kwarg_keys = []
|
| 34 |
+
for key in envdict:
|
| 35 |
+
if key.startswith("FSSPEC_") and len(key) > 7 and key[7] != "_":
|
| 36 |
+
if key.count("_") > 1:
|
| 37 |
+
kwarg_keys.append(key)
|
| 38 |
+
continue
|
| 39 |
+
try:
|
| 40 |
+
value = json.loads(envdict[key])
|
| 41 |
+
except json.decoder.JSONDecodeError as ex:
|
| 42 |
+
warnings.warn(
|
| 43 |
+
f"Ignoring environment variable {key} due to a parse failure: {ex}"
|
| 44 |
+
)
|
| 45 |
+
else:
|
| 46 |
+
if isinstance(value, dict):
|
| 47 |
+
_, proto = key.split("_", 1)
|
| 48 |
+
conf_dict.setdefault(proto.lower(), {}).update(value)
|
| 49 |
+
else:
|
| 50 |
+
warnings.warn(
|
| 51 |
+
f"Ignoring environment variable {key} due to not being a dict:"
|
| 52 |
+
f" {type(value)}"
|
| 53 |
+
)
|
| 54 |
+
elif key.startswith("FSSPEC"):
|
| 55 |
+
warnings.warn(
|
| 56 |
+
f"Ignoring environment variable {key} due to having an unexpected name"
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
for key in kwarg_keys:
|
| 60 |
+
_, proto, kwarg = key.split("_", 2)
|
| 61 |
+
conf_dict.setdefault(proto.lower(), {})[kwarg.lower()] = envdict[key]
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def set_conf_files(cdir, conf_dict):
|
| 65 |
+
"""Set config values from files
|
| 66 |
+
|
| 67 |
+
Scans for INI and JSON files in the given dictionary, and uses their
|
| 68 |
+
contents to set the config. In case of repeated values, later values
|
| 69 |
+
win.
|
| 70 |
+
|
| 71 |
+
In the case of INI files, all values are strings, and these will not
|
| 72 |
+
be converted.
|
| 73 |
+
|
| 74 |
+
Parameters
|
| 75 |
+
----------
|
| 76 |
+
cdir : str
|
| 77 |
+
Directory to search
|
| 78 |
+
conf_dict : dict(str, dict)
|
| 79 |
+
This dict will be mutated
|
| 80 |
+
"""
|
| 81 |
+
if not os.path.isdir(cdir):
|
| 82 |
+
return
|
| 83 |
+
allfiles = sorted(os.listdir(cdir))
|
| 84 |
+
for fn in allfiles:
|
| 85 |
+
if fn.endswith(".ini"):
|
| 86 |
+
ini = configparser.ConfigParser()
|
| 87 |
+
ini.read(os.path.join(cdir, fn))
|
| 88 |
+
for key in ini:
|
| 89 |
+
if key == "DEFAULT":
|
| 90 |
+
continue
|
| 91 |
+
conf_dict.setdefault(key, {}).update(dict(ini[key]))
|
| 92 |
+
if fn.endswith(".json"):
|
| 93 |
+
with open(os.path.join(cdir, fn)) as f:
|
| 94 |
+
js = json.load(f)
|
| 95 |
+
for key in js:
|
| 96 |
+
conf_dict.setdefault(key, {}).update(dict(js[key]))
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def apply_config(cls, kwargs, conf_dict=None):
|
| 100 |
+
"""Supply default values for kwargs when instantiating class
|
| 101 |
+
|
| 102 |
+
Augments the passed kwargs, by finding entries in the config dict
|
| 103 |
+
which match the classes ``.protocol`` attribute (one or more str)
|
| 104 |
+
|
| 105 |
+
Parameters
|
| 106 |
+
----------
|
| 107 |
+
cls : file system implementation
|
| 108 |
+
kwargs : dict
|
| 109 |
+
conf_dict : dict of dict
|
| 110 |
+
Typically this is the global configuration
|
| 111 |
+
|
| 112 |
+
Returns
|
| 113 |
+
-------
|
| 114 |
+
dict : the modified set of kwargs
|
| 115 |
+
"""
|
| 116 |
+
if conf_dict is None:
|
| 117 |
+
conf_dict = conf
|
| 118 |
+
protos = cls.protocol if isinstance(cls.protocol, (tuple, list)) else [cls.protocol]
|
| 119 |
+
kw = {}
|
| 120 |
+
for proto in protos:
|
| 121 |
+
# default kwargs from the current state of the config
|
| 122 |
+
if proto in conf_dict:
|
| 123 |
+
kw.update(conf_dict[proto])
|
| 124 |
+
# explicit kwargs always win
|
| 125 |
+
kw.update(**kwargs)
|
| 126 |
+
kwargs = kw
|
| 127 |
+
return kwargs
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
set_conf_files(conf_dir, conf)
|
| 131 |
+
set_conf_env(conf)
|
.venv/lib/python3.11/site-packages/fsspec/conftest.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
import subprocess
|
| 4 |
+
import sys
|
| 5 |
+
import time
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
|
| 9 |
+
import fsspec
|
| 10 |
+
from fsspec.implementations.cached import CachingFileSystem
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@pytest.fixture()
|
| 14 |
+
def m():
|
| 15 |
+
"""
|
| 16 |
+
Fixture providing a memory filesystem.
|
| 17 |
+
"""
|
| 18 |
+
m = fsspec.filesystem("memory")
|
| 19 |
+
m.store.clear()
|
| 20 |
+
m.pseudo_dirs.clear()
|
| 21 |
+
m.pseudo_dirs.append("")
|
| 22 |
+
try:
|
| 23 |
+
yield m
|
| 24 |
+
finally:
|
| 25 |
+
m.store.clear()
|
| 26 |
+
m.pseudo_dirs.clear()
|
| 27 |
+
m.pseudo_dirs.append("")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@pytest.fixture
|
| 31 |
+
def ftp_writable(tmpdir):
|
| 32 |
+
"""
|
| 33 |
+
Fixture providing a writable FTP filesystem.
|
| 34 |
+
"""
|
| 35 |
+
pytest.importorskip("pyftpdlib")
|
| 36 |
+
from fsspec.implementations.ftp import FTPFileSystem
|
| 37 |
+
|
| 38 |
+
FTPFileSystem.clear_instance_cache() # remove lingering connections
|
| 39 |
+
CachingFileSystem.clear_instance_cache()
|
| 40 |
+
d = str(tmpdir)
|
| 41 |
+
with open(os.path.join(d, "out"), "wb") as f:
|
| 42 |
+
f.write(b"hello" * 10000)
|
| 43 |
+
P = subprocess.Popen(
|
| 44 |
+
[sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"]
|
| 45 |
+
)
|
| 46 |
+
try:
|
| 47 |
+
time.sleep(1)
|
| 48 |
+
yield "localhost", 2121, "user", "pass"
|
| 49 |
+
finally:
|
| 50 |
+
P.terminate()
|
| 51 |
+
P.wait()
|
| 52 |
+
try:
|
| 53 |
+
shutil.rmtree(tmpdir)
|
| 54 |
+
except Exception:
|
| 55 |
+
pass
|
.venv/lib/python3.11/site-packages/fsspec/core.py
ADDED
|
@@ -0,0 +1,743 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import io
|
| 4 |
+
import logging
|
| 5 |
+
import os
|
| 6 |
+
import re
|
| 7 |
+
from glob import has_magic
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# for backwards compat, we export cache things from here too
|
| 11 |
+
from fsspec.caching import ( # noqa: F401
|
| 12 |
+
BaseCache,
|
| 13 |
+
BlockCache,
|
| 14 |
+
BytesCache,
|
| 15 |
+
MMapCache,
|
| 16 |
+
ReadAheadCache,
|
| 17 |
+
caches,
|
| 18 |
+
)
|
| 19 |
+
from fsspec.compression import compr
|
| 20 |
+
from fsspec.config import conf
|
| 21 |
+
from fsspec.registry import filesystem, get_filesystem_class
|
| 22 |
+
from fsspec.utils import (
|
| 23 |
+
_unstrip_protocol,
|
| 24 |
+
build_name_function,
|
| 25 |
+
infer_compression,
|
| 26 |
+
stringify_path,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
logger = logging.getLogger("fsspec")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class OpenFile:
    """
    File-like object to be used in a context

    Can layer (buffered) text-mode and compression over any file-system, which
    are typically binary-only.

    These instances are safe to serialize, as the low-level file object
    is not created until invoked using ``with``.

    Parameters
    ----------
    fs: FileSystem
        The file system to use for opening the file. Should be a subclass or duck-type
        with ``fsspec.spec.AbstractFileSystem``
    path: str
        Location to open
    mode: str like 'rb', optional
        Mode of the opened file
    compression: str or None, optional
        Compression to apply
    encoding: str or None, optional
        The encoding to use if opened in text mode.
    errors: str or None, optional
        How to handle encoding errors if opened in text mode.
    newline: None or str
        Passed to TextIOWrapper in text mode, how to handle line endings.
    """

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        compression=None,
        encoding=None,
        errors=None,
        newline=None,
    ):
        self.fs = fs
        self.path = path
        self.mode = mode
        # Resolve "infer" (and validate) up-front so bad values fail early
        self.compression = get_compression(path, compression)
        self.encoding = encoding
        self.errors = errors
        self.newline = newline
        # Stack of layered file objects, lowest-level first; filled by __enter__
        self.fobjects = []

    def __reduce__(self):
        # Pickle by constructor arguments only; live file handles are never
        # serialized (they are re-created on demand by ``with``).
        return (
            OpenFile,
            (
                self.fs,
                self.path,
                self.mode,
                self.compression,
                self.encoding,
                self.errors,
                self.newline,
            ),
        )

    def __repr__(self):
        return f"<OpenFile '{self.path}'>"

    def __enter__(self):
        # The low-level file is always opened binary; text decoding is layered
        mode = self.mode.replace("t", "").replace("b", "") + "b"

        try:
            f = self.fs.open(self.path, mode=mode)
        except FileNotFoundError as e:
            if has_magic(self.path):
                # Hint that the caller probably wanted glob expansion. The
                # path is formatted into the message (the previous "%s" was
                # never interpolated, leaving a confusing two-part args tuple).
                raise FileNotFoundError(
                    f"{self.path} not found. The URL contains glob characters: "
                    "you maybe needed\n"
                    "to pass expand=True in fsspec.open() or the storage_options of \n"
                    "your library. You can also set the config value 'open_expand'\n"
                    "before import, or fsspec.core.DEFAULT_EXPAND at runtime, to True."
                ) from e
            raise

        self.fobjects = [f]

        if self.compression is not None:
            compress = compr[self.compression]
            f = compress(f, mode=mode[0])
            self.fobjects.append(f)

        if "b" not in self.mode:
            # assume, for example, that 'r' is equivalent to 'rt' as in builtin
            f = PickleableTextIOWrapper(
                f, encoding=self.encoding, errors=self.errors, newline=self.newline
            )
            self.fobjects.append(f)

        # The outermost layer is what the user interacts with
        return self.fobjects[-1]

    def __exit__(self, *args):
        self.close()

    @property
    def full_name(self):
        """Path with the filesystem's protocol prefix re-attached."""
        return _unstrip_protocol(self.path, self.fs)

    def open(self):
        """Materialise this as a real open file without context

        The OpenFile object should be explicitly closed to avoid enclosed file
        instances persisting. You must, therefore, keep a reference to the OpenFile
        during the life of the file-like it generates.
        """
        return self.__enter__()

    def close(self):
        """Close all encapsulated file objects"""
        # Close from the outermost layer inwards, flushing writable layers
        # so buffered/compressed data reaches the backing store.
        for f in reversed(self.fobjects):
            if "r" not in self.mode and not f.closed:
                f.flush()
            f.close()
        self.fobjects.clear()
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
class OpenFiles(list):
    """List of OpenFile instances

    Can be used in a single context, which opens and closes all of the
    contained files. Normal list access to get the elements works as
    normal.

    A special case is made for caching filesystems - the files will
    be down/uploaded together at the start or end of the context, and
    this may happen concurrently, if the target filesystem supports it.
    """

    def __init__(self, *args, mode="rb", fs=None):
        self.mode = mode
        self.fs = fs
        self.files = []
        super().__init__(*args)

    def _batching_fs(self):
        """Walk the chain of wrapped filesystems, returning the first that
        implements ``open_many`` (i.e., can batch transfers), or None."""
        fs = self.fs
        while fs is not None:
            if hasattr(fs, "open_many"):
                return fs
            fs = getattr(fs, "fs", None)
        return None

    def __enter__(self):
        if self.fs is None:
            raise ValueError("Context has already been used")

        batcher = self._batching_fs()
        if batcher is not None:
            # Concurrent cache download; or set up for a batched upload
            self.files = batcher.open_many(self)
            return self.files
        return [of.__enter__() for of in self]

    def __exit__(self, *args):
        for of in self:
            of.__exit__(*args)
        if "r" not in self.mode:
            batcher = self._batching_fs()
            if batcher is not None:
                # Concurrent cache upload of everything written in-context
                batcher.commit_many(self.files)

    def __getitem__(self, item):
        selected = super().__getitem__(item)
        if isinstance(item, slice):
            # Slices keep the context-manager machinery intact
            selected = OpenFiles(selected, mode=self.mode, fs=self.fs)
        return selected

    def __repr__(self):
        return f"<List of {len(self)} OpenFile instances>"
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def open_files(
    urlpath,
    mode="rb",
    compression=None,
    encoding="utf8",
    errors=None,
    name_function=None,
    num=1,
    protocol=None,
    newline=None,
    auto_mkdir=True,
    expand=True,
    **kwargs,
):
    """Given a path or paths, return a list of ``OpenFile`` objects.

    For writing, a str path must contain the "*" character, which will be filled
    in by increasing numbers, e.g., "part*" -> "part1", "part2" if num=2.

    For either reading or writing, can instead provide explicit list of paths.

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
        to read from alternative filesystems. To read from multiple files you
        can pass a globstring or a list of paths, with the caveat that they
        must all have the same protocol.
    mode: 'rb', 'wt', etc.
    compression: string or None
        If given, open file using compression codec. Can either be a compression
        name (a key in ``fsspec.compression.compr``) or "infer" to guess the
        compression from the filename suffix.
    encoding: str
        For text mode only
    errors: None or str
        Passed to TextIOWrapper in text mode
    name_function: function or None
        if opening a set of files for writing, those files do not yet exist,
        so we need to generate their names by formatting the urlpath for
        each sequence number
    num: int [1]
        if writing mode, number of files we expect to create (passed to
        name_function)
    protocol: str or None
        If given, overrides the protocol found in the URL.
    newline: bytes or None
        Used for line terminator in text mode. If None, uses system default;
        if blank, uses no translation.
    auto_mkdir: bool (True)
        If in write mode, this will ensure the target directory exists before
        writing, by calling ``fs.mkdirs(exist_ok=True)``.
    expand: bool
        Whether to expand string paths containing glob characters (and, in
        write mode, "*" masks); if False, paths are taken literally.
    **kwargs: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Examples
    --------
    >>> files = open_files('2015-*-*.csv')  # doctest: +SKIP
    >>> files = open_files(
    ...     's3://bucket/2015-*-*.csv.gz', compression='gzip'
    ... )  # doctest: +SKIP

    Returns
    -------
    An ``OpenFiles`` instance, which is a list of ``OpenFile`` objects that can
    be used as a single context

    Notes
    -----
    For a full list of the available protocols and the implementations that
    they map across to see the latest online documentation:

    - For implementations built into ``fsspec`` see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
    - For implementations in separate packages see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
    """
    fs, fs_token, paths = get_fs_token_paths(
        urlpath,
        mode,
        num=num,
        name_function=name_function,
        storage_options=kwargs,
        protocol=protocol,
        expand=expand,
    )
    if fs.protocol == "file":
        # the local filesystem creates directories itself when this flag is set
        fs.auto_mkdir = auto_mkdir
    elif "r" not in mode and auto_mkdir:
        # for other backends, pre-create each distinct parent directory;
        # PermissionError is ignored for read-only/managed buckets
        parents = {fs._parent(path) for path in paths}
        for parent in parents:
            try:
                fs.makedirs(parent, exist_ok=True)
            except PermissionError:
                pass
    return OpenFiles(
        [
            OpenFile(
                fs,
                path,
                mode=mode,
                compression=compression,
                encoding=encoding,
                errors=errors,
                newline=newline,
            )
            for path in paths
        ],
        mode=mode,
        fs=fs,
    )
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
def _un_chain(path, kwargs):
    """Split a "::"-chained URL into ``(path, protocol, kwargs)`` triples.

    Returns one triple per link, outermost (leftmost) first. Per-protocol
    options are taken from ``kwargs[protocol]``; any remaining top-level
    kwargs are merged into the first (outermost) link's options.
    """
    # Avoid a circular import
    from fsspec.implementations.cached import CachingFileSystem

    if "::" in path:
        x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
        bits = []
        for p in path.split("::"):
            if "://" in p or x.match(p):
                bits.append(p)
            else:
                # a bare protocol name: normalise to "proto://"
                bits.append(p + "://")
    else:
        bits = [path]
    # [[url, protocol, kwargs], ...]
    out = []
    previous_bit = None
    kwargs = kwargs.copy()
    for bit in reversed(bits):
        # explicit "protocol" kwarg (only present once) wins over URL inference
        protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file"
        cls = get_filesystem_class(protocol)
        extra_kwargs = cls._get_kwargs_from_urls(bit)
        kws = kwargs.pop(protocol, {})
        if bit is bits[0]:
            # outermost link also receives any remaining un-namespaced kwargs
            kws.update(kwargs)
        # URL-derived kwargs only apply where not explicitly overridden
        kw = dict(
            **{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
            **kws,
        )
        bit = cls._strip_protocol(bit)
        if "target_protocol" not in kw and issubclass(cls, CachingFileSystem):
            # caching layers wrap the next (inner) link: pass its URL through
            bit = previous_bit
        out.append((bit, protocol, kw))
        previous_bit = bit
    out.reverse()
    return out
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
def url_to_fs(url, **kwargs):
    """
    Turn a fully-qualified, possibly "::"-chained URL into a filesystem.

    Parameters
    ----------
    url : str
        The fsspec-compatible URL
    **kwargs: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Returns
    -------
    filesystem : FileSystem
        The new filesystem discovered from ``url`` and created with
        ``**kwargs``.
    urlpath : str
        The file-system-specific URL for ``url``.
    """
    url = stringify_path(url)
    # Arguments consumed by fsspec.open() itself, never by a filesystem;
    # inspect could keep this in sync with open()'s signature.
    non_fs_args = {
        "compression",
        "encoding",
        "errors",
        "expand",
        "mode",
        "name_function",
        "newline",
        "num",
    }
    fs_kwargs = {key: val for key, val in kwargs.items() if key not in non_fs_args}
    chain = _un_chain(url, fs_kwargs)
    last = len(chain) - 1
    inkwargs = {}
    # Walk the chain inside-out, nesting each link under target_* keys
    for i, (urls, proto, kw) in enumerate(reversed(chain)):
        if i == last:
            inkwargs = dict(**kw, **inkwargs)
        else:
            inkwargs = {
                "target_options": dict(**kw, **inkwargs),
                "target_protocol": proto,
                "fo": urls,
            }
    urlpath, protocol, _ = chain[0]
    return filesystem(protocol, **inkwargs), urlpath
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
# Module-level default for the ``expand`` argument of fsspec.open(); seeded
# once at import time from the "open_expand" config key (False when unset)
# and overridable at runtime.
DEFAULT_EXPAND = conf.get("open_expand", False)
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
def open(
    urlpath,
    mode="rb",
    compression=None,
    encoding="utf8",
    errors=None,
    protocol=None,
    newline=None,
    expand=None,
    **kwargs,
):
    """Given a path or paths, return one ``OpenFile`` object.

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath. Prefix with a protocol like ``s3://``
        to read from alternative filesystems. Should not include glob
        character(s).
    mode: 'rb', 'wt', etc.
    compression: string or None
        If given, open file using compression codec. Can either be a compression
        name (a key in ``fsspec.compression.compr``) or "infer" to guess the
        compression from the filename suffix.
    encoding: str
        For text mode only
    errors: None or str
        Passed to TextIOWrapper in text mode
    protocol: str or None
        If given, overrides the protocol found in the URL.
    newline: bytes or None
        Used for line terminator in text mode. If None, uses system default;
        if blank, uses no translation.
    expand: bool or None
        Whether to regard file paths containing special glob characters as needing
        expansion (finding the first match) or absolute. Setting False allows using
        paths which do embed such characters. If None (default), this argument
        takes its value from the DEFAULT_EXPAND module variable, which takes
        its initial value from the "open_expand" config value at startup, which will
        be False if not set.
    **kwargs: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Examples
    --------
    >>> openfile = open('2015-01-01.csv')  # doctest: +SKIP
    >>> openfile = open(
    ...     's3://bucket/2015-01-01.csv.gz', compression='gzip'
    ... )  # doctest: +SKIP
    >>> with openfile as f:
    ...     df = pd.read_csv(f)  # doctest: +SKIP
    ...

    Returns
    -------
    ``OpenFile`` object.

    Notes
    -----
    For a full list of the available protocols and the implementations that
    they map across to see the latest online documentation:

    - For implementations built into ``fsspec`` see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
    - For implementations in separate packages see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
    """
    expand = DEFAULT_EXPAND if expand is None else expand
    out = open_files(
        urlpath=[urlpath],
        mode=mode,
        compression=compression,
        encoding=encoding,
        errors=errors,
        protocol=protocol,
        newline=newline,
        expand=expand,
        **kwargs,
    )
    # glob expansion may legitimately match zero files
    if not out:
        raise FileNotFoundError(urlpath)
    return out[0]
|
| 505 |
+
|
| 506 |
+
|
| 507 |
+
def open_local(
    url: str | list[str] | Path | list[Path],
    mode: str = "rb",
    **storage_options: dict,
) -> str | list[str]:
    """Resolve file(s) to concrete local paths.

    Works for genuinely local files, and for filesystems that download upon
    open (e.g., by file caching), in which case the local copy's path is
    returned.

    Parameters
    ----------
    url: str or list(str)
        Path(s) to resolve; may include a protocol and/or glob characters.
    mode: str
        Must be read mode
    storage_options:
        passed on to FS for or used by open_files (e.g., compression)
    """
    if "r" not in mode:
        raise ValueError("Can only ensure local files when reading")
    openfiles = open_files(url, mode=mode, **storage_options)
    fs = openfiles[0].fs
    if not getattr(fs, "local_file", False):
        raise ValueError(
            "open_local can only be used on a filesystem which"
            " has attribute local_file=True"
        )
    # Opening (then closing) the files is what triggers any download/caching
    with openfiles as handles:
        local_paths = [handle.name for handle in handles]
    single = isinstance(url, Path) or (isinstance(url, str) and not has_magic(url))
    return local_paths[0] if single else local_paths
|
| 538 |
+
|
| 539 |
+
|
| 540 |
+
def get_compression(urlpath, compression):
    """Resolve a compression argument, expanding "infer" from the filename.

    Returns None (no compression) or a key of ``fsspec.compression.compr``;
    any other value raises ValueError.
    """
    resolved = infer_compression(urlpath) if compression == "infer" else compression
    if resolved is None or resolved in compr:
        return resolved
    raise ValueError(f"Compression type {resolved} not supported")
|
| 546 |
+
|
| 547 |
+
|
| 548 |
+
def split_protocol(urlpath):
    """Return protocol, path pair"""
    urlpath = stringify_path(urlpath)
    head, sep, tail = urlpath.partition("://")
    if sep and len(head) > 1:
        # a single-character "protocol" is really a Windows drive letter
        return head, tail
    if urlpath.startswith("data:"):
        return urlpath.split(":", 1)
    return None, urlpath
|
| 559 |
+
|
| 560 |
+
|
| 561 |
+
def strip_protocol(urlpath):
    """Return only path part of full URL, according to appropriate backend"""
    # Delegate to the filesystem class for the URL's protocol, since
    # normalisation rules differ per backend (e.g., s3 vs local paths).
    protocol, _ = split_protocol(urlpath)
    cls = get_filesystem_class(protocol)
    return cls._strip_protocol(urlpath)
|
| 566 |
+
|
| 567 |
+
|
| 568 |
+
def expand_paths_if_needed(paths, mode, num, fs, name_function):
    """Expand paths if they have a ``*`` in them (write mode) or any of ``*?[]``
    in them (read mode).

    Parameters
    ----------
    paths: list of paths
    mode: str
        Mode in which to open files.
    num: int
        If opening in writing mode, number of files we expect to create.
    fs: filesystem object
        Used to glob-expand paths in read mode.
    name_function: callable
        If opening in writing mode, this callable is used to generate path
        names. Names are generated for each partition by
        ``urlpath.replace('*', name_function(partition_index))``.

    Returns
    -------
    list of expanded paths
    """
    expanded_paths = []
    paths = list(paths)

    # "x" (exclusive create) is a write mode too; this keeps list inputs
    # consistent with the single-path branch of get_fs_token_paths, which
    # already treats "x" like "w".
    if "w" in mode or "x" in mode:  # write mode
        if sum(1 for p in paths if "*" in p) > 1:
            raise ValueError(
                "When writing data, only one filename mask can be specified."
            )
        num = max(num, len(paths))

        for curr_path in paths:
            if "*" in curr_path:
                # expand using name_function
                expanded_paths.extend(_expand_paths(curr_path, name_function, num))
            else:
                expanded_paths.append(curr_path)
        # if we generated more paths than asked for, trim the list
        if len(expanded_paths) > num:
            expanded_paths = expanded_paths[:num]

    else:  # read mode
        for curr_path in paths:
            if has_magic(curr_path):
                # expand using glob
                expanded_paths.extend(fs.glob(curr_path))
            else:
                expanded_paths.append(curr_path)

    return expanded_paths
|
| 613 |
+
|
| 614 |
+
|
| 615 |
+
def get_fs_token_paths(
    urlpath,
    mode="rb",
    num=1,
    name_function=None,
    storage_options=None,
    protocol=None,
    expand=True,
):
    """Filesystem, deterministic token, and paths from a urlpath and options.

    Parameters
    ----------
    urlpath: string or iterable
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    mode: str, optional
        Mode in which to open files.
    num: int, optional
        If opening in writing mode, number of files we expect to create.
    name_function: callable, optional
        If opening in writing mode, this callable is used to generate path
        names. Names are generated for each partition by
        ``urlpath.replace('*', name_function(partition_index))``.
    storage_options: dict, optional
        Additional keywords to pass to the filesystem class.
    protocol: str or None
        To override the protocol specifier in the URL
    expand: bool
        Expand string paths for writing, assuming the path is a directory
    """
    if isinstance(urlpath, (list, tuple, set)):
        if not urlpath:
            raise ValueError("empty urlpath sequence")
        # use the first element to determine the protocol/chain for all
        urlpath0 = stringify_path(next(iter(urlpath)))
    else:
        urlpath0 = stringify_path(urlpath)
    storage_options = storage_options or {}
    if protocol:
        storage_options["protocol"] = protocol
    chain = _un_chain(urlpath0, storage_options or {})
    inkwargs = {}
    # Reverse iterate the chain, creating a nested target_* structure
    for i, ch in enumerate(reversed(chain)):
        urls, nested_protocol, kw = ch
        if i == len(chain) - 1:
            inkwargs = dict(**kw, **inkwargs)
            continue
        inkwargs["target_options"] = dict(**kw, **inkwargs)
        inkwargs["target_protocol"] = nested_protocol
        inkwargs["fo"] = urls
    paths, protocol, _ = chain[0]
    fs = filesystem(protocol, **inkwargs)
    if isinstance(urlpath, (list, tuple, set)):
        # every element must resolve through the same protocol
        pchains = [
            _un_chain(stringify_path(u), storage_options or {})[0] for u in urlpath
        ]
        if len({pc[1] for pc in pchains}) > 1:
            # f-string so the offending paths actually appear in the message
            # (previously an uninterpolated "%s" was passed as a second arg)
            raise ValueError(f"Protocol mismatch getting fs from {urlpath}")
        paths = [pc[0] for pc in pchains]
    else:
        paths = fs._strip_protocol(paths)
    if isinstance(paths, (list, tuple, set)):
        if expand:
            paths = expand_paths_if_needed(paths, mode, num, fs, name_function)
        elif not isinstance(paths, list):
            paths = list(paths)
    else:
        if ("w" in mode or "x" in mode) and expand:
            paths = _expand_paths(paths, name_function, num)
        elif "*" in paths:
            # glob, excluding directories, for deterministic read order
            paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
        else:
            paths = [paths]

    return fs, fs._fs_token, paths
|
| 691 |
+
|
| 692 |
+
|
| 693 |
+
def _expand_paths(path, name_function, num):
|
| 694 |
+
if isinstance(path, str):
|
| 695 |
+
if path.count("*") > 1:
|
| 696 |
+
raise ValueError("Output path spec must contain exactly one '*'.")
|
| 697 |
+
elif "*" not in path:
|
| 698 |
+
path = os.path.join(path, "*.part")
|
| 699 |
+
|
| 700 |
+
if name_function is None:
|
| 701 |
+
name_function = build_name_function(num - 1)
|
| 702 |
+
|
| 703 |
+
paths = [path.replace("*", name_function(i)) for i in range(num)]
|
| 704 |
+
if paths != sorted(paths):
|
| 705 |
+
logger.warning(
|
| 706 |
+
"In order to preserve order between partitions"
|
| 707 |
+
" paths created with ``name_function`` should "
|
| 708 |
+
"sort to partition order"
|
| 709 |
+
)
|
| 710 |
+
elif isinstance(path, (tuple, list)):
|
| 711 |
+
assert len(path) == num
|
| 712 |
+
paths = list(path)
|
| 713 |
+
else:
|
| 714 |
+
raise ValueError(
|
| 715 |
+
"Path should be either\n"
|
| 716 |
+
"1. A list of paths: ['foo.json', 'bar.json', ...]\n"
|
| 717 |
+
"2. A directory: 'foo/\n"
|
| 718 |
+
"3. A path with a '*' in it: 'foo.*.json'"
|
| 719 |
+
)
|
| 720 |
+
return paths
|
| 721 |
+
|
| 722 |
+
|
| 723 |
+
class PickleableTextIOWrapper(io.TextIOWrapper):
    """A picklable ``io.TextIOWrapper``.

    Plain TextIOWrapper instances cannot be pickled; this subclass remembers
    its constructor arguments so an equivalent wrapper can be rebuilt on
    unpickling. Requires that ``buffer`` itself be pickleable, which all
    instances of AbstractBufferedFile are.
    """

    def __init__(
        self,
        buffer,
        encoding=None,
        errors=None,
        newline=None,
        line_buffering=False,
        write_through=False,
    ):
        # record the exact arguments so __reduce__ can recreate this object
        self.args = (buffer, encoding, errors, newline, line_buffering, write_through)
        super().__init__(*self.args)

    def __reduce__(self):
        return PickleableTextIOWrapper, self.args
|
.venv/lib/python3.11/site-packages/fsspec/dircache.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
from collections.abc import MutableMapping
|
| 3 |
+
from functools import lru_cache
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class DirCache(MutableMapping):
    """
    Caching of directory listings, in a structure like::

        {"path0": [
            {"name": "path0/file0",
             "size": 123,
             "type": "file",
             ...
            },
            {"name": "path0/file1",
            },
            ...
            ],
         "path1": [...]
        }

    Parameters to this class control listing expiry or indeed turn
    caching off
    """

    def __init__(
        self,
        use_listings_cache=True,
        listings_expiry_time=None,
        max_paths=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        use_listings_cache: bool
            If False, this cache never returns items, but always reports KeyError,
            and setting items has no effect
        listings_expiry_time: int or float (optional)
            Time in seconds that a listing is considered valid. If None,
            listings do not expire.
        max_paths: int (optional)
            The number of most recent listings that are considered valid; 'recent'
            refers to when the entry was set.
        """
        # listing storage: path -> list of info dicts
        self._cache = {}
        # path -> time.time() when the entry was set (only populated when
        # listings_expiry_time is not None)
        self._times = {}
        if max_paths:
            # Recency tracker: an lru_cache over keys, sized one larger than
            # the limit.  Calling self._q(key) on a key already tracked is a
            # no-op (cached result), but calling it on a key that has fallen
            # out of the recency window runs the lambda, which evicts that
            # key from _cache as a side effect.
            self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(key, None))
        self.use_listings_cache = use_listings_cache
        self.listings_expiry_time = listings_expiry_time
        self.max_paths = max_paths

    def __getitem__(self, item):
        # Drop the entry first if it is older than the allowed age
        if self.listings_expiry_time is not None:
            if self._times.get(item, 0) - time.time() < -self.listings_expiry_time:
                del self._cache[item]
        if self.max_paths:
            # Touch the recency tracker; a key outside the window is popped
            # from _cache here, so the lookup below raises KeyError
            self._q(item)
        return self._cache[item]  # maybe raises KeyError

    def clear(self):
        # NOTE(review): only _cache is cleared; stale _times entries remain
        # but are harmless (they just re-trigger the expiry branch) — confirm.
        self._cache.clear()

    def __len__(self):
        return len(self._cache)

    def __contains__(self, item):
        # Route through __getitem__ so expiry/recency rules apply
        try:
            self[item]
            return True
        except KeyError:
            return False

    def __setitem__(self, key, value):
        if not self.use_listings_cache:
            # caching disabled: silently drop the entry
            return
        if self.max_paths:
            # register the key as recently used (may evict a stale copy)
            self._q(key)
        self._cache[key] = value
        if self.listings_expiry_time is not None:
            self._times[key] = time.time()

    def __delitem__(self, key):
        del self._cache[key]

    def __iter__(self):
        # Snapshot the keys, then yield only those still valid under the
        # expiry/recency rules (membership check re-validates each one)
        entries = list(self._cache)

        return (k for k in entries if k in self)

    def __reduce__(self):
        # Pickle only the configuration; cached listings are not carried over
        return (
            DirCache,
            (self.use_listings_cache, self.listings_expiry_time, self.max_paths),
        )
|
.venv/lib/python3.11/site-packages/fsspec/exceptions.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
fsspec user-defined exception classes
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import asyncio
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class BlocksizeMismatchError(ValueError):
    """
    Raised when a cached file is opened with a different blocksize than it was
    written with.
    """
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class FSTimeoutError(asyncio.TimeoutError):
    """
    Raised when an fsspec function call times out.
    """
|
.venv/lib/python3.11/site-packages/fsspec/fuse.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import stat
|
| 5 |
+
import threading
|
| 6 |
+
import time
|
| 7 |
+
from errno import EIO, ENOENT
|
| 8 |
+
|
| 9 |
+
from fuse import FUSE, FuseOSError, LoggingMixIn, Operations
|
| 10 |
+
|
| 11 |
+
from fsspec import __version__
|
| 12 |
+
from fsspec.core import url_to_fs
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger("fsspec.fuse")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class FUSEr(Operations):
    # Bridges fusepy's Operations callbacks onto an fsspec filesystem,
    # treating ``path`` on that filesystem as the mount root.
    def __init__(self, fs, path, ready_file=False):
        self.fs = fs
        # open-file table: FUSE integer handle -> fsspec file object
        self.cache = {}
        self.root = path.rstrip("/") + "/"
        # monotonically increasing file-handle counter
        self.counter = 0
        logger.info("Starting FUSE at %s", path)
        # if True, expose a synthetic ``.fuse_ready`` status file at the root
        self._ready_file = ready_file

    def getattr(self, path, fh=None):
        # Return a stat-like dict for ``path``; raises ENOENT if missing.
        logger.debug("getattr %s", path)
        if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
            return {"type": "file", "st_size": 5}

        path = "".join([self.root, path.lstrip("/")]).rstrip("/")
        try:
            info = self.fs.info(path)
        except FileNotFoundError as exc:
            raise FuseOSError(ENOENT) from exc

        # default uid/gid/permissions when the backend supplies none
        data = {"st_uid": info.get("uid", 1000), "st_gid": info.get("gid", 1000)}
        perm = info.get("mode", 0o777)

        if info["type"] != "file":
            data["st_mode"] = stat.S_IFDIR | perm
            data["st_size"] = 0
            data["st_blksize"] = 0
        else:
            data["st_mode"] = stat.S_IFREG | perm
            data["st_size"] = info["size"]
            data["st_blksize"] = 5 * 2**20
            data["st_nlink"] = 1
        # fall back to "now" when the backend carries no timestamps
        data["st_atime"] = info["atime"] if "atime" in info else time.time()
        data["st_ctime"] = info["ctime"] if "ctime" in info else time.time()
        data["st_mtime"] = info["mtime"] if "mtime" in info else time.time()
        return data

    def readdir(self, path, fh):
        # List entry basenames of directory ``path``, plus "." and "..".
        logger.debug("readdir %s", path)
        path = "".join([self.root, path.lstrip("/")])
        files = self.fs.ls(path, False)
        files = [os.path.basename(f.rstrip("/")) for f in files]
        return [".", ".."] + files

    def mkdir(self, path, mode):
        # NOTE(review): ``mode`` is ignored; backend default permissions apply
        path = "".join([self.root, path.lstrip("/")])
        self.fs.mkdir(path)
        return 0

    def rmdir(self, path):
        path = "".join([self.root, path.lstrip("/")])
        self.fs.rmdir(path)
        return 0

    def read(self, path, size, offset, fh):
        # Read ``size`` bytes at ``offset`` from the open handle ``fh``.
        logger.debug("read %s", (path, size, offset))
        if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
            # status indicator
            return b"ready"

        f = self.cache[fh]
        f.seek(offset)
        out = f.read(size)
        return out

    def write(self, path, data, offset, fh):
        # Write ``data`` at ``offset`` through the open handle ``fh``.
        logger.debug("write %s", (path, offset))
        f = self.cache[fh]
        f.seek(offset)
        f.write(data)
        return len(data)

    def create(self, path, flags, fi=None):
        # Create a new file and return a fresh integer handle for it.
        logger.debug("create %s", (path, flags))
        fn = "".join([self.root, path.lstrip("/")])
        self.fs.touch(fn)  # OS will want to get attributes immediately
        f = self.fs.open(fn, "wb")
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def open(self, path, flags):
        # Open an existing file; returns a new integer handle.
        logger.debug("open %s", (path, flags))
        fn = "".join([self.root, path.lstrip("/")])
        # NOTE(review): only the low bit of ``flags`` (O_WRONLY) is
        # inspected; O_RDWR (2) falls through to read mode — confirm intended.
        if flags % 2 == 0:
            # read
            mode = "rb"
        else:
            # write/create
            mode = "wb"
        self.cache[self.counter] = self.fs.open(fn, mode)
        self.counter += 1
        return self.counter - 1

    def truncate(self, path, length, fh=None):
        # Only truncate-to-zero is supported (emulated via touch).
        fn = "".join([self.root, path.lstrip("/")])
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.fs.touch(fn)

    def unlink(self, path):
        fn = "".join([self.root, path.lstrip("/")])
        try:
            self.fs.rm(fn, False)
        except (OSError, FileNotFoundError) as exc:
            raise FuseOSError(EIO) from exc

    def release(self, path, fh):
        # Close and drop the cached file handle.  NOTE(review): any error
        # here is printed and swallowed so the kernel always sees success.
        try:
            if fh in self.cache:
                f = self.cache[fh]
                f.close()
                self.cache.pop(fh)
        except Exception as e:
            print(e)
        return 0

    def chmod(self, path, mode):
        # Delegate to the backend if it supports chmod; otherwise refuse.
        if hasattr(self.fs, "chmod"):
            path = "".join([self.root, path.lstrip("/")])
            return self.fs.chmod(path, mode)
        raise NotImplementedError
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def run(
    fs,
    path,
    mount_point,
    foreground=True,
    threads=False,
    ready_file=False,
    ops_class=FUSEr,
):
    """Mount stuff in a local directory

    This uses fusepy to make it appear as if a given path on an fsspec
    instance is in fact resident within the local file-system.

    This requires that fusepy by installed, and that FUSE be available on
    the system (typically requiring a package to be installed with
    apt, yum, brew, etc.).

    Parameters
    ----------
    fs: file-system instance
        From one of the compatible implementations
    path: str
        Location on that file-system to regard as the root directory to
        mount. Note that you typically should include the terminating "/"
        character.
    mount_point: str
        An empty directory on the local file-system where the contents of
        the remote path will appear.
    foreground: bool
        Whether or not calling this function will block. Operation will
        typically be more stable if True.
    threads: bool
        Whether or not to create threads when responding to file operations
        within the mounter directory. Operation will typically be more
        stable if False.
    ready_file: bool
        Whether the FUSE process is ready. The ``.fuse_ready`` file will
        exist in the ``mount_point`` directory if True. Debugging purpose.
    ops_class: FUSEr or Subclass of FUSEr
        To override the default behavior of FUSEr. For Example, logging
        to file.

    Returns
    -------
    ``threading.Thread`` running the mount when ``foreground`` is False,
    otherwise None (the call blocks until unmounted or interrupted).
    """

    # Named function instead of a lambda assignment (PEP 8 E731); this
    # blocks inside FUSE() until unmount when foreground=True.
    def _mount():
        return FUSE(
            ops_class(fs, path, ready_file=ready_file),
            mount_point,
            nothreads=not threads,
            foreground=foreground,
        )

    if not foreground:
        # Run the blocking FUSE loop in a daemon thread and hand the
        # thread back to the caller so it can be joined/monitored.
        th = threading.Thread(target=_mount)
        th.daemon = True
        th.start()
        return th
    else:  # pragma: no cover
        try:
            _mount()
        except KeyboardInterrupt:
            # Ctrl-C simply unmounts; not an error
            pass
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def main(args):
    """Mount filesystem from chained URL to MOUNT_POINT.

    Examples:

    python3 -m fsspec.fuse memory /usr/share /tmp/mem

    python3 -m fsspec.fuse local /tmp/source /tmp/local \\
            -l /tmp/fsspecfuse.log

    You can also mount chained-URLs and use special settings:

    python3 -m fsspec.fuse 'filecache::zip::file://data.zip' \\
            / /tmp/zip \\
            -o 'filecache-cache_storage=/tmp/simplecache'

    You can specify the type of the setting by using `[int]` or `[bool]`,
    (`true`, `yes`, `1` represents the Boolean value `True`):

    python3 -m fsspec.fuse 'simplecache::ftp://ftp1.at.proftpd.org' \\
            /historic/packages/RPMS /tmp/ftp \\
            -o 'simplecache-cache_storage=/tmp/simplecache' \\
            -o 'simplecache-check_files=false[bool]' \\
            -o 'ftp-listings_expiry_time=60[int]' \\
            -o 'ftp-username=anonymous' \\
            -o 'ftp-password=xieyanbo'
    """

    # ArgumentParser whose help keeps the raw (unwrapped) description text
    class RawDescriptionArgumentParser(argparse.ArgumentParser):
        def format_help(self):
            usage = super().format_help()
            parts = usage.split("\n\n")
            parts[1] = self.description.rstrip()
            return "\n\n".join(parts)

    parser = RawDescriptionArgumentParser(prog="fsspec.fuse", description=main.__doc__)
    parser.add_argument("--version", action="version", version=__version__)
    parser.add_argument("url", type=str, help="fs url")
    parser.add_argument("source_path", type=str, help="source directory in fs")
    parser.add_argument("mount_point", type=str, help="local directory")
    parser.add_argument(
        "-o",
        "--option",
        action="append",
        help="Any options of protocol included in the chained URL",
    )
    parser.add_argument(
        "-l", "--log-file", type=str, help="Logging FUSE debug info (Default: '')"
    )
    # NOTE(review): store_false makes the parsed attribute default to True,
    # and passing the flag sets it to False — the opposite of what the
    # help text "(Default: False)" suggests; confirm intended for -f/-t/-r.
    parser.add_argument(
        "-f",
        "--foreground",
        action="store_false",
        help="Running in foreground or not (Default: False)",
    )
    parser.add_argument(
        "-t",
        "--threads",
        action="store_false",
        help="Running with threads support (Default: False)",
    )
    parser.add_argument(
        "-r",
        "--ready-file",
        action="store_false",
        help="The `.fuse_ready` file will exist after FUSE is ready. "
        "(Debugging purpose, Default: False)",
    )
    args = parser.parse_args(args)

    # Parse -o options of the form "protocol-setting=value[type]" into a
    # nested kwargs dict suitable for url_to_fs.
    kwargs = {}
    for item in args.option or []:
        key, sep, value = item.partition("=")
        if not sep:
            parser.error(message=f"Wrong option: {item!r}")
        val = value.lower()
        if val.endswith("[int]"):
            value = int(value[: -len("[int]")])
        elif val.endswith("[bool]"):
            value = val[: -len("[bool]")] in ["1", "yes", "true"]

        if "-" in key:
            # "proto-setting": group under the protocol's option dict
            fs_name, setting_name = key.split("-", 1)
            if fs_name in kwargs:
                kwargs[fs_name][setting_name] = value
            else:
                kwargs[fs_name] = {setting_name: value}
        else:
            kwargs[key] = value

    if args.log_file:
        logging.basicConfig(
            level=logging.DEBUG,
            filename=args.log_file,
            format="%(asctime)s %(message)s",
        )

        # Mix in fusepy's LoggingMixIn so every FUSE call is logged
        class LoggingFUSEr(FUSEr, LoggingMixIn):
            pass

        fuser = LoggingFUSEr
    else:
        fuser = FUSEr

    fs, url_path = url_to_fs(args.url, **kwargs)
    logger.debug("Mounting %s to %s", url_path, str(args.mount_point))
    run(
        fs,
        args.source_path,
        args.mount_point,
        foreground=args.foreground,
        threads=args.threads,
        ready_file=args.ready_file,
        ops_class=fuser,
    )
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
if __name__ == "__main__":
|
| 322 |
+
import sys
|
| 323 |
+
|
| 324 |
+
main(sys.argv[1:])
|
.venv/lib/python3.11/site-packages/fsspec/generic.py
ADDED
|
@@ -0,0 +1,411 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import inspect
|
| 4 |
+
import logging
|
| 5 |
+
import os
|
| 6 |
+
import shutil
|
| 7 |
+
import uuid
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
from .asyn import AsyncFileSystem, _run_coros_in_chunks, sync_wrapper
|
| 11 |
+
from .callbacks import DEFAULT_CALLBACK
|
| 12 |
+
from .core import filesystem, get_filesystem_class, split_protocol, url_to_fs
|
| 13 |
+
|
| 14 |
+
_generic_fs = {}
|
| 15 |
+
logger = logging.getLogger("fsspec.generic")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def set_generic_fs(protocol, **storage_options):
    """Register an FS instance for ``protocol``, for use with method="generic"."""
    instance = filesystem(protocol, **storage_options)
    _generic_fs[protocol] = instance
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
default_method = "default"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _resolve_fs(url, method=None, protocol=None, storage_options=None):
|
| 26 |
+
"""Pick instance of backend FS"""
|
| 27 |
+
method = method or default_method
|
| 28 |
+
protocol = protocol or split_protocol(url)[0]
|
| 29 |
+
storage_options = storage_options or {}
|
| 30 |
+
if method == "default":
|
| 31 |
+
return filesystem(protocol)
|
| 32 |
+
if method == "generic":
|
| 33 |
+
return _generic_fs[protocol]
|
| 34 |
+
if method == "current":
|
| 35 |
+
cls = get_filesystem_class(protocol)
|
| 36 |
+
return cls.current()
|
| 37 |
+
if method == "options":
|
| 38 |
+
fs, _ = url_to_fs(url, **storage_options.get(protocol, {}))
|
| 39 |
+
return fs
|
| 40 |
+
raise ValueError(f"Unknown FS resolution method: {method}")
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def rsync(
    source,
    destination,
    delete_missing=False,
    source_field="size",
    dest_field="size",
    update_cond="different",
    inst_kwargs=None,
    fs=None,
    **kwargs,
):
    """Sync files between two directory trees

    (experimental)

    Parameters
    ----------
    source: str
        Root of the directory tree to take files from. This must be a directory, but
        do not include any terminating "/" character
    destination: str
        Root path to copy into. The contents of this location should be
        identical to the contents of ``source`` when done. This will be made a
        directory, and the terminal "/" should not be included.
    delete_missing: bool
        If there are paths in the destination that don't exist in the
        source and this is True, delete them. Otherwise, leave them alone.
    source_field: str | callable
        If ``update_field`` is "different", this is the key in the info
        of source files to consider for difference. Maybe a function of the
        info dict.
    dest_field: str | callable
        If ``update_field`` is "different", this is the key in the info
        of destination files to consider for difference. May be a function of
        the info dict.
    update_cond: "different"|"always"|"never"
        If "always", every file is copied, regardless of whether it exists in
        the destination. If "never", files that exist in the destination are
        not copied again. If "different" (default), only copy if the info
        fields given by ``source_field`` and ``dest_field`` (usually "size")
        are different. Other comparisons may be added in the future.
    inst_kwargs: dict|None
        If ``fs`` is None, use this set of keyword arguments to make a
        GenericFileSystem instance
    fs: GenericFileSystem|None
        Instance to use if explicitly given. The instance defines how to
        to make downstream file system instances from paths.

    Returns
    -------
    dict of the copy operations that were performed, {source: destination}
    """
    fs = fs or GenericFileSystem(**(inst_kwargs or {}))
    source = fs._strip_protocol(source)
    destination = fs._strip_protocol(destination)
    allfiles = fs.find(source, withdirs=True, detail=True)
    if not fs.isdir(source):
        raise ValueError("Can only rsync on a directory")
    otherfiles = fs.find(destination, withdirs=True, detail=True)
    # directories present in source but absent in destination
    dirs = [
        a
        for a, v in allfiles.items()
        if v["type"] == "directory" and a.replace(source, destination) not in otherfiles
    ]
    logger.debug(f"{len(dirs)} directories to create")
    if dirs:
        fs.make_many_dirs(
            [dirn.replace(source, destination) for dirn in dirs], exist_ok=True
        )
    # from here on, only plain files are considered
    allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"}
    logger.debug(f"{len(allfiles)} files to consider for copy")
    # destination files with no counterpart in source (candidates for deletion)
    to_delete = [
        o
        for o, v in otherfiles.items()
        if o.replace(destination, source) not in allfiles and v["type"] == "file"
    ]
    # Decide per file whether to copy; allfiles is mutated in place from
    # {src: info} to {src: dst} for the files that will be copied.
    for k, v in allfiles.copy().items():
        otherfile = k.replace(source, destination)
        if otherfile in otherfiles:
            if update_cond == "always":
                allfiles[k] = otherfile
            elif update_cond == "different":
                inf1 = source_field(v) if callable(source_field) else v[source_field]
                v2 = otherfiles[otherfile]
                inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field]
                if inf1 != inf2:
                    # details mismatch, make copy
                    allfiles[k] = otherfile
                else:
                    # details match, don't copy
                    allfiles.pop(k)
            # NOTE(review): update_cond == "never" leaves the {src: info}
            # entry in place, so it is still passed to fs.cp below — confirm.
        else:
            # file not in target yet
            allfiles[k] = otherfile
    logger.debug(f"{len(allfiles)} files to copy")
    if allfiles:
        source_files, target_files = zip(*allfiles.items())
        fs.cp(source_files, target_files, **kwargs)
    logger.debug(f"{len(to_delete)} files to delete")
    if delete_missing and to_delete:
        fs.rm(to_delete)
    return allfiles
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
class GenericFileSystem(AsyncFileSystem):
|
| 148 |
+
"""Wrapper over all other FS types
|
| 149 |
+
|
| 150 |
+
<experimental!>
|
| 151 |
+
|
| 152 |
+
This implementation is a single unified interface to be able to run FS operations
|
| 153 |
+
over generic URLs, and dispatch to the specific implementations using the URL
|
| 154 |
+
protocol prefix.
|
| 155 |
+
|
| 156 |
+
Note: instances of this FS are always async, even if you never use it with any async
|
| 157 |
+
backend.
|
| 158 |
+
"""
|
| 159 |
+
|
| 160 |
+
protocol = "generic" # there is no real reason to ever use a protocol with this FS
|
| 161 |
+
|
| 162 |
+
    def __init__(self, default_method="default", **kwargs):
        """

        Parameters
        ----------
        default_method: str (optional)
            Defines how to configure backend FS instances. Options are:
            - "default": instantiate like FSClass(), with no
              extra arguments; this is the default instance of that FS, and can be
              configured via the config system
            - "generic": takes instances from the `_generic_fs` dict in this module,
              which you must populate before use. Keys are by protocol
            - "current": takes the most recently instantiated version of each FS
        """
        # stored and consulted by every dispatch through _resolve_fs
        self.method = default_method
        super().__init__(**kwargs)
|
| 178 |
+
|
| 179 |
+
def _parent(self, path):
|
| 180 |
+
fs = _resolve_fs(path, self.method)
|
| 181 |
+
return fs.unstrip_protocol(fs._parent(path))
|
| 182 |
+
|
| 183 |
+
def _strip_protocol(self, path):
|
| 184 |
+
# normalization only
|
| 185 |
+
fs = _resolve_fs(path, self.method)
|
| 186 |
+
return fs.unstrip_protocol(fs._strip_protocol(path))
|
| 187 |
+
|
| 188 |
+
    async def _find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
        """Recursively list ``path`` on its backend FS.

        Names in the result carry their protocol prefix restored, so they
        remain valid inputs to this generic filesystem.
        """
        fs = _resolve_fs(path, self.method)
        # always request detail from the backend; reduce at the end if needed
        if fs.async_impl:
            out = await fs._find(
                path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
            )
        else:
            out = fs.find(
                path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
            )
        result = {}
        for k, v in out.items():
            v = v.copy()  # don't corrupt target FS dircache
            name = fs.unstrip_protocol(k)
            v["name"] = name
            result[name] = v
        if detail:
            return result
        return list(result)
|
| 207 |
+
|
| 208 |
+
async def _info(self, url, **kwargs):
|
| 209 |
+
fs = _resolve_fs(url, self.method)
|
| 210 |
+
if fs.async_impl:
|
| 211 |
+
out = await fs._info(url, **kwargs)
|
| 212 |
+
else:
|
| 213 |
+
out = fs.info(url, **kwargs)
|
| 214 |
+
out = out.copy() # don't edit originals
|
| 215 |
+
out["name"] = fs.unstrip_protocol(out["name"])
|
| 216 |
+
return out
|
| 217 |
+
|
| 218 |
+
async def _ls(
|
| 219 |
+
self,
|
| 220 |
+
url,
|
| 221 |
+
detail=True,
|
| 222 |
+
**kwargs,
|
| 223 |
+
):
|
| 224 |
+
fs = _resolve_fs(url, self.method)
|
| 225 |
+
if fs.async_impl:
|
| 226 |
+
out = await fs._ls(url, detail=True, **kwargs)
|
| 227 |
+
else:
|
| 228 |
+
out = fs.ls(url, detail=True, **kwargs)
|
| 229 |
+
out = [o.copy() for o in out] # don't edit originals
|
| 230 |
+
for o in out:
|
| 231 |
+
o["name"] = fs.unstrip_protocol(o["name"])
|
| 232 |
+
if detail:
|
| 233 |
+
return out
|
| 234 |
+
else:
|
| 235 |
+
return [o["name"] for o in out]
|
| 236 |
+
|
| 237 |
+
async def _cat_file(
|
| 238 |
+
self,
|
| 239 |
+
url,
|
| 240 |
+
**kwargs,
|
| 241 |
+
):
|
| 242 |
+
fs = _resolve_fs(url, self.method)
|
| 243 |
+
if fs.async_impl:
|
| 244 |
+
return await fs._cat_file(url, **kwargs)
|
| 245 |
+
else:
|
| 246 |
+
return fs.cat_file(url, **kwargs)
|
| 247 |
+
|
| 248 |
+
async def _pipe_file(
|
| 249 |
+
self,
|
| 250 |
+
path,
|
| 251 |
+
value,
|
| 252 |
+
**kwargs,
|
| 253 |
+
):
|
| 254 |
+
fs = _resolve_fs(path, self.method)
|
| 255 |
+
if fs.async_impl:
|
| 256 |
+
return await fs._pipe_file(path, value, **kwargs)
|
| 257 |
+
else:
|
| 258 |
+
return fs.pipe_file(path, value, **kwargs)
|
| 259 |
+
|
| 260 |
+
async def _rm(self, url, **kwargs):
|
| 261 |
+
urls = url
|
| 262 |
+
if isinstance(urls, str):
|
| 263 |
+
urls = [urls]
|
| 264 |
+
fs = _resolve_fs(urls[0], self.method)
|
| 265 |
+
if fs.async_impl:
|
| 266 |
+
await fs._rm(urls, **kwargs)
|
| 267 |
+
else:
|
| 268 |
+
fs.rm(url, **kwargs)
|
| 269 |
+
|
| 270 |
+
async def _makedirs(self, path, exist_ok=False):
|
| 271 |
+
logger.debug("Make dir %s", path)
|
| 272 |
+
fs = _resolve_fs(path, self.method)
|
| 273 |
+
if fs.async_impl:
|
| 274 |
+
await fs._makedirs(path, exist_ok=exist_ok)
|
| 275 |
+
else:
|
| 276 |
+
fs.makedirs(path, exist_ok=exist_ok)
|
| 277 |
+
|
| 278 |
+
def rsync(self, source, destination, **kwargs):
|
| 279 |
+
"""Sync files between two directory trees
|
| 280 |
+
|
| 281 |
+
See `func:rsync` for more details.
|
| 282 |
+
"""
|
| 283 |
+
rsync(source, destination, fs=self, **kwargs)
|
| 284 |
+
|
| 285 |
+
async def _cp_file(
|
| 286 |
+
self,
|
| 287 |
+
url,
|
| 288 |
+
url2,
|
| 289 |
+
blocksize=2**20,
|
| 290 |
+
callback=DEFAULT_CALLBACK,
|
| 291 |
+
**kwargs,
|
| 292 |
+
):
|
| 293 |
+
fs = _resolve_fs(url, self.method)
|
| 294 |
+
fs2 = _resolve_fs(url2, self.method)
|
| 295 |
+
if fs is fs2:
|
| 296 |
+
# pure remote
|
| 297 |
+
if fs.async_impl:
|
| 298 |
+
return await fs._cp_file(url, url2, **kwargs)
|
| 299 |
+
else:
|
| 300 |
+
return fs.cp_file(url, url2, **kwargs)
|
| 301 |
+
kw = {"blocksize": 0, "cache_type": "none"}
|
| 302 |
+
try:
|
| 303 |
+
f1 = (
|
| 304 |
+
await fs.open_async(url, "rb")
|
| 305 |
+
if hasattr(fs, "open_async")
|
| 306 |
+
else fs.open(url, "rb", **kw)
|
| 307 |
+
)
|
| 308 |
+
callback.set_size(await maybe_await(f1.size))
|
| 309 |
+
f2 = (
|
| 310 |
+
await fs2.open_async(url2, "wb")
|
| 311 |
+
if hasattr(fs2, "open_async")
|
| 312 |
+
else fs2.open(url2, "wb", **kw)
|
| 313 |
+
)
|
| 314 |
+
while f1.size is None or f2.tell() < f1.size:
|
| 315 |
+
data = await maybe_await(f1.read(blocksize))
|
| 316 |
+
if f1.size is None and not data:
|
| 317 |
+
break
|
| 318 |
+
await maybe_await(f2.write(data))
|
| 319 |
+
callback.absolute_update(f2.tell())
|
| 320 |
+
finally:
|
| 321 |
+
try:
|
| 322 |
+
await maybe_await(f2.close())
|
| 323 |
+
await maybe_await(f1.close())
|
| 324 |
+
except NameError:
|
| 325 |
+
# fail while opening f1 or f2
|
| 326 |
+
pass
|
| 327 |
+
|
| 328 |
+
async def _make_many_dirs(self, urls, exist_ok=True):
|
| 329 |
+
fs = _resolve_fs(urls[0], self.method)
|
| 330 |
+
if fs.async_impl:
|
| 331 |
+
coros = [fs._makedirs(u, exist_ok=exist_ok) for u in urls]
|
| 332 |
+
await _run_coros_in_chunks(coros)
|
| 333 |
+
else:
|
| 334 |
+
for u in urls:
|
| 335 |
+
fs.makedirs(u, exist_ok=exist_ok)
|
| 336 |
+
|
| 337 |
+
make_many_dirs = sync_wrapper(_make_many_dirs)
|
| 338 |
+
|
| 339 |
+
async def _copy(
|
| 340 |
+
self,
|
| 341 |
+
path1: list[str],
|
| 342 |
+
path2: list[str],
|
| 343 |
+
recursive: bool = False,
|
| 344 |
+
on_error: str = "ignore",
|
| 345 |
+
maxdepth: Optional[int] = None,
|
| 346 |
+
batch_size: Optional[int] = None,
|
| 347 |
+
tempdir: Optional[str] = None,
|
| 348 |
+
**kwargs,
|
| 349 |
+
):
|
| 350 |
+
if recursive:
|
| 351 |
+
raise NotImplementedError
|
| 352 |
+
fs = _resolve_fs(path1[0], self.method)
|
| 353 |
+
fs2 = _resolve_fs(path2[0], self.method)
|
| 354 |
+
# not expanding paths atm., assume call is from rsync()
|
| 355 |
+
if fs is fs2:
|
| 356 |
+
# pure remote
|
| 357 |
+
if fs.async_impl:
|
| 358 |
+
return await fs._copy(path1, path2, **kwargs)
|
| 359 |
+
else:
|
| 360 |
+
return fs.copy(path1, path2, **kwargs)
|
| 361 |
+
await copy_file_op(
|
| 362 |
+
fs, path1, fs2, path2, tempdir, batch_size, on_error=on_error
|
| 363 |
+
)
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
async def copy_file_op(
    fs1, url1, fs2, url2, tempdir=None, batch_size=20, on_error="ignore"
):
    """Copy many files from ``fs1`` to ``fs2`` via local temporary files.

    Each pair from ``zip(url1, url2)`` is staged through a unique file in
    ``tempdir`` and copied concurrently in batches of ``batch_size``.

    NOTE(review): the temporary directory is removed when done even when it
    was supplied by the caller - confirm this is intended.
    """
    import tempfile

    tempdir = tempdir or tempfile.mkdtemp()
    try:
        await _run_coros_in_chunks(
            [
                _copy_file_op(
                    fs1,
                    u1,
                    fs2,
                    u2,
                    os.path.join(tempdir, uuid.uuid4().hex),
                    on_error=on_error,
                )
                for u1, u2 in zip(url1, url2)
            ],
            batch_size=batch_size,
        )
    finally:
        shutil.rmtree(tempdir)
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
async def _copy_file_op(fs1, url1, fs2, url2, local, on_error="ignore"):
    """Copy one file ``url1`` -> ``url2`` via the local staging path ``local``.

    With ``on_error="raise"`` exceptions propagate; otherwise any Exception
    is logged at DEBUG level and swallowed.
    """
    # an empty tuple never matches, so "raise" mode disables the handler
    catch = Exception if on_error != "raise" else ()
    logger.debug("Copy %s -> %s", url1, url2)
    try:
        if not fs1.async_impl:
            fs1.get_file(url1, local)
        else:
            await fs1._get_file(url1, local)
        if not fs2.async_impl:
            fs2.put_file(local, url2)
        else:
            await fs2._put_file(local, url2)
        os.unlink(local)
        logger.debug("Copy %s -> %s; done", url1, url2)
    except catch as e:
        logger.debug("ignoring cp exception for %s: %s", url1, e)
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
async def maybe_await(cor):
    """Await ``cor`` if it is a coroutine; otherwise return it unchanged."""
    return await cor if inspect.iscoroutine(cor) else cor
|
.venv/lib/python3.11/site-packages/fsspec/gui.py
ADDED
|
@@ -0,0 +1,416 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ast
|
| 2 |
+
import contextlib
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
from typing import ClassVar, Sequence
|
| 7 |
+
|
| 8 |
+
import panel as pn
|
| 9 |
+
|
| 10 |
+
from .core import OpenFile, get_filesystem_class, split_protocol
|
| 11 |
+
from .registry import known_implementations
|
| 12 |
+
|
| 13 |
+
pn.extension()
|
| 14 |
+
logger = logging.getLogger("fsspec.gui")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class SigSlot:
    """Signal-slot mixin, for Panel event passing

    Include this class in a widget manager's superclasses to be able to
    register events and callbacks on Panel widgets managed by that class.

    The method ``_register`` should be called as widgets are added, and external
    code should call ``connect`` to associate callbacks.

    By default, all signals emit a DEBUG logging statement.
    """

    # names of signals that this class may emit; each must be set by
    # _register for any new instance
    signals: ClassVar[Sequence[str]] = []
    # names of actions that this class may respond to; each should be the
    # name of a method on the class
    slots: ClassVar[Sequence[str]] = []

    def __init__(self):
        self._ignoring_events = False  # toggled by ignore_events()
        self._sigs = {}  # signal name -> {widget, callbacks, thing, log}
        self._map = {}  # "widgetname-attr" -> signal name (used by _signal)
        self._setup()

    def _setup(self):
        """Create GUI elements and register signals"""
        self.panel = pn.pane.PaneBase()
        # no signals to set up in the base class

    def _register(
        self, widget, name, thing="value", log_level=logging.DEBUG, auto=False
    ):
        """Watch the given attribute of a widget and assign it a named event

        This is normally called at the time a widget is instantiated, in the
        class which owns it.

        Parameters
        ----------
        widget : pn.layout.Panel or None
            Widget to watch. If None, an anonymous signal not associated with
            any widget.
        name : str
            Name of this event
        thing : str
            Attribute of the given widget to watch
        log_level : int
            When the signal is triggered, a logging event of the given level
            will be fired in the dfviz logger.
        auto : bool
            If True, automatically connects with a method in this class of the
            same name.
        """
        if name not in self.signals:
            raise ValueError(f"Attempt to assign an undeclared signal: {name}")
        self._sigs[name] = {
            "widget": widget,
            "callbacks": [],
            "thing": thing,
            "log": log_level,
        }
        # key for reverse lookup when a Panel event arrives in _signal()
        wn = "-".join(
            [
                getattr(widget, "name", str(widget)) if widget is not None else "none",
                thing,
            ]
        )
        self._map[wn] = name
        if widget is not None:
            widget.param.watch(self._signal, thing, onlychanged=True)
        if auto and hasattr(self, name):
            self.connect(name, getattr(self, name))

    def _repr_mimebundle_(self, *args, **kwargs):
        """Display in a notebook or a server"""
        try:
            return self.panel._repr_mimebundle_(*args, **kwargs)
        except (ValueError, AttributeError) as exc:
            raise NotImplementedError(
                "Panel does not seem to be set up properly"
            ) from exc

    def connect(self, signal, slot):
        """Associate call back with given event

        The callback must be a function which takes the "new" value of the
        watched attribute as the only parameter. If the callback return False,
        this cancels any further processing of the given event.

        Alternatively, the callback can be a string, in which case it means
        emitting the correspondingly-named event (i.e., connect to self)
        """
        self._sigs[signal]["callbacks"].append(slot)

    def _signal(self, event):
        """This is called by a an action on a widget

        Within an self.ignore_events context, nothing happens.

        Tests can execute this method by directly changing the values of
        widget components.
        """
        if not self._ignoring_events:
            wn = "-".join([event.obj.name, event.name])
            if wn in self._map and self._map[wn] in self._sigs:
                self._emit(self._map[wn], event.new)

    @contextlib.contextmanager
    def ignore_events(self):
        """Temporarily turn off events processing in this instance

        (does not propagate to children)
        """
        self._ignoring_events = True
        try:
            yield
        finally:
            self._ignoring_events = False

    def _emit(self, sig, value=None):
        """An event happened, call its callbacks

        This method can be used in tests to simulate message passing without
        directly changing visual elements.

        Calling of callbacks will halt whenever one returns False.
        """
        logger.log(self._sigs[sig]["log"], f"{sig}: {value}")
        for callback in self._sigs[sig]["callbacks"]:
            if isinstance(callback, str):
                # a string callback means "emit that signal" (chaining);
                # note the value is not forwarded to the chained signal
                self._emit(callback)
            else:
                try:
                    # running callbacks should not break the interface
                    ret = callback(value)
                    if ret is False:
                        break
                except Exception as e:
                    logger.exception(
                        "Exception (%s) while executing callback for signal: %s",
                        e,
                        sig,
                    )

    def show(self, threads=False):
        """Open a new browser tab and display this instance's interface"""
        self.panel.show(threads=threads, verbose=False)
        return self
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
class SingleSelect(SigSlot):
    """A multiselect which only allows you to select one item for an event"""

    signals = ["_selected", "selected"]  # the first is internal
    # NOTE(review): the "add" and "select" slots have no matching methods in
    # this class - confirm whether they are intended or stale.
    slots = ["set_options", "set_selection", "add", "clear", "select"]

    def __init__(self, **kwargs):
        # kwargs are forwarded to the underlying pn.widgets.MultiSelect
        self.kwargs = kwargs
        super().__init__()

    def _setup(self):
        """Create the MultiSelect widget and wire internal signals."""
        self.panel = pn.widgets.MultiSelect(**self.kwargs)
        self._register(self.panel, "_selected", "value")
        self._register(None, "selected")
        self.connect("_selected", self.select_one)

    def _signal(self, *args, **kwargs):
        # pass-through override; behaviour is identical to the base class
        super()._signal(*args, **kwargs)

    def select_one(self, *_):
        """Reduce any multi-selection to its last item, then emit "selected"."""
        with self.ignore_events():
            val = [self.panel.value[-1]] if self.panel.value else []
            self.panel.value = val
        self._emit("selected", self.panel.value)

    def set_options(self, options):
        """Replace the set of selectable options."""
        self.panel.options = options

    def clear(self):
        """Remove all options."""
        self.panel.options = []

    @property
    def value(self):
        """Current selection (list with at most one element)."""
        return self.panel.value

    def set_selection(self, selection):
        """Programmatically select a single item."""
        self.panel.value = [selection]
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
class FileSelector(SigSlot):
    """Panel-based graphical file selector widget

    Instances of this widget are interactive and can be displayed in jupyter by having
    them as the output of a cell, or in a separate browser tab using ``.show()``.
    """

    signals = [
        "protocol_changed",
        "selection_changed",
        "directory_entered",
        "home_clicked",
        "up_clicked",
        "go_clicked",
        "filters_changed",
    ]
    # NOTE(review): "go_home" has no matching method (the handler is named
    # home_clicked) - confirm whether this slot name is stale.
    slots = ["set_filters", "go_home"]

    def __init__(self, url=None, filters=None, ignore=None, kwargs=None):
        """

        Parameters
        ----------
        url : str (optional)
            Initial value of the URL to populate the dialog; should include protocol
        filters : list(str) (optional)
            File endings to include in the listings. If not included, all files are
            allowed. Does not affect directories.
            If given, the endings will appear as checkboxes in the interface
        ignore : list(str) (optional)
            Regex(s) of file basename patterns to ignore, e.g., "\\." for typical
            hidden files on posix
        kwargs : dict (optional)
            To pass to file system instance
        """
        if url:
            self.init_protocol, url = split_protocol(url)
        else:
            # no URL given: default to the local filesystem and cwd
            self.init_protocol, url = "file", os.getcwd()
        self.init_url = url
        # stored as a string so it can round-trip through the kwargs TextInput
        self.init_kwargs = (kwargs if isinstance(kwargs, str) else str(kwargs)) or "{}"
        self.filters = filters
        self.ignore = [re.compile(i) for i in ignore or []]
        self._fs = None  # lazily built in the ``fs`` property
        super().__init__()

    def _setup(self):
        """Build all widgets, wire up signals and show the initial listing."""
        self.url = pn.widgets.TextInput(
            name="url",
            value=self.init_url,
            align="end",
            sizing_mode="stretch_width",
            width_policy="max",
        )
        self.protocol = pn.widgets.Select(
            options=sorted(known_implementations),
            value=self.init_protocol,
            name="protocol",
            align="center",
        )
        self.kwargs = pn.widgets.TextInput(
            name="kwargs", value=self.init_kwargs, align="center"
        )
        self.go = pn.widgets.Button(name="⇨", align="end", width=45)
        self.main = SingleSelect(size=10)
        self.home = pn.widgets.Button(name="🏠", width=40, height=30, align="end")
        self.up = pn.widgets.Button(name="‹", width=30, height=30, align="end")

        # auto=True connects each signal to the method of the same name below
        self._register(self.protocol, "protocol_changed", auto=True)
        self._register(self.go, "go_clicked", "clicks", auto=True)
        self._register(self.up, "up_clicked", "clicks", auto=True)
        self._register(self.home, "home_clicked", "clicks", auto=True)
        self._register(None, "selection_changed")
        self.main.connect("selected", self.selection_changed)
        self._register(None, "directory_entered")
        # remember protocol/kwargs so go_clicked can detect when to rebuild fs
        self.prev_protocol = self.protocol.value
        self.prev_kwargs = self.storage_options

        self.filter_sel = pn.widgets.CheckBoxGroup(
            value=[], options=[], inline=False, align="end", width_policy="min"
        )
        self._register(self.filter_sel, "filters_changed", auto=True)

        self.panel = pn.Column(
            pn.Row(self.protocol, self.kwargs),
            pn.Row(self.home, self.up, self.url, self.go, self.filter_sel),
            self.main.panel,
        )
        self.set_filters(self.filters)
        self.go_clicked()

    def set_filters(self, filters=None):
        """Set (or clear) the file-extension filter checkboxes."""
        self.filters = filters
        if filters:
            self.filter_sel.options = filters
            self.filter_sel.value = filters
        else:
            self.filter_sel.options = []
            self.filter_sel.value = []

    @property
    def storage_options(self):
        """Value of the kwargs box as a dictionary"""
        return ast.literal_eval(self.kwargs.value) or {}

    @property
    def fs(self):
        """Current filesystem instance"""
        if self._fs is None:
            cls = get_filesystem_class(self.protocol.value)
            self._fs = cls(**self.storage_options)
        return self._fs

    @property
    def urlpath(self):
        """URL of currently selected item"""
        return (
            (f"{self.protocol.value}://{self.main.value[0]}")
            if self.main.value
            else None
        )

    def open_file(self, mode="rb", compression=None, encoding=None):
        """Create OpenFile instance for the currently selected item

        For example, in a notebook you might do something like

        .. code-block::

            [ ]: sel = FileSelector(); sel

            # user selects their file

            [ ]: with sel.open_file('rb') as f:
            ...      out = f.read()

        Parameters
        ----------
        mode: str (optional)
            Open mode for the file.
        compression: str (optional)
            The interact with the file as compressed. Set to 'infer' to guess
            compression from the file ending
        encoding: str (optional)
            If using text mode, use this encoding; defaults to UTF8.
        """
        if self.urlpath is None:
            raise ValueError("No file selected")
        return OpenFile(self.fs, self.urlpath, mode, compression, encoding)

    def filters_changed(self, values):
        """Re-list when the extension checkboxes change."""
        self.filters = values
        self.go_clicked()

    def selection_changed(self, *_):
        """If a directory was picked, descend into it and re-list."""
        if self.urlpath is None:
            return
        if self.fs.isdir(self.urlpath):
            self.url.value = self.fs._strip_protocol(self.urlpath)
            self.go_clicked()

    def go_clicked(self, *_):
        """Refresh the listing for the current URL, protocol and kwargs."""
        if (
            self.prev_protocol != self.protocol.value
            or self.prev_kwargs != self.storage_options
        ):
            self._fs = None  # causes fs to be recreated
            self.prev_protocol = self.protocol.value
            self.prev_kwargs = self.storage_options
        listing = sorted(
            self.fs.ls(self.url.value, detail=True), key=lambda x: x["name"]
        )
        # drop entries whose basename matches any of the ignore patterns
        listing = [
            l
            for l in listing
            if not any(i.match(l["name"].rsplit("/", 1)[-1]) for i in self.ignore)
        ]
        folders = {
            "📁 " + o["name"].rsplit("/", 1)[-1]: o["name"]
            for o in listing
            if o["type"] == "directory"
        }
        files = {
            "📄 " + o["name"].rsplit("/", 1)[-1]: o["name"]
            for o in listing
            if o["type"] == "file"
        }
        if self.filters:
            # extension filters apply to files only, never to directories
            files = {
                k: v
                for k, v in files.items()
                if any(v.endswith(ext) for ext in self.filters)
            }
        self.main.set_options(dict(**folders, **files))

    def protocol_changed(self, *_):
        """Reset filesystem and listing when a new protocol is chosen."""
        self._fs = None
        self.main.options = []
        self.url.value = ""

    def home_clicked(self, *_):
        """Return to the initial protocol/kwargs/URL and re-list."""
        self.protocol.value = self.init_protocol
        self.kwargs.value = self.init_kwargs
        self.url.value = self.init_url
        self.go_clicked()

    def up_clicked(self, *_):
        """Navigate to the parent directory and re-list."""
        self.url.value = self.fs._parent(self.url.value)
        self.go_clicked()
|
.venv/lib/python3.11/site-packages/fsspec/implementations/arrow.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import errno
|
| 2 |
+
import io
|
| 3 |
+
import os
|
| 4 |
+
import secrets
|
| 5 |
+
import shutil
|
| 6 |
+
from contextlib import suppress
|
| 7 |
+
from functools import cached_property, wraps
|
| 8 |
+
from urllib.parse import parse_qs
|
| 9 |
+
|
| 10 |
+
from fsspec.spec import AbstractFileSystem
|
| 11 |
+
from fsspec.utils import (
|
| 12 |
+
get_package_version_without_import,
|
| 13 |
+
infer_storage_options,
|
| 14 |
+
mirror_from,
|
| 15 |
+
tokenize,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def wrap_exceptions(func):
    """Translate "does not exist" ``OSError``s into ``FileNotFoundError``.

    Any other ``OSError`` (including ones carrying no args) is re-raised
    unchanged; the original exception is kept as ``__cause__``.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except OSError as exc:
            if not exc.args:
                raise
            message, *_rest = exc.args
            if isinstance(message, str) and "does not exist" in message:
                raise FileNotFoundError(errno.ENOENT, message) from exc
            raise

    return wrapper
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
PYARROW_VERSION = None
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class ArrowFSWrapper(AbstractFileSystem):
    """FSSpec-compatible wrapper of pyarrow.fs.FileSystem.

    Parameters
    ----------
    fs : pyarrow.fs.FileSystem

    """

    root_marker = "/"

    def __init__(self, fs, **kwargs):
        global PYARROW_VERSION
        # record the installed pyarrow version lazily; consulted in _open
        PYARROW_VERSION = get_package_version_without_import("pyarrow")
        self.fs = fs
        super().__init__(**kwargs)

    @property
    def protocol(self):
        # mirrors the wrapped filesystem's type name (e.g. "hdfs")
        return self.fs.type_name

    @cached_property
    def fsid(self):
        return "hdfs_" + tokenize(self.fs.host, self.fs.port)

    @classmethod
    def _strip_protocol(cls, path):
        ops = infer_storage_options(path)
        path = ops["path"]
        if path.startswith("//"):
            # special case for "hdfs://path" (without the triple slash)
            path = path[1:]
        return path

    def ls(self, path, detail=False, **kwargs):
        """List contents of ``path``; detailed dicts when ``detail`` is True."""
        path = self._strip_protocol(path)
        from pyarrow.fs import FileSelector

        entries = [
            self._make_entry(entry)
            for entry in self.fs.get_file_info(FileSelector(path))
        ]
        if detail:
            return entries
        else:
            return [entry["name"] for entry in entries]

    def info(self, path, **kwargs):
        """Return the metadata dict for a single path."""
        path = self._strip_protocol(path)
        [info] = self.fs.get_file_info([path])
        return self._make_entry(info)

    def exists(self, path):
        """Whether ``path`` exists on the wrapped filesystem."""
        path = self._strip_protocol(path)
        try:
            self.info(path)
        except FileNotFoundError:
            return False
        else:
            return True

    def _make_entry(self, info):
        """Convert a pyarrow FileInfo into an fsspec-style info dict."""
        from pyarrow.fs import FileType

        if info.type is FileType.Directory:
            kind = "directory"
        elif info.type is FileType.File:
            kind = "file"
        elif info.type is FileType.NotFound:
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
        else:
            kind = "other"

        return {
            "name": info.path,
            "size": info.size,
            "type": kind,
            "mtime": info.mtime,
        }

    @wrap_exceptions
    def cp_file(self, path1, path2, **kwargs):
        """Copy ``path1`` to ``path2`` via a temporary file, then move into place."""
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")

        with self._open(path1, "rb") as lstream:
            tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
            try:
                with self.open(tmp_fname, "wb") as rstream:
                    shutil.copyfileobj(lstream, rstream)
                self.fs.move(tmp_fname, path2)
            except BaseException:
                # remove the partial temporary file on any failure
                with suppress(FileNotFoundError):
                    self.fs.delete_file(tmp_fname)
                raise

    @wrap_exceptions
    def mv(self, path1, path2, **kwargs):
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")
        self.fs.move(path1, path2)

    @wrap_exceptions
    def rm_file(self, path):
        path = self._strip_protocol(path)
        self.fs.delete_file(path)

    @wrap_exceptions
    def rm(self, path, recursive=False, maxdepth=None):
        path = self._strip_protocol(path).rstrip("/")
        if self.isdir(path):
            if recursive:
                self.fs.delete_dir(path)
            else:
                # FIX: message previously said "recursive=False", the
                # opposite of the actual requirement
                raise ValueError("Can't delete directories without recursive=True")
        else:
            self.fs.delete_file(path)

    @wrap_exceptions
    def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
        """Open a pyarrow stream for ``path`` and wrap it as an ArrowFile."""
        if mode == "rb":
            if seekable:
                method = self.fs.open_input_file
            else:
                method = self.fs.open_input_stream
        elif mode == "wb":
            method = self.fs.open_output_stream
        elif mode == "ab":
            method = self.fs.open_append_stream
        else:
            raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")

        _kwargs = {}
        if mode != "rb" or not seekable:
            if int(PYARROW_VERSION.split(".")[0]) >= 4:
                # disable compression auto-detection
                _kwargs["compression"] = None
        stream = method(path, **_kwargs)

        return ArrowFile(self, stream, path, mode, block_size, **kwargs)

    @wrap_exceptions
    def mkdir(self, path, create_parents=True, **kwargs):
        path = self._strip_protocol(path)
        if create_parents:
            self.makedirs(path, exist_ok=True)
        else:
            self.fs.create_dir(path, recursive=False)

    @wrap_exceptions
    def makedirs(self, path, exist_ok=False):
        path = self._strip_protocol(path)
        self.fs.create_dir(path, recursive=True)

    @wrap_exceptions
    def rmdir(self, path):
        path = self._strip_protocol(path)
        self.fs.delete_dir(path)

    @wrap_exceptions
    def modified(self, path):
        path = self._strip_protocol(path)
        return self.fs.get_file_info(path).mtime

    def cat_file(self, path, start=None, end=None, **kwargs):
        # only a ranged read needs a seekable handle
        kwargs["seekable"] = start not in [None, 0]
        # FIX: forward start/end; they were previously hard-coded to None,
        # so ranged reads silently returned the whole file
        return super().cat_file(path, start=start, end=end, **kwargs)

    def get_file(self, rpath, lpath, **kwargs):
        kwargs["seekable"] = False
        super().get_file(rpath, lpath, **kwargs)
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
@mirror_from(
    "stream",
    [
        "read",
        "seek",
        "tell",
        "write",
        "readable",
        "writable",
        "close",
        "size",
        "seekable",
    ],
)
class ArrowFile(io.IOBase):
    """Thin file-like wrapper around a pyarrow stream.

    Low-level I/O calls are forwarded to ``self.stream`` by the
    ``mirror_from`` decorator; this class only records bookkeeping
    attributes and supports use as a context manager.
    """

    def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
        self.fs = fs
        self.stream = stream
        self.path = path
        self.mode = mode
        # both spellings are kept for compatibility with fsspec callers
        self.blocksize = self.block_size = block_size
        self.kwargs = kwargs

    def __enter__(self):
        return self

    def __exit__(self, *args):
        return self.close()
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
class HadoopFileSystem(ArrowFSWrapper):
    """fsspec-compatible wrapper around ``pyarrow.fs.HadoopFileSystem``."""

    protocol = "hdfs"

    def __init__(
        self,
        host="default",
        port=0,
        user=None,
        kerb_ticket=None,
        replication=3,
        extra_conf=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        replication: int
            set replication factor of file for write operations. default value is 3.
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        # imported lazily so pyarrow is only required when HDFS is used
        from pyarrow.fs import HadoopFileSystem as PyArrowHDFS

        super().__init__(
            fs=PyArrowHDFS(
                host=host,
                port=port,
                user=user,
                kerb_ticket=kerb_ticket,
                replication=replication,
                extra_conf=extra_conf,
            ),
            **kwargs,
        )

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Derive constructor kwargs (host/user/port/replication) from an hdfs URL."""
        ops = infer_storage_options(path)
        out = {}
        for src_key, dst_key in (("host", "host"), ("username", "user"), ("port", "port")):
            if ops.get(src_key, None):
                out[dst_key] = ops[src_key]
        if ops.get("url_query", None):
            queries = parse_qs(ops["url_query"])
            if queries.get("replication", None):
                out["replication"] = int(queries["replication"][0])
        return out
|
.venv/lib/python3.11/site-packages/fsspec/implementations/dask.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import dask
|
| 2 |
+
from distributed.client import Client, _get_global_client
|
| 3 |
+
from distributed.worker import Worker
|
| 4 |
+
|
| 5 |
+
from fsspec import filesystem
|
| 6 |
+
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
|
| 7 |
+
from fsspec.utils import infer_storage_options
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _get_client(client):
    """Resolve *client* to a distributed ``Client`` instance.

    A ``Client`` is passed through unchanged; ``None`` means "use the
    current global client"; anything else (e.g. a scheduler address
    string) is used to construct a fresh client.
    """
    if isinstance(client, Client):
        return client
    if client is None:
        return _get_global_client()
    # e.g., a connection string / scheduler address
    return Client(client)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _in_worker():
    """Return True when at least one dask ``Worker`` instance exists in this process."""
    return len(Worker._instances) > 0
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class DaskWorkerFileSystem(AbstractFileSystem):
    """View files accessible to a worker as any other remote file-system

    When instances are run on the worker, uses the real filesystem. When
    run on the client, they call the worker to provide information or data.

    **Warning** this implementation is experimental, and read-only for now.
    """

    def __init__(
        self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
    ):
        super().__init__(**kwargs)
        # exactly one of `fs` / `target_protocol` may be supplied
        if not (fs is None) ^ (target_protocol is None):
            raise ValueError(
                "Please provide one of filesystem instance (fs) or"
                " target_protocol, not both"
            )
        self.target_protocol = target_protocol
        self.target_options = target_options
        self.worker = None
        self.client = client
        self.fs = fs
        self._determine_worker()

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Extract a scheduler address ("host:port") from the URL, if present."""
        parsed = infer_storage_options(path)
        if "host" not in parsed or "port" not in parsed:
            return {}
        return {"client": f"{parsed['host']}:{parsed['port']}"}

    def _determine_worker(self):
        """Bind either the real filesystem (on a worker) or a delayed proxy (on the client)."""
        if not _in_worker():
            self.worker = False
            self.client = _get_client(self.client)
            self.rfs = dask.delayed(self)
            return
        self.worker = True
        if self.fs is None:
            self.fs = filesystem(
                self.target_protocol, **(self.target_options or {})
            )

    def mkdir(self, *args, **kwargs):
        if not self.worker:
            self.rfs.mkdir(*args, **kwargs).compute()
        else:
            self.fs.mkdir(*args, **kwargs)

    def rm(self, *args, **kwargs):
        if not self.worker:
            self.rfs.rm(*args, **kwargs).compute()
        else:
            self.fs.rm(*args, **kwargs)

    def copy(self, *args, **kwargs):
        if not self.worker:
            self.rfs.copy(*args, **kwargs).compute()
        else:
            self.fs.copy(*args, **kwargs)

    def mv(self, *args, **kwargs):
        if not self.worker:
            self.rfs.mv(*args, **kwargs).compute()
        else:
            self.fs.mv(*args, **kwargs)

    def ls(self, *args, **kwargs):
        if self.worker:
            return self.fs.ls(*args, **kwargs)
        return self.rfs.ls(*args, **kwargs).compute()

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        # On the worker delegate straight to the wrapped filesystem; on the
        # client return a proxy file that fetches byte ranges remotely.
        if not self.worker:
            return DaskFile(
                fs=self,
                path=path,
                mode=mode,
                block_size=block_size,
                autocommit=autocommit,
                cache_options=cache_options,
                **kwargs,
            )
        return self.fs._open(
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_options=cache_options,
            **kwargs,
        )

    def fetch_range(self, path, mode, start, end):
        """Read bytes [start, end) of *path*, locally or via the cluster."""
        if not self.worker:
            return self.rfs.fetch_range(path, mode, start, end).compute()
        with self._open(path, mode) as f:
            f.seek(start)
            return f.read(end - start)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
class DaskFile(AbstractBufferedFile):
    """Read-only file proxy whose byte ranges are fetched through the cluster."""

    def __init__(self, mode="rb", **kwargs):
        if mode != "rb":
            raise ValueError('Remote dask files can only be opened in "rb" mode')
        super().__init__(**kwargs)

    def _upload_chunk(self, final=False):
        # read-only filesystem: nothing to upload
        pass

    def _initiate_upload(self):
        """Create remote file/upload"""
        # read-only filesystem: nothing to do
        pass

    def _fetch_range(self, start, end):
        """Get the specified set of bytes from remote"""
        return self.fs.fetch_range(self.path, self.mode, start, end)
|
.venv/lib/python3.11/site-packages/fsspec/implementations/dbfs.py
ADDED
|
@@ -0,0 +1,467 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import urllib
|
| 3 |
+
|
| 4 |
+
import requests
|
| 5 |
+
import requests.exceptions
|
| 6 |
+
from requests.adapters import HTTPAdapter, Retry
|
| 7 |
+
|
| 8 |
+
from fsspec import AbstractFileSystem
|
| 9 |
+
from fsspec.spec import AbstractBufferedFile
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class DatabricksException(Exception):
    """Exception carrying a DBFS REST error code alongside its message."""

    def __init__(self, error_code, message):
        """Record *error_code* and *message* from a DBFS API error payload."""
        # the API identifies failures by a symbolic code such as
        # "RESOURCE_DOES_NOT_EXIST"; keep it so callers can dispatch on it
        self.error_code = error_code
        self.message = message
        super().__init__(message)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class DatabricksFileSystem(AbstractFileSystem):
    """
    Get access to the Databricks filesystem implementation over HTTP.
    Can be used inside and outside of a databricks cluster.
    """

    def __init__(self, instance, token, **kwargs):
        """
        Create a new DatabricksFileSystem.

        Parameters
        ----------
        instance: str
            The instance URL of the databricks cluster.
            For example for an Azure databricks cluster, this
            has the form adb-<some-number>.<two digits>.azuredatabricks.net.
        token: str
            Your personal token. Find out more
            here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
        """
        self.instance = instance
        self.token = token
        # one shared session: connection pooling + automatic retries on
        # transient HTTP errors, with the bearer token set once
        self.session = requests.Session()
        self.retries = Retry(
            total=10,
            backoff_factor=0.05,
            status_forcelist=[408, 429, 500, 502, 503, 504],
        )

        self.session.mount("https://", HTTPAdapter(max_retries=self.retries))
        self.session.headers.update({"Authorization": f"Bearer {self.token}"})

        super().__init__(**kwargs)

    def ls(self, path, detail=True, **kwargs):
        """
        List the contents of the given path.

        Parameters
        ----------
        path: str
            Absolute path
        detail: bool
            Return not only the list of filenames,
            but also additional information on file sizes
            and types.
        """
        # serve from the directory cache when possible
        out = self._ls_from_cache(path)
        if not out:
            try:
                r = self._send_to_api(
                    method="get", endpoint="list", json={"path": path}
                )
            except DatabricksException as e:
                if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                    raise FileNotFoundError(e.message) from e

                raise
            files = r["files"]
            # normalize the API payload into fsspec's info-dict shape
            out = [
                {
                    "name": o["path"],
                    "type": "directory" if o["is_dir"] else "file",
                    "size": o["file_size"],
                }
                for o in files
            ]
            self.dircache[path] = out

        if detail:
            return out
        return [o["name"] for o in out]

    def makedirs(self, path, exist_ok=True):
        """
        Create a given absolute path and all of its parents.

        Parameters
        ----------
        path: str
            Absolute path to create
        exist_ok: bool
            If false, checks if the folder
            exists before creating it (and raises an
            Exception if this is the case)
        """
        if not exist_ok:
            try:
                # If the following succeeds, the path is already present
                self._send_to_api(
                    method="get", endpoint="get-status", json={"path": path}
                )
                raise FileExistsError(f"Path {path} already exists")
            except DatabricksException as e:
                # NOTE(review): a DatabricksException with any error_code
                # other than RESOURCE_DOES_NOT_EXIST is silently swallowed
                # here (no re-raise) — confirm this is intended
                if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                    pass

        try:
            self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
        except DatabricksException as e:
            if e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
        self.invalidate_cache(self._parent(path))

    def mkdir(self, path, create_parents=True, **kwargs):
        """
        Create a given absolute path and all of its parents.

        Parameters
        ----------
        path: str
            Absolute path to create
        create_parents: bool
            Whether to create all parents or not.
            "False" is not implemented so far.
        """
        if not create_parents:
            raise NotImplementedError

        # `mkdirs` is presumably the fsspec base-class alias for `makedirs`
        # — TODO confirm against AbstractFileSystem
        self.mkdirs(path, **kwargs)

    def rm(self, path, recursive=False, **kwargs):
        """
        Remove the file or folder at the given absolute path.

        Parameters
        ----------
        path: str
            Absolute path what to remove
        recursive: bool
            Recursively delete all files in a folder.
        """
        try:
            self._send_to_api(
                method="post",
                endpoint="delete",
                json={"path": path, "recursive": recursive},
            )
        except DatabricksException as e:
            # This is not really an exception, it just means
            # not everything was deleted so far
            if e.error_code == "PARTIAL_DELETE":
                self.rm(path=path, recursive=recursive)
            elif e.error_code == "IO_ERROR":
                # Using the same exception as the os module would use here
                raise OSError(e.message) from e

            # NOTE(review): this re-raise is also reached after the
            # PARTIAL_DELETE retry above completes — confirm intended
            raise
        self.invalidate_cache(self._parent(path))

    def mv(
        self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs
    ):
        """
        Move a source to a destination path.

        A note from the original [databricks API manual]
        (https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move).

        When moving a large number of files the API call will time out after
        approximately 60s, potentially resulting in partially moved data.
        Therefore, for operations that move more than 10k files, we strongly
        discourage using the DBFS REST API.

        Parameters
        ----------
        source_path: str
            From where to move (absolute path)
        destination_path: str
            To where to move (absolute path)
        recursive: bool
            Not implemented to far.
        maxdepth:
            Not implemented to far.
        """
        if recursive:
            raise NotImplementedError
        if maxdepth:
            raise NotImplementedError

        try:
            self._send_to_api(
                method="post",
                endpoint="move",
                json={"source_path": source_path, "destination_path": destination_path},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
        # both the old and the new parent listing are now stale
        self.invalidate_cache(self._parent(source_path))
        self.invalidate_cache(self._parent(destination_path))

    def _open(self, path, mode="rb", block_size="default", **kwargs):
        """
        Overwrite the base class method to make sure to create a DBFile.
        All arguments are copied from the base method.

        Only the default blocksize is allowed.
        """
        return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)

    def _send_to_api(self, method, endpoint, json):
        """
        Send the given json to the DBFS API
        using a get or post request (specified by the argument `method`).

        Parameters
        ----------
        method: str
            Which http method to use for communication; "get" or "post".
        endpoint: str
            Where to send the request to (last part of the API URL)
        json: dict
            Dictionary of information to send
        """
        if method == "post":
            session_call = self.session.post
        elif method == "get":
            session_call = self.session.get
        else:
            raise ValueError(f"Do not understand method {method}")

        url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)

        r = session_call(url, json=json)

        # The DBFS API will return a json, also in case of an exception.
        # We want to preserve this information as good as possible.
        try:
            r.raise_for_status()
        except requests.HTTPError as e:
            # try to extract json error message
            # if that fails, fall back to the original exception
            try:
                exception_json = e.response.json()
            except Exception:
                raise e from None

            raise DatabricksException(**exception_json) from e

        return r.json()

    def _create_handle(self, path, overwrite=True):
        """
        Internal function to create a handle, which can be used to
        write blocks of a file to DBFS.
        A handle has a unique identifier which needs to be passed
        whenever written during this transaction.
        The handle is active for 10 minutes - after that a new
        write transaction needs to be created.
        Make sure to close the handle after you are finished.

        Parameters
        ----------
        path: str
            Absolute path for this file.
        overwrite: bool
            If a file already exist at this location, either overwrite
            it or raise an exception.
        """
        try:
            r = self._send_to_api(
                method="post",
                endpoint="create",
                json={"path": path, "overwrite": overwrite},
            )
            return r["handle"]
        except DatabricksException as e:
            if e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise

    def _close_handle(self, handle):
        """
        Close a handle, which was opened by :func:`_create_handle`.

        Parameters
        ----------
        handle: str
            Which handle to close.
        """
        try:
            self._send_to_api(method="post", endpoint="close", json={"handle": handle})
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e

            raise

    def _add_data(self, handle, data):
        """
        Upload data to an already opened file handle
        (opened by :func:`_create_handle`).
        The maximal allowed data size is 1MB after
        conversion to base64.
        Remember to close the handle when you are finished.

        Parameters
        ----------
        handle: str
            Which handle to upload data to.
        data: bytes
            Block of data to add to the handle.
        """
        # the API transports binary payloads as base64 text
        data = base64.b64encode(data).decode()
        try:
            self._send_to_api(
                method="post",
                endpoint="add-block",
                json={"handle": handle, "data": data},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
                raise ValueError(e.message) from e

            raise

    def _get_data(self, path, start, end):
        """
        Download data in bytes from a given absolute path in a block
        from [start, start+length].
        The maximum number of allowed bytes to read is 1MB.

        Parameters
        ----------
        path: str
            Absolute path to download data from
        start: int
            Start position of the block
        end: int
            End position of the block
        """
        try:
            r = self._send_to_api(
                method="get",
                endpoint="read",
                json={"path": path, "offset": start, "length": end - start},
            )
            # payload arrives base64-encoded; decode back to raw bytes
            return base64.b64decode(r["data"])
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
                raise ValueError(e.message) from e

            raise

    def invalidate_cache(self, path=None):
        # drop the cached listing for *path* (or everything when None)
        if path is None:
            self.dircache.clear()
        else:
            self.dircache.pop(path, None)
        super().invalidate_cache(path)
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
class DatabricksFile(AbstractBufferedFile):
    """Buffered file object for the DatabricksFileSystem.

    Uploads and downloads go through the DBFS REST API in blocks of exactly
    ``DEFAULT_BLOCK_SIZE`` bytes, the only size the API accepts.
    """

    DEFAULT_BLOCK_SIZE = 1 * 2**20  # only allowed block size

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        **kwargs,
    ):
        """Create a new DatabricksFile; only the default blocksize is supported."""
        if block_size in (None, "default"):
            block_size = self.DEFAULT_BLOCK_SIZE

        assert block_size == self.DEFAULT_BLOCK_SIZE, (
            f"Only the default block size is allowed, not {block_size}"
        )

        super().__init__(
            fs,
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_type=cache_type,
            cache_options=cache_options or {},
            **kwargs,
        )

    def _initiate_upload(self):
        """Open a DBFS write handle for this file."""
        self.handle = self.fs._create_handle(self.path)

    def _upload_chunk(self, final=False):
        """Send the buffered bytes to DBFS, split into API-sized blocks."""
        self.buffer.seek(0)
        payload = self.buffer.getvalue()

        for lo, hi in self._to_sized_blocks(len(payload)):
            self.fs._add_data(handle=self.handle, data=payload[lo:hi])

        if final:
            self.fs._close_handle(handle=self.handle)
            return True

    def _fetch_range(self, start, end):
        """Download the bytes [start, end) in block-sized pieces."""
        pieces = [
            self.fs._get_data(path=self.path, start=lo, end=hi)
            for lo, hi in self._to_sized_blocks(end - start, start)
        ]
        return b"".join(pieces)

    def _to_sized_blocks(self, length, start=0):
        """Yield (begin, end) offset pairs covering *length* bytes in blocksize steps."""
        stop = start + length
        for begin in range(start, stop, self.blocksize):
            yield begin, min(stop, begin + self.blocksize)
|
.venv/lib/python3.11/site-packages/fsspec/implementations/dirfs.py
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from .. import filesystem
from ..asyn import AsyncFileSystem


class DirFileSystem(AsyncFileSystem):
    """Directory prefix filesystem

    The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
    is relative to the `path`. After performing the necessary paths operation it
    delegates everything to the wrapped filesystem.
    """

    protocol = "dir"

    def __init__(
        self,
        path=None,
        fs=None,
        fo=None,
        target_protocol=None,
        target_options=None,
        **storage_options,
    ):
        """
        Parameters
        ----------
        path: str
            Path to the directory.
        fs: AbstractFileSystem
            An instantiated filesystem to wrap.
        target_protocol, target_options:
            if fs is none, construct it from these
        fo: str
            Alternate for path; do not provide both

        Raises
        ------
        ValueError
            If neither or both of ``path``/``fo`` are given, or if the
            sync/async mode of this wrapper disagrees with the wrapped fs.
        """
        super().__init__(**storage_options)
        if fs is None:
            fs = filesystem(protocol=target_protocol, **(target_options or {}))
        # Exactly one of ``path`` / ``fo`` must be supplied.  The previous
        # spelling, ``(path is not None) ^ (fo is not None) is False``, only
        # behaved correctly because comparisons bind more loosely than ``^``;
        # this equivalent form states the intent explicitly.
        if (path is not None) == (fo is not None):
            raise ValueError("Provide path or fo, not both")
        path = path or fo

        if self.asynchronous and not fs.async_impl:
            raise ValueError("can't use asynchronous with non-async fs")

        if fs.async_impl and self.asynchronous != fs.asynchronous:
            raise ValueError("both dirfs and fs should be in the same sync/async mode")

        self.path = fs._strip_protocol(path)
        self.fs = fs

    def _join(self, path):
        """Prepend the directory prefix to a path, a list of paths, or the
        keys of a dict of paths."""
        if isinstance(path, str):
            if not self.path:
                return path
            if not path:
                return self.path
            return self.fs.sep.join((self.path, self._strip_protocol(path)))
        if isinstance(path, dict):
            return {self._join(_path): value for _path, value in path.items()}
        return [self._join(_path) for _path in path]

    def _relpath(self, path):
        """Strip the directory prefix from a path (or each path in a list)."""
        if isinstance(path, str):
            if not self.path:
                return path
            # We need to account for S3FileSystem returning paths that do not
            # start with a '/'
            if path == self.path or (
                self.path.startswith(self.fs.sep) and path == self.path[1:]
            ):
                return ""
            prefix = self.path + self.fs.sep
            if self.path.startswith(self.fs.sep) and not path.startswith(self.fs.sep):
                prefix = prefix[1:]
            assert path.startswith(prefix)
            return path[len(prefix) :]
        return [self._relpath(_path) for _path in path]

    # Wrappers below: each one translates path arguments through ``_join``
    # (and results through ``_relpath`` where paths come back) and delegates
    # to the wrapped filesystem.

    @property
    def sep(self):
        return self.fs.sep

    async def set_session(self, *args, **kwargs):
        return await self.fs.set_session(*args, **kwargs)

    async def _rm_file(self, path, **kwargs):
        return await self.fs._rm_file(self._join(path), **kwargs)

    def rm_file(self, path, **kwargs):
        return self.fs.rm_file(self._join(path), **kwargs)

    async def _rm(self, path, *args, **kwargs):
        return await self.fs._rm(self._join(path), *args, **kwargs)

    def rm(self, path, *args, **kwargs):
        return self.fs.rm(self._join(path), *args, **kwargs)

    async def _cp_file(self, path1, path2, **kwargs):
        return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs)

    def cp_file(self, path1, path2, **kwargs):
        return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs)

    async def _copy(
        self,
        path1,
        path2,
        *args,
        **kwargs,
    ):
        return await self.fs._copy(
            self._join(path1),
            self._join(path2),
            *args,
            **kwargs,
        )

    def copy(self, path1, path2, *args, **kwargs):
        return self.fs.copy(
            self._join(path1),
            self._join(path2),
            *args,
            **kwargs,
        )

    async def _pipe(self, path, *args, **kwargs):
        return await self.fs._pipe(self._join(path), *args, **kwargs)

    def pipe(self, path, *args, **kwargs):
        return self.fs.pipe(self._join(path), *args, **kwargs)

    async def _pipe_file(self, path, *args, **kwargs):
        return await self.fs._pipe_file(self._join(path), *args, **kwargs)

    def pipe_file(self, path, *args, **kwargs):
        return self.fs.pipe_file(self._join(path), *args, **kwargs)

    async def _cat_file(self, path, *args, **kwargs):
        return await self.fs._cat_file(self._join(path), *args, **kwargs)

    def cat_file(self, path, *args, **kwargs):
        return self.fs.cat_file(self._join(path), *args, **kwargs)

    async def _cat(self, path, *args, **kwargs):
        # A dict result maps full paths to contents; translate the keys back.
        ret = await self.fs._cat(
            self._join(path),
            *args,
            **kwargs,
        )

        if isinstance(ret, dict):
            return {self._relpath(key): value for key, value in ret.items()}

        return ret

    def cat(self, path, *args, **kwargs):
        ret = self.fs.cat(
            self._join(path),
            *args,
            **kwargs,
        )

        if isinstance(ret, dict):
            return {self._relpath(key): value for key, value in ret.items()}

        return ret

    async def _put_file(self, lpath, rpath, **kwargs):
        return await self.fs._put_file(lpath, self._join(rpath), **kwargs)

    def put_file(self, lpath, rpath, **kwargs):
        return self.fs.put_file(lpath, self._join(rpath), **kwargs)

    async def _put(
        self,
        lpath,
        rpath,
        *args,
        **kwargs,
    ):
        return await self.fs._put(
            lpath,
            self._join(rpath),
            *args,
            **kwargs,
        )

    def put(self, lpath, rpath, *args, **kwargs):
        return self.fs.put(
            lpath,
            self._join(rpath),
            *args,
            **kwargs,
        )

    async def _get_file(self, rpath, lpath, **kwargs):
        return await self.fs._get_file(self._join(rpath), lpath, **kwargs)

    def get_file(self, rpath, lpath, **kwargs):
        return self.fs.get_file(self._join(rpath), lpath, **kwargs)

    async def _get(self, rpath, *args, **kwargs):
        return await self.fs._get(self._join(rpath), *args, **kwargs)

    def get(self, rpath, *args, **kwargs):
        return self.fs.get(self._join(rpath), *args, **kwargs)

    async def _isfile(self, path):
        return await self.fs._isfile(self._join(path))

    def isfile(self, path):
        return self.fs.isfile(self._join(path))

    async def _isdir(self, path):
        return await self.fs._isdir(self._join(path))

    def isdir(self, path):
        return self.fs.isdir(self._join(path))

    async def _size(self, path):
        return await self.fs._size(self._join(path))

    def size(self, path):
        return self.fs.size(self._join(path))

    async def _exists(self, path):
        return await self.fs._exists(self._join(path))

    def exists(self, path):
        return self.fs.exists(self._join(path))

    async def _info(self, path, **kwargs):
        return await self.fs._info(self._join(path), **kwargs)

    def info(self, path, **kwargs):
        return self.fs.info(self._join(path), **kwargs)

    async def _ls(self, path, detail=True, **kwargs):
        # Copy before mutating so the wrapped fs's dircache stays intact.
        ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
        if detail:
            out = []
            for entry in ret:
                entry = entry.copy()
                entry["name"] = self._relpath(entry["name"])
                out.append(entry)
            return out

        return self._relpath(ret)

    def ls(self, path, detail=True, **kwargs):
        ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
        if detail:
            out = []
            for entry in ret:
                entry = entry.copy()
                entry["name"] = self._relpath(entry["name"])
                out.append(entry)
            return out

        return self._relpath(ret)

    async def _walk(self, path, *args, **kwargs):
        async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs):
            yield self._relpath(root), dirs, files

    def walk(self, path, *args, **kwargs):
        for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs):
            yield self._relpath(root), dirs, files

    async def _glob(self, path, **kwargs):
        detail = kwargs.get("detail", False)
        ret = await self.fs._glob(self._join(path), **kwargs)
        if detail:
            return {self._relpath(path): info for path, info in ret.items()}
        return self._relpath(ret)

    def glob(self, path, **kwargs):
        detail = kwargs.get("detail", False)
        ret = self.fs.glob(self._join(path), **kwargs)
        if detail:
            return {self._relpath(path): info for path, info in ret.items()}
        return self._relpath(ret)

    async def _du(self, path, *args, **kwargs):
        total = kwargs.get("total", True)
        ret = await self.fs._du(self._join(path), *args, **kwargs)
        if total:
            return ret

        return {self._relpath(path): size for path, size in ret.items()}

    def du(self, path, *args, **kwargs):
        total = kwargs.get("total", True)
        ret = self.fs.du(self._join(path), *args, **kwargs)
        if total:
            return ret

        return {self._relpath(path): size for path, size in ret.items()}

    async def _find(self, path, *args, **kwargs):
        detail = kwargs.get("detail", False)
        ret = await self.fs._find(self._join(path), *args, **kwargs)
        if detail:
            return {self._relpath(path): info for path, info in ret.items()}
        return self._relpath(ret)

    def find(self, path, *args, **kwargs):
        detail = kwargs.get("detail", False)
        ret = self.fs.find(self._join(path), *args, **kwargs)
        if detail:
            return {self._relpath(path): info for path, info in ret.items()}
        return self._relpath(ret)

    async def _expand_path(self, path, *args, **kwargs):
        return self._relpath(
            await self.fs._expand_path(self._join(path), *args, **kwargs)
        )

    def expand_path(self, path, *args, **kwargs):
        return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs))

    async def _mkdir(self, path, *args, **kwargs):
        return await self.fs._mkdir(self._join(path), *args, **kwargs)

    def mkdir(self, path, *args, **kwargs):
        return self.fs.mkdir(self._join(path), *args, **kwargs)

    async def _makedirs(self, path, *args, **kwargs):
        return await self.fs._makedirs(self._join(path), *args, **kwargs)

    def makedirs(self, path, *args, **kwargs):
        return self.fs.makedirs(self._join(path), *args, **kwargs)

    def rmdir(self, path):
        return self.fs.rmdir(self._join(path))

    def mv(self, path1, path2, **kwargs):
        return self.fs.mv(
            self._join(path1),
            self._join(path2),
            **kwargs,
        )

    def touch(self, path, **kwargs):
        return self.fs.touch(self._join(path), **kwargs)

    def created(self, path):
        return self.fs.created(self._join(path))

    def modified(self, path):
        return self.fs.modified(self._join(path))

    def sign(self, path, *args, **kwargs):
        return self.fs.sign(self._join(path), *args, **kwargs)

    def __repr__(self):
        return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"

    def open(
        self,
        path,
        *args,
        **kwargs,
    ):
        return self.fs.open(
            self._join(path),
            *args,
            **kwargs,
        )

    async def open_async(
        self,
        path,
        *args,
        **kwargs,
    ):
        return await self.fs.open_async(
            self._join(path),
            *args,
            **kwargs,
        )
.venv/lib/python3.11/site-packages/fsspec/implementations/jupyter.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import base64
import io
import re

import requests

import fsspec


class JupyterFileSystem(fsspec.AbstractFileSystem):
    """View of the files as seen by a Jupyter server (notebook or lab)"""

    protocol = ("jupyter", "jlab")

    def __init__(self, url, tok=None, **kwargs):
        """
        Parameters
        ----------
        url : str
            Base URL of the server, like "http://127.0.0.1:8888". May include
            token in the string, which is given by the process when starting up
        tok : str
            If the token is obtained separately, can be given here
        kwargs
        """
        if "?" in url:
            if tok is None:
                try:
                    tok = re.findall("token=([a-z0-9]+)", url)[0]
                except IndexError as e:
                    raise ValueError("Could not determine token") from e
            url = url.split("?", 1)[0]
        self.url = url.rstrip("/") + "/api/contents"
        self.session = requests.Session()
        if tok:
            self.session.headers["Authorization"] = f"token {tok}"

        super().__init__(**kwargs)

    def ls(self, path, detail=True, **kwargs):
        """List a directory (or describe a single file) via the Contents API.

        Raises
        ------
        FileNotFoundError
            If the server reports 404 for ``path``.
        """
        path = self._strip_protocol(path)
        r = self.session.get(f"{self.url}/{path}")
        if r.status_code == 404:
            # BUGFIX: this previously *returned* the exception instance
            # instead of raising it, so callers received an exception object
            # as if it were a listing.
            raise FileNotFoundError(path)
        r.raise_for_status()
        out = r.json()

        if out["type"] == "directory":
            out = out["content"]
        else:
            out = [out]
        for o in out:
            o["name"] = o.pop("path")
            o.pop("content")
            # Notebooks behave like ordinary files for fsspec purposes.
            if o["type"] == "notebook":
                o["type"] = "file"
        if detail:
            return out
        return [o["name"] for o in out]

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Return the bytes content of a file, optionally sliced [start:end].

        Raises
        ------
        FileNotFoundError
            If the server reports 404 for ``path``.
        """
        path = self._strip_protocol(path)
        r = self.session.get(f"{self.url}/{path}")
        if r.status_code == 404:
            # BUGFIX: raise, don't return, the error (see ls()).
            raise FileNotFoundError(path)
        r.raise_for_status()
        out = r.json()
        if out["format"] == "text":
            # data should be binary
            b = out["content"].encode()
        else:
            b = base64.b64decode(out["content"])
        return b[start:end]

    def pipe_file(self, path, value, **_):
        """Upload ``value`` (bytes) to ``path`` in a single PUT request.

        NOTE(review): the server response is not checked here, so a failed
        upload passes silently — matches the best-effort style of this class.
        """
        path = self._strip_protocol(path)
        json = {
            "name": path.rsplit("/", 1)[-1],
            "path": path,
            "size": len(value),
            "content": base64.b64encode(value).decode(),
            "format": "base64",
            "type": "file",
        }
        self.session.put(f"{self.url}/{path}", json=json)

    def mkdir(self, path, create_parents=True, **kwargs):
        """Create a directory, recursively creating parents when requested."""
        path = self._strip_protocol(path)
        if create_parents and "/" in path:
            self.mkdir(path.rsplit("/", 1)[0], True)
        json = {
            "name": path.rsplit("/", 1)[-1],
            "path": path,
            "size": None,
            "content": None,
            "type": "directory",
        }
        self.session.put(f"{self.url}/{path}", json=json)

    def _rm(self, path):
        """Delete a single path on the server (best-effort, unchecked)."""
        path = self._strip_protocol(path)
        self.session.delete(f"{self.url}/{path}")

    def _open(self, path, mode="rb", **kwargs):
        """Open for reading (whole file buffered) or writing (one-shot)."""
        path = self._strip_protocol(path)
        if mode == "rb":
            data = self.cat_file(path)
            return io.BytesIO(data)
        else:
            return SimpleFileWriter(self, path, mode="wb")
| 114 |
+
class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
    """Buffered writer that sends the whole payload in a single upload.

    Everything written is held in memory and only pushed to the target
    filesystem when the file is finalised, so this is not suitable for
    large files.
    """

    def _upload_chunk(self, final=False):
        """Never uploads a chunk until file is done

        Not suitable for large files
        """
        # Do nothing until the buffered file is being closed out.
        if final is False:
            return False
        self.buffer.seek(0)
        payload = self.buffer.read()
        self.fs.pipe_file(self.path, payload)
.venv/lib/python3.11/site-packages/fsspec/implementations/local.py
ADDED
|
@@ -0,0 +1,476 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
import io
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
import os.path as osp
|
| 6 |
+
import shutil
|
| 7 |
+
import stat
|
| 8 |
+
import tempfile
|
| 9 |
+
|
| 10 |
+
from fsspec import AbstractFileSystem
|
| 11 |
+
from fsspec.compression import compr
|
| 12 |
+
from fsspec.core import get_compression
|
| 13 |
+
from fsspec.utils import isfilelike, stringify_path
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger("fsspec.local")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class LocalFileSystem(AbstractFileSystem):
    """Interface to files on local storage

    Parameters
    ----------
    auto_mkdir: bool
        Whether, when opening a file, the directory containing it should
        be created (if it doesn't already exist). This is assumed by pyarrow
        code.
    """

    root_marker = "/"
    protocol = "file", "local"
    # Marker consumed elsewhere in fsspec to identify truly-local backends.
    local_file = True

    def __init__(self, auto_mkdir=False, **kwargs):
        super().__init__(**kwargs)
        self.auto_mkdir = auto_mkdir

    @property
    def fsid(self):
        # All local instances refer to the same store, so a constant id.
        return "local"

    def mkdir(self, path, create_parents=True, **kwargs):
        """Create a directory; raises FileExistsError if it already exists."""
        path = self._strip_protocol(path)
        if self.exists(path):
            raise FileExistsError(path)
        if create_parents:
            self.makedirs(path, exist_ok=True)
        else:
            os.mkdir(path, **kwargs)

    def makedirs(self, path, exist_ok=False):
        """Recursively create ``path`` and any missing parents."""
        path = self._strip_protocol(path)
        os.makedirs(path, exist_ok=exist_ok)

    def rmdir(self, path):
        """Remove an (empty) directory."""
        path = self._strip_protocol(path)
        os.rmdir(path)

    def ls(self, path, detail=False, **kwargs):
        """List a directory, or return the single entry for a file.

        NOTE(review): ``detail`` defaults to False here, unlike some other
        fsspec implementations where it defaults to True.
        """
        path = self._strip_protocol(path)
        info = self.info(path)
        if info["type"] == "directory":
            with os.scandir(path) as it:
                infos = []
                for f in it:
                    try:
                        infos.append(self.info(f))
                    except FileNotFoundError:
                        # Entry disappeared between scandir and stat; skip it.
                        pass
        else:
            infos = [info]

        if not detail:
            return [i["name"] for i in infos]
        return infos

    def info(self, path, **kwargs):
        """Stat a path given as str, path-like or ``os.DirEntry``.

        The entry type reflects the link itself (no following), while a
        symlink's reported size is that of its target when resolvable.
        """
        if isinstance(path, os.DirEntry):
            # scandir DirEntry
            out = path.stat(follow_symlinks=False)
            link = path.is_symlink()
            if path.is_dir(follow_symlinks=False):
                t = "directory"
            elif path.is_file(follow_symlinks=False):
                t = "file"
            else:
                t = "other"

            size = out.st_size
            if link:
                try:
                    out2 = path.stat(follow_symlinks=True)
                    size = out2.st_size
                except OSError:
                    # Broken symlink: report size 0 rather than failing.
                    size = 0
            path = self._strip_protocol(path.path)
        else:
            # str or path-like
            path = self._strip_protocol(path)
            out = os.stat(path, follow_symlinks=False)
            link = stat.S_ISLNK(out.st_mode)
            if link:
                # For links, type/size come from the resolved target.
                out = os.stat(path, follow_symlinks=True)
            size = out.st_size
            if stat.S_ISDIR(out.st_mode):
                t = "directory"
            elif stat.S_ISREG(out.st_mode):
                t = "file"
            else:
                t = "other"
        result = {
            "name": path,
            "size": size,
            "type": t,
            # NOTE(review): st_ctime is metadata-change time on POSIX but
            # creation time on Windows.
            "created": out.st_ctime,
            "islink": link,
        }
        for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
            result[field] = getattr(out, f"st_{field}")
        if link:
            result["destination"] = os.readlink(path)
        return result

    def lexists(self, path, **kwargs):
        # True for broken symlinks as well, per os.path.lexists.
        return osp.lexists(path)

    def cp_file(self, path1, path2, **kwargs):
        """Copy a file's contents; for a directory source, just mkdir target."""
        path1 = self._strip_protocol(path1)
        path2 = self._strip_protocol(path2)
        if self.auto_mkdir:
            self.makedirs(self._parent(path2), exist_ok=True)
        if self.isfile(path1):
            shutil.copyfile(path1, path2)
        elif self.isdir(path1):
            self.mkdirs(path2, exist_ok=True)
        else:
            raise FileNotFoundError(path1)

    def isfile(self, path):
        path = self._strip_protocol(path)
        return os.path.isfile(path)

    def isdir(self, path):
        path = self._strip_protocol(path)
        return os.path.isdir(path)

    def get_file(self, path1, path2, callback=None, **kwargs):
        """Copy local ``path1`` into ``path2`` (a path or open file-like)."""
        if isfilelike(path2):
            with open(path1, "rb") as f:
                shutil.copyfileobj(f, path2)
        else:
            return self.cp_file(path1, path2, **kwargs)

    def put_file(self, path1, path2, callback=None, **kwargs):
        # Local-to-local "upload" is just a copy.
        return self.cp_file(path1, path2, **kwargs)

    def mv(self, path1, path2, **kwargs):
        """Move/rename a file or directory."""
        path1 = self._strip_protocol(path1)
        path2 = self._strip_protocol(path2)
        shutil.move(path1, path2)

    def link(self, src, dst, **kwargs):
        """Create a hard link ``dst`` pointing at ``src``."""
        src = self._strip_protocol(src)
        dst = self._strip_protocol(dst)
        os.link(src, dst, **kwargs)

    def symlink(self, src, dst, **kwargs):
        """Create a symbolic link ``dst`` pointing at ``src``."""
        src = self._strip_protocol(src)
        dst = self._strip_protocol(dst)
        os.symlink(src, dst, **kwargs)

    def islink(self, path) -> bool:
        return os.path.islink(self._strip_protocol(path))

    def rm_file(self, path):
        os.remove(self._strip_protocol(path))

    def rm(self, path, recursive=False, maxdepth=None):
        """Delete one or more paths; directories require ``recursive=True``."""
        if not isinstance(path, list):
            path = [path]

        for p in path:
            p = self._strip_protocol(p)
            if self.isdir(p):
                if not recursive:
                    raise ValueError("Cannot delete directory, set recursive=True")
                # Guard against wiping out the process's own cwd.
                if osp.abspath(p) == os.getcwd():
                    raise ValueError("Cannot delete current working directory")
                shutil.rmtree(p)
            else:
                os.remove(p)

    def unstrip_protocol(self, name):
        name = self._strip_protocol(name)  # normalise for local/win/...
        return f"file://{name}"

    def _open(self, path, mode="rb", block_size=None, **kwargs):
        path = self._strip_protocol(path)
        if self.auto_mkdir and "w" in mode:
            self.makedirs(self._parent(path), exist_ok=True)
        return LocalFileOpener(path, mode, fs=self, **kwargs)

    def touch(self, path, truncate=True, **kwargs):
        """Create an empty file, or update mtime; optionally truncate."""
        path = self._strip_protocol(path)
        if self.auto_mkdir:
            self.makedirs(self._parent(path), exist_ok=True)
        if self.exists(path):
            os.utime(path, None)
        else:
            open(path, "a").close()
        if truncate:
            os.truncate(path, 0)

    def created(self, path):
        """Creation time as a UTC datetime (see st_ctime caveat in info)."""
        info = self.info(path=path)
        return datetime.datetime.fromtimestamp(
            info["created"], tz=datetime.timezone.utc
        )

    def modified(self, path):
        """Last-modification time as a UTC datetime."""
        info = self.info(path=path)
        return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)

    @classmethod
    def _parent(cls, path):
        """Return the parent directory of ``path`` (drive-aware on Windows)."""
        path = cls._strip_protocol(path)
        if os.sep == "/":
            # posix native
            return path.rsplit("/", 1)[0] or "/"
        else:
            # NT
            path_ = path.rsplit("/", 1)[0]
            if len(path_) <= 3:
                if path_[1:2] == ":":
                    # nt root (something like c:/)
                    return path_[0] + ":/"
            # More cases may be required here
            return path_

    @classmethod
    def _strip_protocol(cls, path):
        """Strip file:/local: prefixes and normalise separators/trailing "/"."""
        path = stringify_path(path)
        if path.startswith("file://"):
            path = path[7:]
        elif path.startswith("file:"):
            path = path[5:]
        elif path.startswith("local://"):
            path = path[8:]
        elif path.startswith("local:"):
            path = path[6:]

        path = make_path_posix(path)
        if os.sep != "/":
            # This code-path is a stripped down version of
            # > drive, path = ntpath.splitdrive(path)
            if path[1:2] == ":":
                # Absolute drive-letter path, e.g. X:\Windows
                # Relative path with drive, e.g. X:Windows
                drive, path = path[:2], path[2:]
            elif path[:2] == "//":
                # UNC drives, e.g. \\server\share or \\?\UNC\server\share
                # Device drives, e.g. \\.\device or \\?\device
                if (index1 := path.find("/", 2)) == -1 or (
                    index2 := path.find("/", index1 + 1)
                ) == -1:
                    drive, path = path, ""
                else:
                    drive, path = path[:index2], path[index2:]
            else:
                # Relative path, e.g. Windows
                drive = ""

            path = path.rstrip("/") or cls.root_marker
            return drive + path

        else:
            return path.rstrip("/") or cls.root_marker

    def _isfilestore(self):
        # Inheriting from DaskFileSystem makes this False (S3, etc. were)
        # the original motivation. But we are a posix-like file system.
        # See https://github.com/dask/dask/issues/5526
        return True

    def chmod(self, path, mode):
        """Change the permission bits of ``path`` (see os.chmod)."""
        path = stringify_path(path)
        return os.chmod(path, mode)
| 287 |
+
|
| 288 |
+
|
| 289 |
+
def make_path_posix(path):
    """Make path generic and absolute for current OS"""
    if not isinstance(path, str):
        if isinstance(path, (list, set, tuple)):
            # apply recursively, preserving the container type
            return type(path)(make_path_posix(p) for p in path)
        path = stringify_path(path)
        if not isinstance(path, str):
            raise TypeError(f"could not convert {path!r} to string")
    if os.sep == "/":
        # Native posix
        if path.startswith("/"):
            # most common fast case for posix
            return path
        if path.startswith("~"):
            return osp.expanduser(path)
        if path.startswith("./"):
            path = path[2:]
        elif path == ".":
            path = ""
        return f"{os.getcwd()}/{path}"
    # NT handling
    if path[0:1] == "/" and path[2:3] == ":":
        # path is like "/c:/local/path"
        path = path[1:]
    if path[1:2] == ":":
        # windows full path like "C:\\local\\path"
        if len(path) <= 3:
            # nt root (something like c:/)
            return path[0] + ":/"
        return path.replace("\\", "/")
    if path[0:1] == "~":
        return make_path_posix(osp.expanduser(path))
    if path.startswith(("\\\\", "//")):
        # windows UNC/DFS-style paths
        return "//" + path[2:].replace("\\", "/")
    if path.startswith(("\\", "/")):
        # windows relative path with root
        path = path.replace("\\", "/")
        return f"{osp.splitdrive(os.getcwd())[0]}{path}"
    # plain relative path: anchor at the (posix-ified) cwd
    path = path.replace("\\", "/")
    if path.startswith("./"):
        path = path[2:]
    elif path == ".":
        path = ""
    return f"{make_path_posix(os.getcwd())}/{path}"
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
def trailing_sep(path):
    """Return True if the path ends with a path separator.

    A forward slash is always considered a path separator, even on Operating
    Systems that normally use a backslash.
    """
    # TODO: if all incoming paths were posix-compliant then separator would
    # always be a forward slash, simplifying this function.
    # See https://github.com/fsspec/filesystem_spec/pull/1250
    seps = (os.sep,) if os.altsep is None else (os.sep, os.altsep)
    return path.endswith(seps)
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
class LocalFileOpener(io.IOBase):
    """File-like object for the local filesystem.

    Wraps a plain ``open()`` file (optionally run through a compression
    codec), adding transactional semantics: with ``autocommit=False``,
    writes go to a temporary file which is only moved into place by
    ``commit()`` (or deleted by ``discard()``).
    """

    def __init__(
        self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
    ):
        logger.debug("open file: %s", path)
        self.path = path
        self.mode = mode
        self.fs = fs
        self.f = None
        self.autocommit = autocommit
        # may infer the codec from the filename extension
        self.compression = get_compression(path, compression)
        self.blocksize = io.DEFAULT_BUFFER_SIZE
        self._open()

    def _open(self):
        # (re)open the underlying file if it is missing or has been closed
        if self.f is None or self.f.closed:
            if self.autocommit or "w" not in self.mode:
                self.f = open(self.path, mode=self.mode)
                if self.compression:
                    compress = compr[self.compression]
                    self.f = compress(self.f, mode=self.mode)
            else:
                # writes buffered to a temp file until commit()
                # TODO: check if path is writable?
                i, name = tempfile.mkstemp()
                os.close(i)  # we want normal open and normal buffered file
                self.temp = name
                self.f = open(name, mode=self.mode)
            if "w" not in self.mode:
                # record the file size for read-mode consumers
                self.size = self.f.seek(0, 2)
                self.f.seek(0)
                self.f.size = self.size

    def _fetch_range(self, start, end):
        # probably only used by cached FS
        if "r" not in self.mode:
            raise ValueError
        self._open()
        self.f.seek(start)
        return self.f.read(end - start)

    def __setstate__(self, state):
        self.f = None
        loc = state.pop("loc", None)
        self.__dict__.update(state)
        if "r" in state["mode"]:
            # reopen read-mode files and restore the saved position
            self.f = None
            self._open()
            self.f.seek(loc)

    def __getstate__(self):
        d = self.__dict__.copy()
        d.pop("f")
        if "r" in self.mode:
            # remember position so __setstate__ can seek back
            d["loc"] = self.f.tell()
        else:
            if not self.f.closed:
                raise ValueError("Cannot serialise open write-mode local file")
        return d

    def commit(self):
        if self.autocommit:
            raise RuntimeError("Can only commit if not already set to autocommit")
        shutil.move(self.temp, self.path)

    def discard(self):
        if self.autocommit:
            raise RuntimeError("Cannot discard if set to autocommit")
        os.remove(self.temp)

    def readable(self) -> bool:
        return True

    def writable(self) -> bool:
        return "r" not in self.mode

    # thin delegation to the wrapped file object
    def read(self, *args, **kwargs):
        return self.f.read(*args, **kwargs)

    def write(self, *args, **kwargs):
        return self.f.write(*args, **kwargs)

    def tell(self, *args, **kwargs):
        return self.f.tell(*args, **kwargs)

    def seek(self, *args, **kwargs):
        return self.f.seek(*args, **kwargs)

    def seekable(self, *args, **kwargs):
        return self.f.seekable(*args, **kwargs)

    def readline(self, *args, **kwargs):
        return self.f.readline(*args, **kwargs)

    def readlines(self, *args, **kwargs):
        return self.f.readlines(*args, **kwargs)

    def close(self):
        return self.f.close()

    def truncate(self, size=None) -> int:
        return self.f.truncate(size)

    @property
    def closed(self):
        return self.f.closed

    def fileno(self):
        # BUGFIX: previously ``self.raw.fileno()``, which relied on
        # __getattr__ finding a ``raw`` attribute on self.f; text-mode and
        # compressed file objects have no ``raw``, raising AttributeError.
        # The file object itself exposes fileno().
        return self.f.fileno()

    def flush(self) -> None:
        self.f.flush()

    def __iter__(self):
        return self.f.__iter__()

    def __getattr__(self, item):
        # delegate anything not defined here to the wrapped file
        return getattr(self.f, item)

    def __enter__(self):
        self._incontext = True
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._incontext = False
        self.f.__exit__(exc_type, exc_value, traceback)
|
.venv/lib/python3.11/site-packages/fsspec/implementations/reference.py
ADDED
|
@@ -0,0 +1,1306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import collections
|
| 3 |
+
import io
|
| 4 |
+
import itertools
|
| 5 |
+
import logging
|
| 6 |
+
import math
|
| 7 |
+
import os
|
| 8 |
+
from functools import lru_cache
|
| 9 |
+
from itertools import chain
|
| 10 |
+
from typing import TYPE_CHECKING, Literal
|
| 11 |
+
|
| 12 |
+
import fsspec.core
|
| 13 |
+
from fsspec.spec import AbstractBufferedFile
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
import ujson as json
|
| 17 |
+
except ImportError:
|
| 18 |
+
if not TYPE_CHECKING:
|
| 19 |
+
import json
|
| 20 |
+
|
| 21 |
+
from fsspec.asyn import AsyncFileSystem
|
| 22 |
+
from fsspec.callbacks import DEFAULT_CALLBACK
|
| 23 |
+
from fsspec.core import filesystem, open, split_protocol
|
| 24 |
+
from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
|
| 25 |
+
from fsspec.utils import isfilelike, merge_offset_ranges, other_paths
|
| 26 |
+
|
| 27 |
+
logger = logging.getLogger("fsspec.reference")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class ReferenceNotReachable(RuntimeError):
    """Raised when the target URL of a reference could not be fetched."""

    def __init__(self, reference, target, *args):
        super().__init__(*args)
        # the reference key being resolved
        self.reference = reference
        # the URL (or [url, offset, size]) that failed to load
        self.target = target

    def __str__(self):
        return f'Reference "{self.reference}" failed to fetch target {self.target}'
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _first(d):
|
| 41 |
+
return next(iter(d.values()))
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _prot_in_references(path, references):
    """Protocol of the URL that ``path`` references, if it is a URL reference.

    Returns None (implicitly) for raw-bytes or missing references; empty-string
    URLs are passed through unchanged.
    """
    target = references.get(path)
    if isinstance(target, (list, tuple)) and isinstance(target[0], str):
        return split_protocol(target[0])[0] if target[0] else target[0]
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _protocol_groups(paths, references):
    """Group ``paths`` by the protocol of the URL they reference."""
    if isinstance(paths, str):
        # single path: trivial one-entry grouping
        return {_prot_in_references(paths, references): [paths]}
    groups = {}
    for p in paths:
        groups.setdefault(_prot_in_references(p, references), []).append(p)
    return groups
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class RefsValuesView(collections.abc.ValuesView):
    """Lazy values view over a LazyReferenceMapper, yielding reference data."""

    def __iter__(self):
        mapping = self._mapping
        # metadata entries, serialised to JSON bytes
        for meta in mapping.zmetadata.values():
            yield json.dumps(meta).encode()
        # explicitly-set top-level items
        yield from mapping._items.values()
        # chunk references, one field at a time
        for field in mapping.listdir():
            if len(mapping._get_chunk_sizes(field)) == 0:
                # scalar array: exactly one chunk, key "<field>/0"
                yield mapping[field + "/0"]
            else:
                yield from mapping._generate_all_records(field)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class RefsItemsView(collections.abc.ItemsView):
    """Lazy (key, value) pairs view over a LazyReferenceMapper."""

    def __iter__(self):
        # pair the keys view with the values view in lockstep
        yield from zip(self._mapping.keys(), self._mapping.values())
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def ravel_multi_index(idx, sizes):
    """Flatten the multi-dimensional chunk index ``idx`` into a single
    linear index, given the chunk count ``sizes`` along each axis (C order)."""
    flat = 0
    stride = 1
    # accumulate from the fastest-varying (last) axis outwards
    for i, size in zip(reversed(idx), reversed(sizes)):
        flat += i * stride
        stride *= size
    return flat
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class LazyReferenceMapper(collections.abc.MutableMapping):
|
| 88 |
+
"""This interface can be used to read/write references from Parquet stores.
|
| 89 |
+
It is not intended for other types of references.
|
| 90 |
+
It can be used with Kerchunk's MultiZarrToZarr method to combine
|
| 91 |
+
references into a parquet store.
|
| 92 |
+
Examples of this use-case can be found here:
|
| 93 |
+
https://fsspec.github.io/kerchunk/advanced.html?highlight=parquet#parquet-storage"""
|
| 94 |
+
|
| 95 |
+
# import is class level to prevent numpy dep requirement for fsspec
|
| 96 |
+
@property
|
| 97 |
+
def np(self):
|
| 98 |
+
import numpy as np
|
| 99 |
+
|
| 100 |
+
return np
|
| 101 |
+
|
| 102 |
+
@property
|
| 103 |
+
def pd(self):
|
| 104 |
+
import pandas as pd
|
| 105 |
+
|
| 106 |
+
return pd
|
| 107 |
+
|
| 108 |
+
def __init__(
|
| 109 |
+
self,
|
| 110 |
+
root,
|
| 111 |
+
fs=None,
|
| 112 |
+
out_root=None,
|
| 113 |
+
cache_size=128,
|
| 114 |
+
categorical_threshold=10,
|
| 115 |
+
engine: Literal["fastparquet", "pyarrow"] = "fastparquet",
|
| 116 |
+
):
|
| 117 |
+
"""
|
| 118 |
+
|
| 119 |
+
This instance will be writable, storing changes in memory until full partitions
|
| 120 |
+
are accumulated or .flush() is called.
|
| 121 |
+
|
| 122 |
+
To create an empty lazy store, use .create()
|
| 123 |
+
|
| 124 |
+
Parameters
|
| 125 |
+
----------
|
| 126 |
+
root : str
|
| 127 |
+
Root of parquet store
|
| 128 |
+
fs : fsspec.AbstractFileSystem
|
| 129 |
+
fsspec filesystem object, default is local filesystem.
|
| 130 |
+
cache_size : int, default=128
|
| 131 |
+
Maximum size of LRU cache, where cache_size*record_size denotes
|
| 132 |
+
the total number of references that can be loaded in memory at once.
|
| 133 |
+
categorical_threshold : int
|
| 134 |
+
Encode urls as pandas.Categorical to reduce memory footprint if the ratio
|
| 135 |
+
of the number of unique urls to total number of refs for each variable
|
| 136 |
+
is greater than or equal to this number. (default 10)
|
| 137 |
+
engine: Literal["fastparquet","pyarrow"]
|
| 138 |
+
Engine choice for reading parquet files. (default is "fastparquet")
|
| 139 |
+
"""
|
| 140 |
+
|
| 141 |
+
self.root = root
|
| 142 |
+
self.chunk_sizes = {}
|
| 143 |
+
self.out_root = out_root or self.root
|
| 144 |
+
self.cat_thresh = categorical_threshold
|
| 145 |
+
self.engine = engine
|
| 146 |
+
self.cache_size = cache_size
|
| 147 |
+
self.url = self.root + "/{field}/refs.{record}.parq"
|
| 148 |
+
# TODO: derive fs from `root`
|
| 149 |
+
self.fs = fsspec.filesystem("file") if fs is None else fs
|
| 150 |
+
|
| 151 |
+
from importlib.util import find_spec
|
| 152 |
+
|
| 153 |
+
if self.engine == "pyarrow" and find_spec("pyarrow") is None:
|
| 154 |
+
raise ImportError("engine choice `pyarrow` is not installed.")
|
| 155 |
+
|
| 156 |
+
def __getattr__(self, item):
|
| 157 |
+
if item in ("_items", "record_size", "zmetadata"):
|
| 158 |
+
self.setup()
|
| 159 |
+
# avoid possible recursion if setup fails somehow
|
| 160 |
+
return self.__dict__[item]
|
| 161 |
+
raise AttributeError(item)
|
| 162 |
+
|
| 163 |
+
def setup(self):
|
| 164 |
+
self._items = {}
|
| 165 |
+
self._items[".zmetadata"] = self.fs.cat_file(
|
| 166 |
+
"/".join([self.root, ".zmetadata"])
|
| 167 |
+
)
|
| 168 |
+
met = json.loads(self._items[".zmetadata"])
|
| 169 |
+
self.record_size = met["record_size"]
|
| 170 |
+
self.zmetadata = met["metadata"]
|
| 171 |
+
|
| 172 |
+
# Define function to open and decompress refs
|
| 173 |
+
@lru_cache(maxsize=self.cache_size)
|
| 174 |
+
def open_refs(field, record):
|
| 175 |
+
"""cached parquet file loader"""
|
| 176 |
+
path = self.url.format(field=field, record=record)
|
| 177 |
+
data = io.BytesIO(self.fs.cat_file(path))
|
| 178 |
+
try:
|
| 179 |
+
df = self.pd.read_parquet(data, engine=self.engine)
|
| 180 |
+
refs = {c: df[c].to_numpy() for c in df.columns}
|
| 181 |
+
except OSError:
|
| 182 |
+
refs = None
|
| 183 |
+
return refs
|
| 184 |
+
|
| 185 |
+
self.open_refs = open_refs
|
| 186 |
+
|
| 187 |
+
@staticmethod
|
| 188 |
+
def create(root, storage_options=None, fs=None, record_size=10000, **kwargs):
|
| 189 |
+
"""Make empty parquet reference set
|
| 190 |
+
|
| 191 |
+
First deletes the contents of the given directory, if it exists.
|
| 192 |
+
|
| 193 |
+
Parameters
|
| 194 |
+
----------
|
| 195 |
+
root: str
|
| 196 |
+
Directory to contain the output; will be created
|
| 197 |
+
storage_options: dict | None
|
| 198 |
+
For making the filesystem to use for writing is fs is None
|
| 199 |
+
fs: FileSystem | None
|
| 200 |
+
Filesystem for writing
|
| 201 |
+
record_size: int
|
| 202 |
+
Number of references per parquet file
|
| 203 |
+
kwargs: passed to __init__
|
| 204 |
+
|
| 205 |
+
Returns
|
| 206 |
+
-------
|
| 207 |
+
LazyReferenceMapper instance
|
| 208 |
+
"""
|
| 209 |
+
met = {"metadata": {}, "record_size": record_size}
|
| 210 |
+
if fs is None:
|
| 211 |
+
fs, root = fsspec.core.url_to_fs(root, **(storage_options or {}))
|
| 212 |
+
if fs.exists(root):
|
| 213 |
+
fs.rm(root, recursive=True)
|
| 214 |
+
fs.makedirs(root, exist_ok=True)
|
| 215 |
+
fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
|
| 216 |
+
return LazyReferenceMapper(root, fs, **kwargs)
|
| 217 |
+
|
| 218 |
+
@lru_cache()
|
| 219 |
+
def listdir(self):
|
| 220 |
+
"""List top-level directories"""
|
| 221 |
+
dirs = (p.rsplit("/", 1)[0] for p in self.zmetadata if not p.startswith(".z"))
|
| 222 |
+
return set(dirs)
|
| 223 |
+
|
| 224 |
+
def ls(self, path="", detail=True):
|
| 225 |
+
"""Shortcut file listings"""
|
| 226 |
+
path = path.rstrip("/")
|
| 227 |
+
pathdash = path + "/" if path else ""
|
| 228 |
+
dirnames = self.listdir()
|
| 229 |
+
dirs = [
|
| 230 |
+
d
|
| 231 |
+
for d in dirnames
|
| 232 |
+
if d.startswith(pathdash) and "/" not in d.lstrip(pathdash)
|
| 233 |
+
]
|
| 234 |
+
if dirs:
|
| 235 |
+
others = {
|
| 236 |
+
f
|
| 237 |
+
for f in chain(
|
| 238 |
+
[".zmetadata"],
|
| 239 |
+
(name for name in self.zmetadata),
|
| 240 |
+
(name for name in self._items),
|
| 241 |
+
)
|
| 242 |
+
if f.startswith(pathdash) and "/" not in f.lstrip(pathdash)
|
| 243 |
+
}
|
| 244 |
+
if detail is False:
|
| 245 |
+
others.update(dirs)
|
| 246 |
+
return sorted(others)
|
| 247 |
+
dirinfo = [{"name": name, "type": "directory", "size": 0} for name in dirs]
|
| 248 |
+
fileinfo = [
|
| 249 |
+
{
|
| 250 |
+
"name": name,
|
| 251 |
+
"type": "file",
|
| 252 |
+
"size": len(
|
| 253 |
+
json.dumps(self.zmetadata[name])
|
| 254 |
+
if name in self.zmetadata
|
| 255 |
+
else self._items[name]
|
| 256 |
+
),
|
| 257 |
+
}
|
| 258 |
+
for name in others
|
| 259 |
+
]
|
| 260 |
+
return sorted(dirinfo + fileinfo, key=lambda s: s["name"])
|
| 261 |
+
field = path
|
| 262 |
+
others = set(
|
| 263 |
+
[name for name in self.zmetadata if name.startswith(f"{path}/")]
|
| 264 |
+
+ [name for name in self._items if name.startswith(f"{path}/")]
|
| 265 |
+
)
|
| 266 |
+
fileinfo = [
|
| 267 |
+
{
|
| 268 |
+
"name": name,
|
| 269 |
+
"type": "file",
|
| 270 |
+
"size": len(
|
| 271 |
+
json.dumps(self.zmetadata[name])
|
| 272 |
+
if name in self.zmetadata
|
| 273 |
+
else self._items[name]
|
| 274 |
+
),
|
| 275 |
+
}
|
| 276 |
+
for name in others
|
| 277 |
+
]
|
| 278 |
+
keys = self._keys_in_field(field)
|
| 279 |
+
|
| 280 |
+
if detail is False:
|
| 281 |
+
return list(others) + list(keys)
|
| 282 |
+
recs = self._generate_all_records(field)
|
| 283 |
+
recinfo = [
|
| 284 |
+
{"name": name, "type": "file", "size": rec[-1]}
|
| 285 |
+
for name, rec in zip(keys, recs)
|
| 286 |
+
if rec[0] # filters out path==None, deleted/missing
|
| 287 |
+
]
|
| 288 |
+
return fileinfo + recinfo
|
| 289 |
+
|
| 290 |
+
def _load_one_key(self, key):
|
| 291 |
+
"""Get the reference for one key
|
| 292 |
+
|
| 293 |
+
Returns bytes, one-element list or three-element list.
|
| 294 |
+
"""
|
| 295 |
+
if key in self._items:
|
| 296 |
+
return self._items[key]
|
| 297 |
+
elif key in self.zmetadata:
|
| 298 |
+
return json.dumps(self.zmetadata[key]).encode()
|
| 299 |
+
elif "/" not in key or self._is_meta(key):
|
| 300 |
+
raise KeyError(key)
|
| 301 |
+
field, _ = key.rsplit("/", 1)
|
| 302 |
+
record, ri, chunk_size = self._key_to_record(key)
|
| 303 |
+
maybe = self._items.get((field, record), {}).get(ri, False)
|
| 304 |
+
if maybe is None:
|
| 305 |
+
# explicitly deleted
|
| 306 |
+
raise KeyError
|
| 307 |
+
elif maybe:
|
| 308 |
+
return maybe
|
| 309 |
+
elif chunk_size == 0:
|
| 310 |
+
return b""
|
| 311 |
+
|
| 312 |
+
# Chunk keys can be loaded from row group and cached in LRU cache
|
| 313 |
+
try:
|
| 314 |
+
refs = self.open_refs(field, record)
|
| 315 |
+
except (ValueError, TypeError, FileNotFoundError) as exc:
|
| 316 |
+
raise KeyError(key) from exc
|
| 317 |
+
columns = ["path", "offset", "size", "raw"]
|
| 318 |
+
selection = [refs[c][ri] if c in refs else None for c in columns]
|
| 319 |
+
raw = selection[-1]
|
| 320 |
+
if raw is not None:
|
| 321 |
+
return raw
|
| 322 |
+
if selection[0] is None:
|
| 323 |
+
raise KeyError("This reference does not exist or has been deleted")
|
| 324 |
+
if selection[1:3] == [0, 0]:
|
| 325 |
+
# URL only
|
| 326 |
+
return selection[:1]
|
| 327 |
+
# URL, offset, size
|
| 328 |
+
return selection[:3]
|
| 329 |
+
|
| 330 |
+
@lru_cache(4096)
|
| 331 |
+
def _key_to_record(self, key):
|
| 332 |
+
"""Details needed to construct a reference for one key"""
|
| 333 |
+
field, chunk = key.rsplit("/", 1)
|
| 334 |
+
chunk_sizes = self._get_chunk_sizes(field)
|
| 335 |
+
if len(chunk_sizes) == 0:
|
| 336 |
+
return 0, 0, 0
|
| 337 |
+
chunk_idx = [int(c) for c in chunk.split(".")]
|
| 338 |
+
chunk_number = ravel_multi_index(chunk_idx, chunk_sizes)
|
| 339 |
+
record = chunk_number // self.record_size
|
| 340 |
+
ri = chunk_number % self.record_size
|
| 341 |
+
return record, ri, len(chunk_sizes)
|
| 342 |
+
|
| 343 |
+
def _get_chunk_sizes(self, field):
|
| 344 |
+
"""The number of chunks along each axis for a given field"""
|
| 345 |
+
if field not in self.chunk_sizes:
|
| 346 |
+
zarray = self.zmetadata[f"{field}/.zarray"]
|
| 347 |
+
size_ratio = [
|
| 348 |
+
math.ceil(s / c) for s, c in zip(zarray["shape"], zarray["chunks"])
|
| 349 |
+
]
|
| 350 |
+
self.chunk_sizes[field] = size_ratio or [1]
|
| 351 |
+
return self.chunk_sizes[field]
|
| 352 |
+
|
| 353 |
+
def _generate_record(self, field, record):
|
| 354 |
+
"""The references for a given parquet file of a given field"""
|
| 355 |
+
refs = self.open_refs(field, record)
|
| 356 |
+
it = iter(zip(*refs.values()))
|
| 357 |
+
if len(refs) == 3:
|
| 358 |
+
# All urls
|
| 359 |
+
return (list(t) for t in it)
|
| 360 |
+
elif len(refs) == 1:
|
| 361 |
+
# All raws
|
| 362 |
+
return refs["raw"]
|
| 363 |
+
else:
|
| 364 |
+
# Mix of urls and raws
|
| 365 |
+
return (list(t[:3]) if not t[3] else t[3] for t in it)
|
| 366 |
+
|
| 367 |
+
def _generate_all_records(self, field):
|
| 368 |
+
"""Load all the references within a field by iterating over the parquet files"""
|
| 369 |
+
nrec = 1
|
| 370 |
+
for ch in self._get_chunk_sizes(field):
|
| 371 |
+
nrec *= ch
|
| 372 |
+
nrec = math.ceil(nrec / self.record_size)
|
| 373 |
+
for record in range(nrec):
|
| 374 |
+
yield from self._generate_record(field, record)
|
| 375 |
+
|
| 376 |
+
def values(self):
|
| 377 |
+
return RefsValuesView(self)
|
| 378 |
+
|
| 379 |
+
def items(self):
|
| 380 |
+
return RefsItemsView(self)
|
| 381 |
+
|
| 382 |
+
def __hash__(self):
|
| 383 |
+
return id(self)
|
| 384 |
+
|
| 385 |
+
def __getitem__(self, key):
|
| 386 |
+
return self._load_one_key(key)
|
| 387 |
+
|
| 388 |
+
def __setitem__(self, key, value):
|
| 389 |
+
if "/" in key and not self._is_meta(key):
|
| 390 |
+
field, chunk = key.rsplit("/", 1)
|
| 391 |
+
record, i, _ = self._key_to_record(key)
|
| 392 |
+
subdict = self._items.setdefault((field, record), {})
|
| 393 |
+
subdict[i] = value
|
| 394 |
+
if len(subdict) == self.record_size:
|
| 395 |
+
self.write(field, record)
|
| 396 |
+
else:
|
| 397 |
+
# metadata or top-level
|
| 398 |
+
if hasattr(value, "to_bytes"):
|
| 399 |
+
val = value.to_bytes().decode()
|
| 400 |
+
elif isinstance(value, bytes):
|
| 401 |
+
val = value.decode()
|
| 402 |
+
else:
|
| 403 |
+
val = value
|
| 404 |
+
self._items[key] = val
|
| 405 |
+
new_value = json.loads(val)
|
| 406 |
+
self.zmetadata[key] = {**self.zmetadata.get(key, {}), **new_value}
|
| 407 |
+
|
| 408 |
+
@staticmethod
|
| 409 |
+
def _is_meta(key):
|
| 410 |
+
return key.startswith(".z") or "/.z" in key
|
| 411 |
+
|
| 412 |
+
def __delitem__(self, key):
|
| 413 |
+
if key in self._items:
|
| 414 |
+
del self._items[key]
|
| 415 |
+
elif key in self.zmetadata:
|
| 416 |
+
del self.zmetadata[key]
|
| 417 |
+
else:
|
| 418 |
+
if "/" in key and not self._is_meta(key):
|
| 419 |
+
field, _ = key.rsplit("/", 1)
|
| 420 |
+
record, i, _ = self._key_to_record(key)
|
| 421 |
+
subdict = self._items.setdefault((field, record), {})
|
| 422 |
+
subdict[i] = None
|
| 423 |
+
if len(subdict) == self.record_size:
|
| 424 |
+
self.write(field, record)
|
| 425 |
+
else:
|
| 426 |
+
# metadata or top-level
|
| 427 |
+
self._items[key] = None
|
| 428 |
+
|
| 429 |
+
def write(self, field, record, base_url=None, storage_options=None):
    """Persist one (field, record) partition as a parquet reference file.

    Parameters
    ----------
    field: str
        Array/variable name the partition belongs to.
    record: int
        Record index within the field.
    base_url: str, optional
        Output root; defaults to ``self.out_root``.
    storage_options: dict, optional
        Passed to the parquet writer's filesystem layer.
    """
    # extra requirements if writing
    import kerchunk.df
    import numpy as np
    import pandas as pd

    partition = self._items[(field, record)]

    # Partial partition: start from what is already on disk, if anything.
    existing = False
    if len(partition) < self.record_size:
        try:
            existing = self.open_refs(field, record)
        except OSError:
            pass

    if existing:
        paths = existing["path"]
        offsets = existing["offset"]
        sizes = existing["size"]
        raws = existing["raw"]
    else:
        paths = np.full(self.record_size, np.nan, dtype="O")
        offsets = np.zeros(self.record_size, dtype="int64")
        sizes = np.zeros(self.record_size, dtype="int64")
        raws = np.full(self.record_size, np.nan, dtype="O")

    for idx, entry in partition.items():
        if isinstance(entry, list):
            # [url] or [url, offset, size] style reference
            if (
                str(paths.dtype) == "category"
                and entry[0] not in paths.dtype.categories
            ):
                paths = paths.add_categories(entry[0])
            paths[idx] = entry[0]
            if len(entry) > 1:
                offsets[idx] = entry[1]
                sizes[idx] = entry[2]
        elif entry is None:
            # delete
            paths[idx] = None
            offsets[idx] = 0
            sizes[idx] = 0
            raws[idx] = None
        else:
            # this is the only call into kerchunk, could remove
            raws[idx] = kerchunk.df._proc_raw(entry)

    # TODO: only save needed columns
    frame = pd.DataFrame(
        {
            "path": paths,
            "offset": offsets,
            "size": sizes,
            "raw": raws,
        },
        copy=False,
    )
    # Categorise paths when there is heavy duplication
    if frame.path.count() / (frame.path.nunique() or 1) > self.cat_thresh:
        frame["path"] = frame["path"].astype("category")
    object_encoding = {"raw": "bytes", "path": "utf8"}
    has_nulls = ["path", "raw"]

    out_path = f"{base_url or self.out_root}/{field}/refs.{record}.parq"
    self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True)

    if self.engine == "pyarrow":
        df_backend_kwargs = {"write_statistics": False}
    elif self.engine == "fastparquet":
        df_backend_kwargs = {
            "stats": False,
            "object_encoding": object_encoding,
            "has_nulls": has_nulls,
        }
    else:
        raise NotImplementedError(f"{self.engine} not supported")

    frame.to_parquet(
        out_path,
        engine=self.engine,
        storage_options=storage_options
        or getattr(self.fs, "storage_options", None),
        compression="zstd",
        index=False,
        **df_backend_kwargs,
    )

    # Partition is now on disk; drop it from the in-memory staging area.
    partition.clear()
    self._items.pop((field, record))
|
| 514 |
+
|
| 515 |
+
def flush(self, base_url=None, storage_options=None):
    """Output any modified or deleted keys

    Parameters
    ----------
    base_url: str
        Location of the output
    """
    # write what we have so far and clear sub chunks
    for entry in list(self._items):
        if isinstance(entry, tuple):
            field, record = entry
            self.write(
                field,
                record,
                base_url=base_url,
                storage_options=storage_options,
            )

    # gather .zmetadata from self._items and write that too
    for k in list(self._items):
        if k != ".zmetadata" and ".z" in k:
            self.zmetadata[k] = json.loads(self._items.pop(k))
    payload = {"metadata": self.zmetadata, "record_size": self.record_size}
    self._items.clear()
    self._items[".zmetadata"] = json.dumps(payload).encode()
    self.fs.pipe(
        "/".join([base_url or self.out_root, ".zmetadata"]),
        self._items[".zmetadata"],
    )

    # TODO: only clear those that we wrote to?
    self.open_refs.cache_clear()
|
| 549 |
+
|
| 550 |
+
def __len__(self):
    # Caveat: counts *expected* references from the chunk grid, not the
    # references actually present - but it is fast.
    total = sum(
        1 if field.startswith(".") else math.prod(self._get_chunk_sizes(field))
        for field in self.listdir()
    )
    # all metadata keys
    total += len(self.zmetadata)
    # any other staged files not held in reference partitions
    total += sum(1 for k in self._items if not isinstance(k, tuple))
    return total
|
| 562 |
+
|
| 563 |
+
def __iter__(self):
    # Caveat: yields only keys that actually exist, so the count will not
    # generally match len(self).
    seen = set(self.zmetadata)
    seen.update(self._items)
    for name in seen:
        # staged partitions are keyed by tuples; skip those
        if isinstance(name, str):
            yield name
    for field in self.listdir():
        for key in self._keys_in_field(field):
            if key in self:
                yield key
|
| 575 |
+
|
| 576 |
+
def __contains__(self, item):
    """EAFP membership test: try to load the key, map KeyError to False."""
    try:
        self._load_one_key(item)
    except KeyError:
        return False
    return True
|
| 582 |
+
|
| 583 |
+
def _keys_in_field(self, field):
    """List key names in given field

    Produces strings like "field/x.y" appropriate from the chunking of the array
    """
    chunk_sizes = self._get_chunk_sizes(field)
    if not chunk_sizes:
        # scalar array: a single chunk named "0"
        yield field + "/0"
        return
    for index in itertools.product(*map(range, chunk_sizes)):
        yield field + "/" + ".".join(str(c) for c in index)
|
| 595 |
+
|
| 596 |
+
|
| 597 |
+
class ReferenceFileSystem(AsyncFileSystem):
    """View byte ranges of some other file as a file system
    Initial version: single file system target, which must support
    async, and must allow start and end args in _cat_file. Later versions
    may allow multiple arbitrary URLs for the targets.
    This FileSystem is read-only. It is designed to be used with async
    targets (for now). We do not get original file details from the target FS.
    Configuration is by passing a dict of references at init, or a URL to
    a JSON file containing the same; this dict
    can also contain concrete data for some set of paths.
    Reference dict format:
    {path0: bytes_data, path1: (target_url, offset, size)}
    https://github.com/fsspec/kerchunk/blob/main/README.md
    """

    protocol = "reference"
    cachable = False

    def __init__(
        self,
        fo,
        target=None,
        ref_storage_args=None,
        target_protocol=None,
        target_options=None,
        remote_protocol=None,
        remote_options=None,
        fs=None,
        template_overrides=None,
        simple_templates=True,
        max_gap=64_000,
        max_block=256_000_000,
        cache_size=128,
        **kwargs,
    ):
        """
        Parameters
        ----------
        fo : dict or str
            The set of references to use for this instance, with a structure as above.
            If str referencing a JSON file, will use fsspec.open, in conjunction
            with target_options and target_protocol to open and parse JSON at this
            location. If a directory, then assume references are a set of parquet
            files to be loaded lazily.
        target : str
            For any references having target_url as None, this is the default file
            target to use
        ref_storage_args : dict
            If references is a str, use these kwargs for loading the JSON file.
            Deprecated: use target_options instead.
        target_protocol : str
            Used for loading the reference file, if it is a path. If None, protocol
            will be derived from the given path
        target_options : dict
            Extra FS options for loading the reference file ``fo``, if given as a path
        remote_protocol : str
            The protocol of the filesystem on which the references will be evaluated
            (unless fs is provided). If not given, will be derived from the first
            URL that has a protocol in the templates or in the references, in that
            order.
        remote_options : dict
            kwargs to go with remote_protocol
        fs : AbstractFileSystem | dict(str, (AbstractFileSystem | dict))
            Directly provide a file system(s):
            - a single filesystem instance
            - a dict of protocol:filesystem, where each value is either a filesystem
              instance, or a dict of kwargs that can be used to create in
              instance for the given protocol

            If this is given, remote_options and remote_protocol are ignored.
        template_overrides : dict
            Swap out any templates in the references file with these - useful for
            testing.
        simple_templates: bool
            Whether templates can be processed with simple replace (True) or if
            jinja is needed (False, much slower). All reference sets produced by
            ``kerchunk`` are simple in this sense, but the spec allows for complex.
        max_gap, max_block: int
            For merging multiple concurrent requests to the same remote file.
            Neighboring byte ranges will only be merged when their
            inter-range gap is <= ``max_gap``. Default is 64KB. Set to 0
            to only merge when it requires no extra bytes. Pass a negative
            number to disable merging, appropriate for local target files.
            Neighboring byte ranges will only be merged when the size of
            the aggregated range is <= ``max_block``. Default is 256MB.
        cache_size : int
            Maximum size of LRU cache, where cache_size*record_size denotes
            the total number of references that can be loaded in memory at once.
            Only used for lazily loaded references.
        kwargs : passed to parent class
        """
        super().__init__(**kwargs)
        self.target = target
        self.template_overrides = template_overrides
        self.simple_templates = simple_templates
        self.templates = {}
        self.fss = {}
        self._dircache = {}
        self.max_gap = max_gap
        self.max_block = max_block

        if isinstance(fo, str):
            # references given as a URL: either one JSON file or a lazy
            # parquet directory
            ref_so = dict(
                **(ref_storage_args or target_options or {}), protocol=target_protocol
            )
            ref_fs, fo2 = fsspec.core.url_to_fs(fo, **ref_so)
            if ref_fs.isfile(fo2):
                # text JSON
                with fsspec.open(fo, "rb", **ref_so) as f:
                    logger.info("Read reference from URL %s", fo)
                    text = json.load(f)
                self._process_references(text, template_overrides)
            else:
                # Lazy parquet refs
                logger.info("Open lazy reference dict from URL %s", fo)
                self.references = LazyReferenceMapper(
                    fo2,
                    fs=ref_fs,
                    cache_size=cache_size,
                )
        else:
            # dictionaries
            self._process_references(fo, template_overrides)

        if isinstance(fs, dict):
            # explicit per-protocol filesystems (instances or kwargs dicts)
            self.fss = {
                k: (
                    fsspec.filesystem(k.split(":", 1)[0], **opts)
                    if isinstance(opts, dict)
                    else opts
                )
                for k, opts in fs.items()
            }
            if None not in self.fss:
                self.fss[None] = filesystem("file")
            return

        if fs is not None:
            # single remote FS
            remote_protocol = (
                fs.protocol[0] if isinstance(fs.protocol, tuple) else fs.protocol
            )
            self.fss[remote_protocol] = fs

        if remote_protocol is None:
            # get single protocol from any templates
            for ref in self.templates.values():
                if callable(ref):
                    ref = ref()
                protocol, _ = fsspec.core.split_protocol(ref)
                if protocol and protocol not in self.fss:
                    fs = filesystem(protocol, **(remote_options or {}))
                    self.fss[protocol] = fs
        if remote_protocol is None:
            # get single protocol from references
            # TODO: warning here, since this can be very expensive?
            for ref in self.references.values():
                if callable(ref):
                    ref = ref()
                if isinstance(ref, list) and ref[0]:
                    protocol, _ = fsspec.core.split_protocol(ref[0])
                    if protocol not in self.fss:
                        fs = filesystem(protocol, **(remote_options or {}))
                        self.fss[protocol] = fs
                    # only use first remote URL
                    break

        if remote_protocol and remote_protocol not in self.fss:
            fs = filesystem(remote_protocol, **(remote_options or {}))
            self.fss[remote_protocol] = fs

        self.fss[None] = fs or filesystem("file")  # default one
        # Wrap any non-async filesystems to ensure async methods are available below
        for k, f in self.fss.items():
            if not f.async_impl:
                self.fss[k] = AsyncFileSystemWrapper(f)
            elif self.asynchronous ^ f.asynchronous:
                raise ValueError(
                    "Reference-FS's target filesystem must have same value"
                    "of asynchronous"
                )
|
| 775 |
+
|
| 776 |
+
def _cat_common(self, path, start=None, end=None):
    """Resolve a reference to either inline bytes or (url, start, end).

    Returns ``(bytes, None, None)`` for inline data, otherwise a target URL
    with absolute byte offsets; ``start``/``end`` are interpreted relative to
    the referenced range (negative values count from its end).
    """
    path = self._strip_protocol(path)
    logger.debug(f"cat: {path}")
    try:
        part = self.references[path]
    except KeyError as exc:
        raise FileNotFoundError(path) from exc
    if isinstance(part, str):
        part = part.encode()
    if hasattr(part, "to_bytes"):
        part = part.to_bytes()
    if isinstance(part, bytes):
        # inline data; may be base64-encoded
        logger.debug(f"Reference: {path}, type bytes")
        if part.startswith(b"base64:"):
            part = base64.b64decode(part[7:])
        return part, None, None

    if len(part) == 1:
        # whole-file reference: pass caller offsets straight through
        logger.debug(f"Reference: {path}, whole file => {part}")
        url = part[0]
        abs_start, abs_end = start, end
    else:
        url, ref_start, size = part
        logger.debug(f"Reference: {path} => {url}, offset {ref_start}, size {size}")
        ref_end = ref_start + size

        # translate caller-relative offsets into absolute file offsets
        if start is None:
            abs_start = ref_start
        else:
            abs_start = (ref_start if start >= 0 else ref_end) + start
        if end is None:
            abs_end = ref_end
        else:
            abs_end = (ref_start if end >= 0 else ref_end) + end
    if url is None:
        url = self.target
    return url, abs_start, abs_end
|
| 819 |
+
|
| 820 |
+
async def _cat_file(self, path, start=None, end=None, **kwargs):
    """Async fetch of one reference; inline bytes are sliced locally."""
    part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
    if isinstance(part_or_url, bytes):
        return part_or_url[start:end]
    protocol, _ = split_protocol(part_or_url)
    try:
        return await self.fss[protocol]._cat_file(
            part_or_url, start=start0, end=end0
        )
    except Exception as e:
        # surface which reference target failed
        raise ReferenceNotReachable(path, part_or_url) from e

def cat_file(self, path, start=None, end=None, **kwargs):
    """Sync counterpart of :meth:`_cat_file`."""
    part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
    if isinstance(part_or_url, bytes):
        return part_or_url[start:end]
    protocol, _ = split_protocol(part_or_url)
    try:
        return self.fss[protocol].cat_file(part_or_url, start=start0, end=end0)
    except Exception as e:
        raise ReferenceNotReachable(path, part_or_url) from e
|
| 841 |
+
|
| 842 |
+
def pipe_file(self, path, value, **_):
    """Temporarily add binary data or reference as a file"""
    # stored in the in-memory reference set only; not persisted
    self.references[path] = value
|
| 845 |
+
|
| 846 |
+
async def _get_file(self, rpath, lpath, **kwargs):
    """Async download of one key to a local path (mkdir for directories)."""
    if self.isdir(rpath):
        return os.makedirs(lpath, exist_ok=True)
    data = await self._cat_file(rpath)
    with open(lpath, "wb") as f:
        f.write(data)

def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, **kwargs):
    """Sync download of one key; reports progress through ``callback``."""
    if self.isdir(rpath):
        return os.makedirs(lpath, exist_ok=True)
    data = self.cat_file(rpath, **kwargs)
    callback.set_size(len(data))
    if isfilelike(lpath):
        # already an open file-like object
        lpath.write(data)
    else:
        with open(lpath, "wb") as f:
            f.write(data)
    callback.absolute_update(len(data))
|
| 864 |
+
|
| 865 |
+
def get(self, rpath, lpath, recursive=False, **kwargs):
    """Copy reference contents to local files via a batched cat()."""
    if recursive:
        # trigger directory build
        self.ls("")
    rpath = self.expand_path(rpath, recursive=recursive)
    local_fs = fsspec.filesystem("file", auto_mkdir=True)
    targets = other_paths(rpath, lpath)
    if recursive:
        # directories have no bytes to fetch
        data = self.cat([r for r in rpath if not self.isdir(r)])
    else:
        data = self.cat(rpath)
    for remote, local in zip(rpath, targets):
        if remote in data:
            local_fs.pipe_file(local, data[remote])
|
| 879 |
+
|
| 880 |
+
def cat(self, path, recursive=False, on_error="raise", **kwargs):
    """Fetch one or many reference keys, merging adjacent byte ranges.

    ``on_error`` may be "raise", "omit", or anything else to return the
    exception object in the result mapping.
    """
    if isinstance(path, str) and recursive:
        raise NotImplementedError
    if isinstance(path, list) and (recursive or any("*" in p for p in path)):
        raise NotImplementedError
    # TODO: if references is lazy, pre-fetch all paths in batch before access
    proto_dict = _protocol_groups(path, self.references)
    out = {}
    for proto, paths in proto_dict.items():
        fs = self.fss[proto]
        urls, starts, ends, valid_paths = [], [], [], []
        for p in paths:
            # find references or label not-found. Early exit if any not
            # found and on_error is "raise"
            try:
                u, s, e = self._cat_common(p)
                if not isinstance(u, (bytes, str)):
                    # nan/None from parquet
                    continue
            except FileNotFoundError as err:
                if on_error == "raise":
                    raise
                if on_error != "omit":
                    out[p] = err
            else:
                urls.append(u)
                starts.append(s)
                ends.append(e)
                valid_paths.append(p)

        # process references into form for merging
        urls2, starts2, ends2, paths2 = [], [], [], []
        whole_files = set()
        for u, s, e, p in zip(urls, starts, ends, valid_paths):
            if isinstance(u, bytes):
                # data
                out[p] = u
            elif s is None:
                # whole file - limits are None, None, but no further
                # entries take for this file
                whole_files.add(u)
                urls2.append(u)
                starts2.append(s)
                ends2.append(e)
                paths2.append(p)
        for u, s, e, p in zip(urls, starts, ends, valid_paths):
            # second run to account for files that are to be loaded whole
            if s is not None and u not in whole_files:
                urls2.append(u)
                starts2.append(s)
                ends2.append(e)
                paths2.append(p)

        # merge and fetch consolidated ranges
        new_paths, new_starts, new_ends = merge_offset_ranges(
            list(urls2),
            list(starts2),
            list(ends2),
            sort=True,
            max_gap=self.max_gap,
            max_block=self.max_block,
        )
        bytes_out = fs.cat_ranges(new_paths, new_starts, new_ends)

        # unbundle from merged bytes - simple approach
        for u, s, e, p in zip(urls, starts, ends, valid_paths):
            if p in out:
                continue  # was bytes, already handled
            for np, ns, ne, b in zip(new_paths, new_starts, new_ends, bytes_out):
                if np == u and (ns is None or ne is None):
                    # whole-file fetch; slice by the original request
                    if isinstance(b, Exception):
                        out[p] = b
                    else:
                        out[p] = b[s:e]
                elif np == u and s >= ns and e <= ne:
                    # carve this reference out of the merged block
                    if isinstance(b, Exception):
                        out[p] = b
                    else:
                        out[p] = b[s - ns : (e - ne) or None]

    for k, v in out.copy().items():
        # these were valid references, but fetch failed, so transform exc
        if isinstance(v, Exception) and k in self.references:
            ex = out[k]
            new_ex = ReferenceNotReachable(k, self.references[k])
            new_ex.__cause__ = ex
            if on_error == "raise":
                raise new_ex
            elif on_error != "omit":
                out[k] = new_ex

    if len(out) == 1 and isinstance(path, str) and "*" not in path:
        return _first(out)
    return out
|
| 977 |
+
|
| 978 |
+
def _process_references(self, references, template_overrides=None):
    """Dispatch reference parsing by spec version (absent -> v0)."""
    version = references.get("version", None)
    if version is None:
        self._process_references0(references)
    elif version == 1:
        self._process_references1(references, template_overrides=template_overrides)
    else:
        raise ValueError(f"Unknown reference spec version: {version}")
    # TODO: we make dircache by iterating over all entries, but for Spec >= 1,
    # can replace with programmatic. Is it even needed for mapper interface?
|
| 988 |
+
|
| 989 |
+
def _process_references0(self, references):
    """Make reference dict for Spec Version 0"""
    if isinstance(references, dict):
        # do not do this for lazy/parquet backend, which will not make dicts,
        # but must remain writable in the original object
        references = {
            key: json.dumps(val) if isinstance(val, dict) else val
            for key, val in references.items()
        }
    self.references = references
|
| 999 |
+
|
| 1000 |
+
def _process_references1(self, references, template_overrides=None):
    """Make reference dict for Spec Version 1.

    Renders URL templates (simple ``str.format`` or jinja2), decodes inline
    base64 data, and serialises dict values back to JSON strings.
    """
    if not self.simple_templates or self.templates:
        import jinja2
    self.references = {}
    self._process_templates(references.get("templates", {}))

    @lru_cache(1000)
    def _render_jinja(u):
        return jinja2.Template(u).render(**self.templates)

    for k, v in references.get("refs", {}).items():
        if isinstance(v, str):
            if v.startswith("base64:"):
                # BUGFIX: previously this decode was immediately overwritten
                # by an unconditional `self.references[k] = v`, storing the
                # raw "base64:..." text instead of the decoded bytes.
                # Downstream (_cat_common) already handles bytes values.
                self.references[k] = base64.b64decode(v[7:])
            else:
                self.references[k] = v
        elif isinstance(v, dict):
            self.references[k] = json.dumps(v)
        elif self.templates:
            u = v[0]
            if "{{" in u:
                if self.simple_templates:
                    u = (
                        u.replace("{{", "{")
                        .replace("}}", "}")
                        .format(**self.templates)
                    )
                else:
                    u = _render_jinja(u)
            self.references[k] = [u] if len(v) == 1 else [u, v[1], v[2]]
        else:
            self.references[k] = v
    self.references.update(self._process_gen(references.get("gen", [])))
|
| 1032 |
+
|
| 1033 |
+
def _process_templates(self, tmp):
    """Store URL templates; jinja-style ones become render callables."""
    self.templates = {}
    if self.template_overrides is not None:
        tmp.update(self.template_overrides)
    for name, template in tmp.items():
        if "{{" not in template:
            # plain string substitution target
            self.templates[name] = template
            continue
        import jinja2

        # bind the template text as a default arg to avoid late-binding
        self.templates[name] = lambda temp=template, **kwargs: jinja2.Template(
            temp
        ).render(**kwargs)
|
| 1046 |
+
|
| 1047 |
+
def _process_gen(self, gens):
    """Expand "gen" entries (spec v1) into concrete reference lists."""
    out = {}
    for gen in gens:
        # each dimension is either an explicit list or a start/stop/step range
        dimension = {
            k: (
                v
                if isinstance(v, list)
                else range(v.get("start", 0), v["stop"], v.get("step", 1))
            )
            for k, v in gen["dimensions"].items()
        }
        products = (
            dict(zip(dimension.keys(), values))
            for values in itertools.product(*dimension.values())
        )
        for pr in products:
            import jinja2

            key = jinja2.Template(gen["key"]).render(**pr, **self.templates)
            url = jinja2.Template(gen["url"]).render(**pr, **self.templates)
            if ("offset" in gen) and ("length" in gen):
                offset = int(
                    jinja2.Template(gen["offset"]).render(**pr, **self.templates)
                )
                length = int(
                    jinja2.Template(gen["length"]).render(**pr, **self.templates)
                )
                out[key] = [url, offset, length]
            elif ("offset" in gen) ^ ("length" in gen):
                raise ValueError(
                    "Both 'offset' and 'length' are required for a "
                    "reference generator entry if either is provided."
                )
            else:
                out[key] = [url]
    return out
|
| 1083 |
+
|
| 1084 |
+
def _dircache_from_items(self):
    """Build the directory listing cache by walking every reference key."""
    self.dircache = {"": []}
    for path, part in self.references.items():
        # size: inline data -> its length; [url] -> unknown; [url, off, sz]
        if isinstance(part, (bytes, str)) or hasattr(part, "to_bytes"):
            size = len(part)
        elif len(part) == 1:
            size = None
        else:
            _, _, size = part

        parent = path.rsplit("/", 1)[0] if "/" in path else ""
        walker = parent
        lineage = [walker]
        while walker and walker not in self.dircache:
            # collect parent directories not yet registered
            walker = self._parent(walker)
            lineage.append(walker)

        lineage.reverse()
        for anc, child in zip(lineage, lineage[1:]):
            # register newly discovered directories
            assert child not in self.dircache
            assert anc in self.dircache
            self.dircache[anc].append(
                {"name": child, "type": "directory", "size": 0}
            )
            self.dircache[child] = []

        self.dircache[parent].append({"name": path, "type": "file", "size": size})
|
| 1113 |
+
|
| 1114 |
+
def _open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
    part_or_url, start0, end0 = self._cat_common(path)
    # This logic is kept outside `ReferenceFile` to avoid unnecessary
    # redirection; the cost is that `_cat_common` runs twice when we do
    # fall through to `ReferenceFile`.
    if isinstance(part_or_url, bytes):
        # inline data: serve straight from memory
        return io.BytesIO(part_or_url[start0:end0])

    protocol, _ = split_protocol(part_or_url)
    if start0 is None and end0 is None:
        # whole-file reference: open directly on the target filesystem
        return self.fss[protocol]._open(
            part_or_url,
            mode,
            block_size=block_size,
            cache_options=cache_options,
            **kwargs,
        )

    # byte-range reference: wrap in a ReferenceFile
    return ReferenceFile(
        self,
        path,
        mode,
        block_size=block_size,
        cache_options=cache_options,
        **kwargs,
    )
|
| 1139 |
+
|
| 1140 |
+
def ls(self, path, detail=True, **kwargs):
    """List a directory from the reference set (cached after first build)."""
    logger.debug("list %s", path)
    path = self._strip_protocol(path)
    if isinstance(self.references, LazyReferenceMapper):
        # lazy backend knows how to list itself
        try:
            return self.references.ls(path, detail)
        except KeyError:
            pass
        raise FileNotFoundError(f"'{path}' is not a known key")
    if not self.dircache:
        self._dircache_from_items()
    listing = self._ls_from_cache(path)
    if listing is None:
        raise FileNotFoundError(path)
    return listing if detail else [entry["name"] for entry in listing]
|
| 1157 |
+
|
| 1158 |
+
def exists(self, path, **kwargs):  # overwrite auto-sync version
    return self.isdir(path) or self.isfile(path)

def isdir(self, path):  # overwrite auto-sync version
    if self.dircache:
        return path in self.dircache
    if isinstance(self.references, LazyReferenceMapper):
        return path in self.references.listdir()
    # this may be faster than building dircache for single calls, but
    # by looping will be slow for many calls; could cache it?
    return any(key.startswith(f"{path}/") for key in self.references)

def isfile(self, path):  # overwrite auto-sync version
    return path in self.references
|
| 1173 |
+
|
| 1174 |
+
async def _ls(self, path, detail=True, **kwargs):  # calls fast sync code
    return self.ls(path, detail, **kwargs)

def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
    """List all file keys; falls back to the generic walk when dirs wanted."""
    if withdirs:
        return super().find(
            path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs
        )
    if path:
        path = self._strip_protocol(path)
        keys = sorted(k for k in self.references if k.startswith(path))
    else:
        keys = sorted(self.references)
    if not detail:
        return keys
    if not self.dircache:
        self._dircache_from_items()
    return {k: self._ls_from_cache(k)[0] for k in keys}
|
| 1193 |
+
|
| 1194 |
+
def info(self, path, **kwargs):
    """Describe one key: inline data, range reference, or directory."""
    ref = self.references.get(path)
    if ref is not None:
        if isinstance(ref, (str, bytes)):
            # decode base64 here
            return {"name": path, "type": "file", "size": len(ref)}
        elif len(ref) > 1:
            # [url, offset, size] -> size is known
            return {"name": path, "type": "file", "size": ref[2]}
        else:
            # whole-file reference; size unknown until we ask the target FS
            candidates = [{"name": path, "type": "file", "size": None}]
    else:
        listing = self.ls(path, True)
        candidates = [o for o in listing if o["name"] == path]
        if not candidates:
            return {"name": path, "type": "directory", "size": 0}
    if candidates[0]["size"] is None:
        # if this is a whole remote file, update size using remote FS
        prot, _ = split_protocol(self.references[path][0])
        candidates[0]["size"] = self.fss[prot].size(self.references[path][0])
    return candidates[0]

async def _info(self, path, **kwargs):  # calls fast sync code
    return self.info(path)
|
| 1217 |
+
|
| 1218 |
+
async def _rm_file(self, path, **kwargs):
    self.references.pop(
        path, None
    )  # ignores FileNotFound, just as well for directories
    self.dircache.clear()  # this is a bit heavy handed

async def _pipe_file(self, path, data, mode="overwrite", **kwargs):
    if mode == "create" and self.exists(path):
        raise FileExistsError
    # can be str or bytes
    self.references[path] = data
    self.dircache.clear()  # this is a bit heavy handed

async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs):
    # puts binary
    if mode == "create" and self.exists(rpath):
        raise FileExistsError
    with open(lpath, "rb") as f:
        self.references[rpath] = f.read()
    self.dircache.clear()  # this is a bit heavy handed
|
| 1238 |
+
|
| 1239 |
+
def save_json(self, url, **storage_options):
|
| 1240 |
+
"""Write modified references into new location"""
|
| 1241 |
+
out = {}
|
| 1242 |
+
for k, v in self.references.items():
|
| 1243 |
+
if isinstance(v, bytes):
|
| 1244 |
+
try:
|
| 1245 |
+
out[k] = v.decode("ascii")
|
| 1246 |
+
except UnicodeDecodeError:
|
| 1247 |
+
out[k] = (b"base64:" + base64.b64encode(v)).decode()
|
| 1248 |
+
else:
|
| 1249 |
+
out[k] = v
|
| 1250 |
+
with fsspec.open(url, "wb", **storage_options) as f:
|
| 1251 |
+
f.write(json.dumps({"version": 1, "refs": out}).encode())
|
| 1252 |
+
|
| 1253 |
+
|
| 1254 |
+
class ReferenceFile(AbstractBufferedFile):
    """File-like view of one reference: a byte range within a source file.

    The source filesystem/path and the (start, end) byte window are resolved
    once from the owning ReferenceFileSystem; the underlying file object is
    opened lazily and reopened transparently if it has been closed.
    """

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        size=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        fs: ReferenceFileSystem
            Owning filesystem; supplies ``_cat_common`` and the ``fss`` map.
        path: str
            Reference key being opened.
        size: int or None
            Known size of the logical file, if available.
        Remaining parameters are passed through to AbstractBufferedFile.
        """
        super().__init__(
            fs,
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            size=size,
            cache_type=cache_type,
            cache_options=cache_options,
            **kwargs,
        )
        # resolve this reference to its target URL and byte window
        part_or_url, self.start, self.end = self.fs._cat_common(self.path)
        protocol, _ = split_protocol(part_or_url)
        # the filesystem instance that actually holds the bytes
        self.src_fs = self.fs.fss[protocol]
        self.src_path = part_or_url
        # underlying file handle, opened lazily by the ``f`` property
        self._f = None

    @property
    def f(self):
        """The underlying source file object, (re)opened on demand."""
        if self._f is None or self._f.closed:
            self._f = self.src_fs._open(
                self.src_path,
                mode=self.mode,
                block_size=self.blocksize,
                autocommit=self.autocommit,
                # caching happens at this (outer) layer, not in the source file
                cache_type="none",
                **self.kwargs,
            )
        return self._f

    def close(self):
        """Close the lazily-opened source handle, then this wrapper."""
        if self._f is not None:
            self._f.close()
        return super().close()

    def _fetch_range(self, start, end):
        """Read bytes [start, end) of the logical file from the source window."""
        # translate logical offsets into absolute offsets in the source file,
        # clamping the end to this reference's window
        start = start + self.start
        end = min(end + self.start, self.end)
        self.f.seek(start)
        return self.f.read(end - start)
|
.venv/lib/python3.11/site-packages/fsspec/implementations/sftp.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import types
|
| 5 |
+
import uuid
|
| 6 |
+
from stat import S_ISDIR, S_ISLNK
|
| 7 |
+
|
| 8 |
+
import paramiko
|
| 9 |
+
|
| 10 |
+
from .. import AbstractFileSystem
|
| 11 |
+
from ..utils import infer_storage_options
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger("fsspec.sftp")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class SFTPFileSystem(AbstractFileSystem):
    """Files over SFTP/SSH

    Peer-to-peer filesystem over SSH using paramiko.

    Note: if using this with the ``open`` or ``open_files``, with full URLs,
    there is no way to tell if a path is relative, so all paths are assumed
    to be absolute.
    """

    protocol = "sftp", "ssh"

    def __init__(self, host, **ssh_kwargs):
        """

        Parameters
        ----------
        host: str
            Hostname or IP as a string
        temppath: str
            Location on the server to put files, when within a transaction
        ssh_kwargs: dict
            Parameters passed on to connection. See details in
            https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
            May include port, username, password...
        """
        # instance caching: a second construction with the same token is a no-op
        if self._cached:
            return
        # NOTE(review): temppath is popped only after super().__init__ has
        # already received it inside ssh_kwargs — confirm this is intended
        super().__init__(**ssh_kwargs)
        self.temppath = ssh_kwargs.pop("temppath", "/tmp")  # remote temp directory
        self.host = host
        self.ssh_kwargs = ssh_kwargs
        self._connect()

    def _connect(self):
        """Open the SSH connection and its SFTP channel (``self.ftp``)."""
        logger.debug("Connecting to SFTP server %s", self.host)
        self.client = paramiko.SSHClient()
        # unknown host keys are accepted automatically
        self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.client.connect(self.host, **self.ssh_kwargs)
        self.ftp = self.client.open_sftp()

    @classmethod
    def _strip_protocol(cls, path):
        """Reduce a full URL to the bare remote path."""
        return infer_storage_options(path)["path"]

    @staticmethod
    def _get_kwargs_from_urls(urlpath):
        """Extract connection kwargs (host, port, username, ...) from a URL."""
        out = infer_storage_options(urlpath)
        # path and protocol are not connection parameters
        out.pop("path", None)
        out.pop("protocol", None)
        return out

    def mkdir(self, path, create_parents=True, mode=511):
        """Create a directory; with ``create_parents`` missing parents are made too.

        NOTE(review): when ``create_parents`` is True, ``mode`` is not passed
        on to ``makedirs`` — confirm whether that is intended.
        """
        logger.debug("Creating folder %s", path)
        if self.exists(path):
            raise FileExistsError(f"File exists: {path}")

        if create_parents:
            self.makedirs(path)
        else:
            self.ftp.mkdir(path, mode)

    def makedirs(self, path, exist_ok=False, mode=511):
        """Recursively create *path*, making each missing component in turn."""
        if self.exists(path) and not exist_ok:
            raise FileExistsError(f"File exists: {path}")

        parts = path.split("/")
        # seed with "/" for absolute paths so components stay rooted
        new_path = "/" if path[:1] == "/" else ""

        for part in parts:
            if part:
                new_path = f"{new_path}/{part}" if new_path else part
                if not self.exists(new_path):
                    self.ftp.mkdir(new_path, mode)

    def rmdir(self, path):
        """Remove an (empty) remote directory."""
        logger.debug("Removing folder %s", path)
        self.ftp.rmdir(path)

    def info(self, path):
        """Stat a single remote path and return a details dict."""
        stat = self._decode_stat(self.ftp.stat(path))
        stat["name"] = path
        return stat

    @staticmethod
    def _decode_stat(stat, parent_path=None):
        """Convert a paramiko stat result into an fsspec-style info dict.

        ``parent_path`` is given for listings, where ``stat.filename`` is
        joined onto it; for a direct stat the caller fills in ``name``.
        """
        if S_ISDIR(stat.st_mode):
            t = "directory"
        elif S_ISLNK(stat.st_mode):
            t = "link"
        else:
            t = "file"
        out = {
            "name": "",
            "size": stat.st_size,
            "type": t,
            "uid": stat.st_uid,
            "gid": stat.st_gid,
            # server timestamps are exposed as timezone-aware UTC datetimes
            "time": datetime.datetime.fromtimestamp(
                stat.st_atime, tz=datetime.timezone.utc
            ),
            "mtime": datetime.datetime.fromtimestamp(
                stat.st_mtime, tz=datetime.timezone.utc
            ),
        }
        if parent_path:
            out["name"] = "/".join([parent_path.rstrip("/"), stat.filename])
        return out

    def ls(self, path, detail=False):
        """List a remote directory; detail dicts or sorted names."""
        logger.debug("Listing folder %s", path)
        stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
        if detail:
            return stats
        else:
            paths = [stat["name"] for stat in stats]
            return sorted(paths)

    def put(self, lpath, rpath, callback=None, **kwargs):
        """Upload local file *lpath* to remote *rpath* (callback is ignored)."""
        logger.debug("Put file %s into %s", lpath, rpath)
        self.ftp.put(lpath, rpath)

    def get_file(self, rpath, lpath, **kwargs):
        """Download a remote file; for a remote directory, only create *lpath* locally."""
        if self.isdir(rpath):
            os.makedirs(lpath, exist_ok=True)
        else:
            self.ftp.get(self._strip_protocol(rpath), lpath)

    def _open(self, path, mode="rb", block_size=None, **kwargs):
        """
        block_size: int or None
            If 0, no buffering, if 1, line buffering, if >1, buffer that many
            bytes, if None use default from paramiko.
        """
        logger.debug("Opening file %s", path)
        if kwargs.get("autocommit", True) is False:
            # writes to temporary file, move on commit
            path2 = "/".join([self.temppath, str(uuid.uuid4())])
            f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
            # attach transaction state and bound commit/discard methods
            f.temppath = path2
            f.targetpath = path
            f.fs = self
            f.commit = types.MethodType(commit_a_file, f)
            f.discard = types.MethodType(discard_a_file, f)
        else:
            f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
        return f

    def _rm(self, path):
        """Remove a single remote file or (empty) directory."""
        if self.isdir(path):
            self.ftp.rmdir(path)
        else:
            self.ftp.remove(path)

    def mv(self, old, new):
        """Rename *old* to *new* atomically via POSIX rename."""
        logger.debug("Renaming %s into %s", old, new)
        self.ftp.posix_rename(old, new)
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def commit_a_file(self):
    """Finalize a transactional write: move the temp upload to its target.

    Bound onto file objects as their ``commit`` method by
    ``SFTPFileSystem._open`` when ``autocommit=False``.
    """
    owner = self.fs
    owner.mv(self.temppath, self.targetpath)
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def discard_a_file(self):
    """Abort a transactional write: delete the temporary upload.

    Bound onto file objects as their ``discard`` method by
    ``SFTPFileSystem._open`` when ``autocommit=False``.
    """
    owner = self.fs
    owner._rm(self.temppath)
|
.venv/lib/python3.11/site-packages/fsspec/implementations/tar.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import tarfile
|
| 3 |
+
|
| 4 |
+
import fsspec
|
| 5 |
+
from fsspec.archive import AbstractArchiveFileSystem
|
| 6 |
+
from fsspec.compression import compr
|
| 7 |
+
from fsspec.utils import infer_compression
|
| 8 |
+
|
| 9 |
+
typemap = {b"0": "file", b"5": "directory"}
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger("tar")
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class TarFileSystem(AbstractArchiveFileSystem):
    """Compressed Tar archives as a file-system (read-only)

    Supports the following formats:
    tar.gz, tar.bz2, tar.xz
    """

    root_marker = ""
    protocol = "tar"
    # every archive is distinct; never reuse instances from the cache
    cachable = False

    def __init__(
        self,
        fo="",
        index_store=None,
        target_options=None,
        target_protocol=None,
        compression=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        fo: str or file-like
            URL of the archive (opened via fsspec with ``target_protocol`` /
            ``target_options``) or an already-open file-like object.
        index_store: not yet used (see TODOs below)
        compression: str or None
            Compression codec key into ``fsspec.compression.compr``; if None,
            it is inferred from the archive's file name when possible.
        """
        super().__init__(**kwargs)
        target_options = target_options or {}

        if isinstance(fo, str):
            self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
            fo = self.of.open()  # keep the reference

        # Try to infer compression.
        if compression is None:
            name = None

            # Try different ways to get hold of the filename. `fo` might either
            # be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
            # `fsspec.AbstractFileSystem` instance.
            try:
                # Amended io.BufferedReader or similar.
                # This uses a "protocol extension" where original filenames are
                # propagated to archive-like filesystems in order to let them
                # infer the right compression appropriately.
                if hasattr(fo, "original"):
                    name = fo.original

                # fsspec.LocalFileOpener
                elif hasattr(fo, "path"):
                    name = fo.path

                # io.BufferedReader
                elif hasattr(fo, "name"):
                    name = fo.name

                # fsspec.AbstractFileSystem
                elif hasattr(fo, "info"):
                    name = fo.info()["name"]

            except Exception as ex:
                # best effort: a nameless stream simply gets no inference
                logger.warning(
                    f"Unable to determine file name, not inferring compression: {ex}"
                )

            if name is not None:
                compression = infer_compression(name)
                logger.info(f"Inferred compression {compression} from file name {name}")

        if compression is not None:
            # TODO: tarfile already implements compression with modes like "'r:gz'",
            # but then would seek to offset in the file work?
            fo = compr[compression](fo)

        self._fo_ref = fo
        self.fo = fo  # the whole instance is a context
        self.tar = tarfile.TarFile(fileobj=self.fo)
        self.dir_cache = None

        self.index_store = index_store
        self.index = None
        self._index()

    def _index(self):
        """Build ``self.index``: member name -> (info dict, data offset)."""
        # TODO: load and set saved index, if exists
        out = {}
        for ti in self.tar:
            info = ti.get_info()
            # map tar typeflags to fsspec types; anything unknown is a file
            info["type"] = typemap.get(info["type"], "file")
            name = ti.get_info()["name"].rstrip("/")
            out[name] = (info, ti.offset_data)

        self.index = out
        # TODO: save index to self.index_store here, if set

    def _get_dirs(self):
        """Populate ``self.dir_cache`` once with all directories and members."""
        if self.dir_cache is not None:
            return

        # This enables ls to get directories as children as well as files
        self.dir_cache = {
            dirname: {"name": dirname, "size": 0, "type": "directory"}
            for dirname in self._all_dirnames(self.tar.getnames())
        }
        # real member entries overwrite the synthesized directory stubs
        for member in self.tar.getmembers():
            info = member.get_info()
            info["name"] = info["name"].rstrip("/")
            info["type"] = typemap.get(info["type"], "file")
            self.dir_cache[info["name"]] = info

    def _open(self, path, mode="rb", **kwargs):
        """Return a file-like for a regular member; read-only, binary only."""
        if mode != "rb":
            raise ValueError("Read-only filesystem implementation")
        details, offset = self.index[path]
        if details["type"] != "file":
            raise ValueError("Can only handle regular files")
        return self.tar.extractfile(path)
|
.venv/lib/python3.11/site-packages/fsspec/implementations/webhdfs.py
ADDED
|
@@ -0,0 +1,485 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
import secrets
|
| 6 |
+
import shutil
|
| 7 |
+
import tempfile
|
| 8 |
+
import uuid
|
| 9 |
+
from contextlib import suppress
|
| 10 |
+
from urllib.parse import quote
|
| 11 |
+
|
| 12 |
+
import requests
|
| 13 |
+
|
| 14 |
+
from ..spec import AbstractBufferedFile, AbstractFileSystem
|
| 15 |
+
from ..utils import infer_storage_options, tokenize
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger("webhdfs")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class WebHDFS(AbstractFileSystem):
|
| 21 |
+
"""
|
| 22 |
+
Interface to HDFS over HTTP using the WebHDFS API. Supports also HttpFS gateways.
|
| 23 |
+
|
| 24 |
+
Four auth mechanisms are supported:
|
| 25 |
+
|
| 26 |
+
insecure: no auth is done, and the user is assumed to be whoever they
|
| 27 |
+
say they are (parameter ``user``), or a predefined value such as
|
| 28 |
+
"dr.who" if not given
|
| 29 |
+
spnego: when kerberos authentication is enabled, auth is negotiated by
|
| 30 |
+
requests_kerberos https://github.com/requests/requests-kerberos .
|
| 31 |
+
This establishes a session based on existing kinit login and/or
|
| 32 |
+
specified principal/password; parameters are passed with ``kerb_kwargs``
|
| 33 |
+
token: uses an existing Hadoop delegation token from another secured
|
| 34 |
+
service. Indeed, this client can also generate such tokens when
|
| 35 |
+
not insecure. Note that tokens expire, but can be renewed (by a
|
| 36 |
+
previously specified user) and may allow for proxying.
|
| 37 |
+
basic-auth: used when both parameter ``user`` and parameter ``password``
|
| 38 |
+
are provided.
|
| 39 |
+
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
tempdir = str(tempfile.gettempdir())
|
| 43 |
+
protocol = "webhdfs", "webHDFS"
|
| 44 |
+
|
| 45 |
+
def __init__(
|
| 46 |
+
self,
|
| 47 |
+
host,
|
| 48 |
+
port=50070,
|
| 49 |
+
kerberos=False,
|
| 50 |
+
token=None,
|
| 51 |
+
user=None,
|
| 52 |
+
password=None,
|
| 53 |
+
proxy_to=None,
|
| 54 |
+
kerb_kwargs=None,
|
| 55 |
+
data_proxy=None,
|
| 56 |
+
use_https=False,
|
| 57 |
+
session_cert=None,
|
| 58 |
+
session_verify=True,
|
| 59 |
+
**kwargs,
|
| 60 |
+
):
|
| 61 |
+
"""
|
| 62 |
+
Parameters
|
| 63 |
+
----------
|
| 64 |
+
host: str
|
| 65 |
+
Name-node address
|
| 66 |
+
port: int
|
| 67 |
+
Port for webHDFS
|
| 68 |
+
kerberos: bool
|
| 69 |
+
Whether to authenticate with kerberos for this connection
|
| 70 |
+
token: str or None
|
| 71 |
+
If given, use this token on every call to authenticate. A user
|
| 72 |
+
and user-proxy may be encoded in the token and should not be also
|
| 73 |
+
given
|
| 74 |
+
user: str or None
|
| 75 |
+
If given, assert the user name to connect with
|
| 76 |
+
password: str or None
|
| 77 |
+
If given, assert the password to use for basic auth. If password
|
| 78 |
+
is provided, user must be provided also
|
| 79 |
+
proxy_to: str or None
|
| 80 |
+
If given, the user has the authority to proxy, and this value is
|
| 81 |
+
the user in who's name actions are taken
|
| 82 |
+
kerb_kwargs: dict
|
| 83 |
+
Any extra arguments for HTTPKerberosAuth, see
|
| 84 |
+
`<https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py>`_
|
| 85 |
+
data_proxy: dict, callable or None
|
| 86 |
+
If given, map data-node addresses. This can be necessary if the
|
| 87 |
+
HDFS cluster is behind a proxy, running on Docker or otherwise has
|
| 88 |
+
a mismatch between the host-names given by the name-node and the
|
| 89 |
+
address by which to refer to them from the client. If a dict,
|
| 90 |
+
maps host names ``host->data_proxy[host]``; if a callable, full
|
| 91 |
+
URLs are passed, and function must conform to
|
| 92 |
+
``url->data_proxy(url)``.
|
| 93 |
+
use_https: bool
|
| 94 |
+
Whether to connect to the Name-node using HTTPS instead of HTTP
|
| 95 |
+
session_cert: str or Tuple[str, str] or None
|
| 96 |
+
Path to a certificate file, or tuple of (cert, key) files to use
|
| 97 |
+
for the requests.Session
|
| 98 |
+
session_verify: str, bool or None
|
| 99 |
+
Path to a certificate file to use for verifying the requests.Session.
|
| 100 |
+
kwargs
|
| 101 |
+
"""
|
| 102 |
+
if self._cached:
|
| 103 |
+
return
|
| 104 |
+
super().__init__(**kwargs)
|
| 105 |
+
self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"
|
| 106 |
+
self.kerb = kerberos
|
| 107 |
+
self.kerb_kwargs = kerb_kwargs or {}
|
| 108 |
+
self.pars = {}
|
| 109 |
+
self.proxy = data_proxy or {}
|
| 110 |
+
if token is not None:
|
| 111 |
+
if user is not None or proxy_to is not None:
|
| 112 |
+
raise ValueError(
|
| 113 |
+
"If passing a delegation token, must not set "
|
| 114 |
+
"user or proxy_to, as these are encoded in the"
|
| 115 |
+
" token"
|
| 116 |
+
)
|
| 117 |
+
self.pars["delegation"] = token
|
| 118 |
+
self.user = user
|
| 119 |
+
self.password = password
|
| 120 |
+
|
| 121 |
+
if password is not None:
|
| 122 |
+
if user is None:
|
| 123 |
+
raise ValueError(
|
| 124 |
+
"If passing a password, the user must also be"
|
| 125 |
+
"set in order to set up the basic-auth"
|
| 126 |
+
)
|
| 127 |
+
else:
|
| 128 |
+
if user is not None:
|
| 129 |
+
self.pars["user.name"] = user
|
| 130 |
+
|
| 131 |
+
if proxy_to is not None:
|
| 132 |
+
self.pars["doas"] = proxy_to
|
| 133 |
+
if kerberos and user is not None:
|
| 134 |
+
raise ValueError(
|
| 135 |
+
"If using Kerberos auth, do not specify the "
|
| 136 |
+
"user, this is handled by kinit."
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
self.session_cert = session_cert
|
| 140 |
+
self.session_verify = session_verify
|
| 141 |
+
|
| 142 |
+
self._connect()
|
| 143 |
+
|
| 144 |
+
self._fsid = f"webhdfs_{tokenize(host, port)}"
|
| 145 |
+
|
| 146 |
+
@property
|
| 147 |
+
def fsid(self):
|
| 148 |
+
return self._fsid
|
| 149 |
+
|
| 150 |
+
def _connect(self):
|
| 151 |
+
self.session = requests.Session()
|
| 152 |
+
|
| 153 |
+
if self.session_cert:
|
| 154 |
+
self.session.cert = self.session_cert
|
| 155 |
+
|
| 156 |
+
self.session.verify = self.session_verify
|
| 157 |
+
|
| 158 |
+
if self.kerb:
|
| 159 |
+
from requests_kerberos import HTTPKerberosAuth
|
| 160 |
+
|
| 161 |
+
self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)
|
| 162 |
+
|
| 163 |
+
if self.user is not None and self.password is not None:
|
| 164 |
+
from requests.auth import HTTPBasicAuth
|
| 165 |
+
|
| 166 |
+
self.session.auth = HTTPBasicAuth(self.user, self.password)
|
| 167 |
+
|
| 168 |
+
def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
|
| 169 |
+
path = self._strip_protocol(path) if path is not None else ""
|
| 170 |
+
url = self._apply_proxy(self.url + quote(path, safe="/="))
|
| 171 |
+
args = kwargs.copy()
|
| 172 |
+
args.update(self.pars)
|
| 173 |
+
args["op"] = op.upper()
|
| 174 |
+
logger.debug("sending %s with %s", url, method)
|
| 175 |
+
out = self.session.request(
|
| 176 |
+
method=method.upper(),
|
| 177 |
+
url=url,
|
| 178 |
+
params=args,
|
| 179 |
+
data=data,
|
| 180 |
+
allow_redirects=redirect,
|
| 181 |
+
)
|
| 182 |
+
if out.status_code in [400, 401, 403, 404, 500]:
|
| 183 |
+
try:
|
| 184 |
+
err = out.json()
|
| 185 |
+
msg = err["RemoteException"]["message"]
|
| 186 |
+
exp = err["RemoteException"]["exception"]
|
| 187 |
+
except (ValueError, KeyError):
|
| 188 |
+
pass
|
| 189 |
+
else:
|
| 190 |
+
if exp in ["IllegalArgumentException", "UnsupportedOperationException"]:
|
| 191 |
+
raise ValueError(msg)
|
| 192 |
+
elif exp in ["SecurityException", "AccessControlException"]:
|
| 193 |
+
raise PermissionError(msg)
|
| 194 |
+
elif exp in ["FileNotFoundException"]:
|
| 195 |
+
raise FileNotFoundError(msg)
|
| 196 |
+
else:
|
| 197 |
+
raise RuntimeError(msg)
|
| 198 |
+
out.raise_for_status()
|
| 199 |
+
return out
|
| 200 |
+
|
| 201 |
+
def _open(
|
| 202 |
+
self,
|
| 203 |
+
path,
|
| 204 |
+
mode="rb",
|
| 205 |
+
block_size=None,
|
| 206 |
+
autocommit=True,
|
| 207 |
+
replication=None,
|
| 208 |
+
permissions=None,
|
| 209 |
+
**kwargs,
|
| 210 |
+
):
|
| 211 |
+
"""
|
| 212 |
+
|
| 213 |
+
Parameters
|
| 214 |
+
----------
|
| 215 |
+
path: str
|
| 216 |
+
File location
|
| 217 |
+
mode: str
|
| 218 |
+
'rb', 'wb', etc.
|
| 219 |
+
block_size: int
|
| 220 |
+
Client buffer size for read-ahead or write buffer
|
| 221 |
+
autocommit: bool
|
| 222 |
+
If False, writes to temporary file that only gets put in final
|
| 223 |
+
location upon commit
|
| 224 |
+
replication: int
|
| 225 |
+
Number of copies of file on the cluster, write mode only
|
| 226 |
+
permissions: str or int
|
| 227 |
+
posix permissions, write mode only
|
| 228 |
+
kwargs
|
| 229 |
+
|
| 230 |
+
Returns
|
| 231 |
+
-------
|
| 232 |
+
WebHDFile instance
|
| 233 |
+
"""
|
| 234 |
+
block_size = block_size or self.blocksize
|
| 235 |
+
return WebHDFile(
|
| 236 |
+
self,
|
| 237 |
+
path,
|
| 238 |
+
mode=mode,
|
| 239 |
+
block_size=block_size,
|
| 240 |
+
tempdir=self.tempdir,
|
| 241 |
+
autocommit=autocommit,
|
| 242 |
+
replication=replication,
|
| 243 |
+
permissions=permissions,
|
| 244 |
+
)
|
| 245 |
+
|
| 246 |
+
@staticmethod
|
| 247 |
+
def _process_info(info):
|
| 248 |
+
info["type"] = info["type"].lower()
|
| 249 |
+
info["size"] = info["length"]
|
| 250 |
+
return info
|
| 251 |
+
|
| 252 |
+
@classmethod
|
| 253 |
+
def _strip_protocol(cls, path):
|
| 254 |
+
return infer_storage_options(path)["path"]
|
| 255 |
+
|
| 256 |
+
@staticmethod
|
| 257 |
+
def _get_kwargs_from_urls(urlpath):
|
| 258 |
+
out = infer_storage_options(urlpath)
|
| 259 |
+
out.pop("path", None)
|
| 260 |
+
out.pop("protocol", None)
|
| 261 |
+
if "username" in out:
|
| 262 |
+
out["user"] = out.pop("username")
|
| 263 |
+
return out
|
| 264 |
+
|
| 265 |
+
def info(self, path):
|
| 266 |
+
out = self._call("GETFILESTATUS", path=path)
|
| 267 |
+
info = out.json()["FileStatus"]
|
| 268 |
+
info["name"] = path
|
| 269 |
+
return self._process_info(info)
|
| 270 |
+
|
| 271 |
+
def ls(self, path, detail=False):
|
| 272 |
+
out = self._call("LISTSTATUS", path=path)
|
| 273 |
+
infos = out.json()["FileStatuses"]["FileStatus"]
|
| 274 |
+
for info in infos:
|
| 275 |
+
self._process_info(info)
|
| 276 |
+
info["name"] = path.rstrip("/") + "/" + info["pathSuffix"]
|
| 277 |
+
if detail:
|
| 278 |
+
return sorted(infos, key=lambda i: i["name"])
|
| 279 |
+
else:
|
| 280 |
+
return sorted(info["name"] for info in infos)
|
| 281 |
+
|
| 282 |
+
def content_summary(self, path):
|
| 283 |
+
"""Total numbers of files, directories and bytes under path"""
|
| 284 |
+
out = self._call("GETCONTENTSUMMARY", path=path)
|
| 285 |
+
return out.json()["ContentSummary"]
|
| 286 |
+
|
| 287 |
+
def ukey(self, path):
|
| 288 |
+
"""Checksum info of file, giving method and result"""
|
| 289 |
+
out = self._call("GETFILECHECKSUM", path=path, redirect=False)
|
| 290 |
+
if "Location" in out.headers:
|
| 291 |
+
location = self._apply_proxy(out.headers["Location"])
|
| 292 |
+
out2 = self.session.get(location)
|
| 293 |
+
out2.raise_for_status()
|
| 294 |
+
return out2.json()["FileChecksum"]
|
| 295 |
+
else:
|
| 296 |
+
out.raise_for_status()
|
| 297 |
+
return out.json()["FileChecksum"]
|
| 298 |
+
|
| 299 |
+
def home_directory(self):
|
| 300 |
+
"""Get user's home directory"""
|
| 301 |
+
out = self._call("GETHOMEDIRECTORY")
|
| 302 |
+
return out.json()["Path"]
|
| 303 |
+
|
| 304 |
+
def get_delegation_token(self, renewer=None):
|
| 305 |
+
"""Retrieve token which can give the same authority to other uses
|
| 306 |
+
|
| 307 |
+
Parameters
|
| 308 |
+
----------
|
| 309 |
+
renewer: str or None
|
| 310 |
+
User who may use this token; if None, will be current user
|
| 311 |
+
"""
|
| 312 |
+
if renewer:
|
| 313 |
+
out = self._call("GETDELEGATIONTOKEN", renewer=renewer)
|
| 314 |
+
else:
|
| 315 |
+
out = self._call("GETDELEGATIONTOKEN")
|
| 316 |
+
t = out.json()["Token"]
|
| 317 |
+
if t is None:
|
| 318 |
+
raise ValueError("No token available for this user/security context")
|
| 319 |
+
return t["urlString"]
|
| 320 |
+
|
| 321 |
+
def renew_delegation_token(self, token):
|
| 322 |
+
"""Make token live longer. Returns new expiry time"""
|
| 323 |
+
out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
|
| 324 |
+
return out.json()["long"]
|
| 325 |
+
|
| 326 |
+
def cancel_delegation_token(self, token):
|
| 327 |
+
"""Stop the token from being useful"""
|
| 328 |
+
self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
|
| 329 |
+
|
| 330 |
+
def chmod(self, path, mod):
|
| 331 |
+
"""Set the permission at path
|
| 332 |
+
|
| 333 |
+
Parameters
|
| 334 |
+
----------
|
| 335 |
+
path: str
|
| 336 |
+
location to set (file or directory)
|
| 337 |
+
mod: str or int
|
| 338 |
+
posix epresentation or permission, give as oct string, e.g, '777'
|
| 339 |
+
or 0o777
|
| 340 |
+
"""
|
| 341 |
+
self._call("SETPERMISSION", method="put", path=path, permission=mod)
|
| 342 |
+
|
| 343 |
+
def chown(self, path, owner=None, group=None):
    """Change owning user and/or group"""
    # Only include the fields that were actually provided.
    updates = {
        name: value
        for name, value in (("owner", owner), ("group", group))
        if value is not None
    }
    self._call("SETOWNER", method="put", path=path, **updates)
|
| 351 |
+
|
| 352 |
+
def set_replication(self, path, replication):
    """
    Set file replication factor

    Parameters
    ----------
    path: str
        File location (not for directories)
    replication: int
        Number of copies of file on the cluster. Should be smaller than
        number of data nodes; normally 3 on most systems.
    """
    # SETREPLICATION is a PUT operation in the WebHDFS REST API.
    self._call("SETREPLICATION", method="put", path=path, replication=replication)
|
| 365 |
+
|
| 366 |
+
def mkdir(self, path, **kwargs):
    """Create a directory at ``path``.

    Extra keyword arguments are accepted for interface compatibility
    but are not forwarded to the server.
    """
    self._call("MKDIRS", method="put", path=path)
|
| 368 |
+
|
| 369 |
+
def makedirs(self, path, exist_ok=False):
    """Create a directory tree, optionally tolerating a pre-existing path."""
    if not exist_ok and self.exists(path):
        raise FileExistsError(path)
    self.mkdir(path)
|
| 373 |
+
|
| 374 |
+
def mv(self, path1, path2, **kwargs):
    """Rename ``path1`` to ``path2`` server-side via the RENAME operation."""
    self._call("RENAME", method="put", path=path1, destination=path2)
|
| 376 |
+
|
| 377 |
+
def rm(self, path, recursive=False, **kwargs):
    """Delete ``path``; pass ``recursive=True`` to remove directory trees."""
    # WebHDFS expects the recursive flag as a lowercase string literal.
    flag = "true" if recursive else "false"
    self._call("DELETE", method="delete", path=path, recursive=flag)
|
| 384 |
+
|
| 385 |
+
def rm_file(self, path, **kwargs):
    """Remove a single file (non-recursive delete)."""
    self.rm(path)
|
| 387 |
+
|
| 388 |
+
def cp_file(self, lpath, rpath, **kwargs):
    # Server-side copy: stream the source into a hidden temporary file in
    # the destination's parent directory, then rename it into place.
    with self.open(lpath) as lstream:
        tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"])
        # Perform an atomic copy (stream to a temporary file and
        # move it to the actual destination).
        try:
            with self.open(tmp_fname, "wb") as rstream:
                shutil.copyfileobj(lstream, rstream)
            self.mv(tmp_fname, rpath)
        except BaseException:
            # Clean up the partial temp file on any failure (including
            # KeyboardInterrupt); ignore it having never been created.
            with suppress(FileNotFoundError):
                self.rm(tmp_fname)
            raise
|
| 401 |
+
|
| 402 |
+
def _apply_proxy(self, location):
    """Rewrite a redirect URL through the configured proxy, if any.

    ``self.proxy`` may be a callable (applied to the URL) or a mapping of
    substring replacements (each applied at most once, first occurrence).
    """
    if not self.proxy:
        return location
    if callable(self.proxy):
        return self.proxy(location)
    # proxy configured as a dict of substring replacements
    for old, new in self.proxy.items():
        location = location.replace(old, new, 1)
    return location
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
class WebHDFile(AbstractBufferedFile):
    """A file living in HDFS over webHDFS"""

    def __init__(self, fs, path, **kwargs):
        # Parameters are forwarded to AbstractBufferedFile; extras relevant
        # here are ``permissions``, ``replication``, ``tempdir`` and
        # ``autocommit``.
        super().__init__(fs, path, **kwargs)
        kwargs = kwargs.copy()
        # NOTE(review): the two pops below act on the local copy only and
        # drop explicit None values before the defaulted pops that follow.
        if kwargs.get("permissions", None) is None:
            kwargs.pop("permissions", None)
        if kwargs.get("replication", None) is None:
            kwargs.pop("replication", None)
        # 511 == 0o777, the default permission mask applied on create
        self.permissions = kwargs.pop("permissions", 511)
        tempdir = kwargs.pop("tempdir")
        if kwargs.pop("autocommit", False) is False:
            # Deferred commit: write to a unique temp path and remember the
            # real destination for commit()/discard().
            self.target = self.path
            self.path = os.path.join(tempdir, str(uuid.uuid4()))

    def _upload_chunk(self, final=False):
        """Write one part of a multi-block file upload

        Parameters
        ==========
        final: bool
            This is the last block, so should complete file, if
            self.autocommit is True.
        """
        # POST the buffered bytes to the datanode URL captured by
        # _initiate_upload; each call appends one chunk.
        out = self.fs.session.post(
            self.location,
            data=self.buffer.getvalue(),
            headers={"content-type": "application/octet-stream"},
        )
        out.raise_for_status()
        return True

    def _initiate_upload(self):
        """Create remote file/upload"""
        kwargs = self.kwargs.copy()
        if "a" in self.mode:
            op, method = "APPEND", "POST"
        else:
            op, method = "CREATE", "PUT"
            kwargs["overwrite"] = "true"
        # redirect=False so we can capture the datanode Location header
        # and route it through any configured proxy ourselves.
        out = self.fs._call(op, method, self.path, redirect=False, **kwargs)
        location = self.fs._apply_proxy(out.headers["Location"])
        if "w" in self.mode:
            # create empty file to append to
            out2 = self.fs.session.put(
                location, headers={"content-type": "application/octet-stream"}
            )
            out2.raise_for_status()
            # after creating empty file, change location to append to
            out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs)
            self.location = self.fs._apply_proxy(out2.headers["Location"])

    def _fetch_range(self, start, end):
        # Clamp the request to the known file size; empty ranges short-circuit
        # without a server round-trip.
        start = max(start, 0)
        end = min(self.size, end)
        if start >= end or start >= self.size:
            return b""
        out = self.fs._call(
            "OPEN", path=self.path, offset=start, length=end - start, redirect=False
        )
        out.raise_for_status()
        if "Location" in out.headers:
            # Namenode redirected us to a datanode; follow it (via proxy).
            location = out.headers["Location"]
            out2 = self.fs.session.get(self.fs._apply_proxy(location))
            return out2.content
        else:
            return out.content

    def commit(self):
        # Move the temp file written in non-autocommit mode to its final path.
        self.fs.mv(self.path, self.target)

    def discard(self):
        # Drop the temp file; the target path is left untouched.
        self.fs.rm(self.path)
|
.venv/lib/python3.11/site-packages/fsspec/json.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from contextlib import suppress
|
| 3 |
+
from pathlib import PurePath
|
| 4 |
+
from typing import (
|
| 5 |
+
Any,
|
| 6 |
+
Callable,
|
| 7 |
+
ClassVar,
|
| 8 |
+
Dict,
|
| 9 |
+
List,
|
| 10 |
+
Mapping,
|
| 11 |
+
Optional,
|
| 12 |
+
Sequence,
|
| 13 |
+
Tuple,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
from .registry import _import_class, get_filesystem_class
|
| 17 |
+
from .spec import AbstractFileSystem
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class FilesystemJSONEncoder(json.JSONEncoder):
    """JSON encoder that understands fsspec filesystems and pathlib paths."""

    # Subclasses may set this to False to redact credentials when
    # serializing filesystem instances.
    include_password: ClassVar[bool] = True

    def default(self, o: Any) -> Any:
        if isinstance(o, AbstractFileSystem):
            return o.to_dict(include_password=self.include_password)
        if isinstance(o, PurePath):
            path_type = type(o)
            return {
                "cls": f"{path_type.__module__}.{path_type.__name__}",
                "str": str(o),
            }
        return super().default(o)

    def make_serializable(self, obj: Any) -> Any:
        """
        Recursively converts an object so that it can be JSON serialized via
        :func:`json.dumps` and :func:`json.dump`, without actually calling
        said functions.
        """
        # Scalars pass straight through; containers recurse; anything else
        # is delegated to default() for fs/path-aware conversion.
        if isinstance(obj, (str, int, float, bool)):
            return obj
        if isinstance(obj, Mapping):
            return {key: self.make_serializable(value) for key, value in obj.items()}
        if isinstance(obj, Sequence):
            return [self.make_serializable(item) for item in obj]
        return self.default(obj)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class FilesystemJSONDecoder(json.JSONDecoder):
    """JSON decoder that reconstructs fsspec filesystems and pathlib paths
    serialized by :class:`FilesystemJSONEncoder`."""

    def __init__(
        self,
        *,
        object_hook: Optional[Callable[[Dict[str, Any]], Any]] = None,
        parse_float: Optional[Callable[[str], Any]] = None,
        parse_int: Optional[Callable[[str], Any]] = None,
        parse_constant: Optional[Callable[[str], Any]] = None,
        strict: bool = True,
        object_pairs_hook: Optional[Callable[[List[Tuple[str, Any]]], Any]] = None,
    ) -> None:
        # Keep the caller's hook so custom_object_hook can chain to it for
        # dicts that are not recognized as filesystems or paths.
        self.original_object_hook = object_hook

        super().__init__(
            object_hook=self.custom_object_hook,
            parse_float=parse_float,
            parse_int=parse_int,
            parse_constant=parse_constant,
            strict=strict,
            object_pairs_hook=object_pairs_hook,
        )

    @classmethod
    def try_resolve_path_cls(cls, dct: Dict[str, Any]):
        # Best-effort: any failure (missing key, import error, non-path
        # class) silently yields None.
        with suppress(Exception):
            fqp = dct["cls"]

            path_cls = _import_class(fqp)

            if issubclass(path_cls, PurePath):
                return path_cls

        return None

    @classmethod
    def try_resolve_fs_cls(cls, dct: Dict[str, Any]):
        # Best-effort resolution of a filesystem class from a serialized
        # dict; falls back to protocol lookup when the class path cannot be
        # imported, and returns None on any other failure.
        with suppress(Exception):
            if "cls" in dct:
                try:
                    fs_cls = _import_class(dct["cls"])
                    if issubclass(fs_cls, AbstractFileSystem):
                        return fs_cls
                except Exception:
                    if "protocol" in dct:  # Fallback if cls cannot be imported
                        return get_filesystem_class(dct["protocol"])

                    raise

        return None

    def custom_object_hook(self, dct: Dict[str, Any]):
        # NOTE(review): the walrus bindings below are used only as
        # not-None tests for the fs branch; from_dict re-resolves the class.
        if "cls" in dct:
            if (obj_cls := self.try_resolve_fs_cls(dct)) is not None:
                return AbstractFileSystem.from_dict(dct)
            if (obj_cls := self.try_resolve_path_cls(dct)) is not None:
                return obj_cls(dct["str"])

        if self.original_object_hook is not None:
            return self.original_object_hook(dct)

        return dct

    def unmake_serializable(self, obj: Any) -> Any:
        """
        Inverse function of :meth:`FilesystemJSONEncoder.make_serializable`.
        """
        # A dict may decode into a filesystem/path object; only recurse into
        # it if it is still a plain dict afterwards.
        if isinstance(obj, dict):
            obj = self.custom_object_hook(obj)
        if isinstance(obj, dict):
            return {k: self.unmake_serializable(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [self.unmake_serializable(v) for v in obj]

        return obj
|
.venv/lib/python3.11/site-packages/fsspec/mapping.py
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import array
|
| 2 |
+
import logging
|
| 3 |
+
import posixpath
|
| 4 |
+
import warnings
|
| 5 |
+
from collections.abc import MutableMapping
|
| 6 |
+
from functools import cached_property
|
| 7 |
+
|
| 8 |
+
from fsspec.core import url_to_fs
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger("fsspec.mapping")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class FSMap(MutableMapping):
    """Wrap a FileSystem instance as a mutable wrapping.

    The keys of the mapping become files under the given root, and the
    values (which must be bytes) the contents of those files.

    Parameters
    ----------
    root: string
        prefix for all the files
    fs: FileSystem instance
    check: bool (=True)
        performs a touch at the location, to check for write access.

    Examples
    --------
    >>> fs = FileSystem(**parameters)  # doctest: +SKIP
    >>> d = FSMap('my-data/path/', fs)  # doctest: +SKIP
    or, more likely
    >>> d = fs.get_mapper('my-data/path/')

    >>> d['loc1'] = b'Hello World'  # doctest: +SKIP
    >>> list(d.keys())  # doctest: +SKIP
    ['loc1']
    >>> d['loc1']  # doctest: +SKIP
    b'Hello World'
    """

    def __init__(self, root, fs, check=False, create=False, missing_exceptions=None):
        self.fs = fs
        self.root = fs._strip_protocol(root)
        # Precompute "root/" with protocol stripped, so key paths can be
        # built by simple string concatenation in _key_to_str.
        self._root_key_to_str = fs._strip_protocol(posixpath.join(root, "x"))[:-1]
        if missing_exceptions is None:
            missing_exceptions = (
                FileNotFoundError,
                IsADirectoryError,
                NotADirectoryError,
            )
        self.missing_exceptions = missing_exceptions
        self.check = check
        self.create = create
        if create:
            if not self.fs.exists(root):
                self.fs.mkdir(root)
        if check:
            if not self.fs.exists(root):
                raise ValueError(
                    f"Path {root} does not exist. Create "
                    f" with the ``create=True`` keyword"
                )
            # verify write access by creating and removing a probe file
            self.fs.touch(root + "/a")
            self.fs.rm(root + "/a")

    @cached_property
    def dirfs(self):
        """dirfs instance that can be used with the same keys as the mapper"""
        # imported lazily to avoid a circular import at module load time
        from .implementations.dirfs import DirFileSystem

        return DirFileSystem(path=self._root_key_to_str, fs=self.fs)

    def clear(self):
        """Remove all keys below root - empties out mapping"""
        logger.info("Clear mapping at %s", self.root)
        try:
            self.fs.rm(self.root, True)
            self.fs.mkdir(self.root)
        except:  # noqa: E722
            # best-effort: missing root or backend quirks are ignored
            pass

    def getitems(self, keys, on_error="raise"):
        """Fetch multiple items from the store

        If the backend is async-able, this might proceed concurrently

        Parameters
        ----------
        keys: list(str)
            They keys to be fetched
        on_error : "raise", "omit", "return"
            If raise, an underlying exception will be raised (converted to KeyError
            if the type is in self.missing_exceptions); if omit, keys with exception
            will simply not be included in the output; if "return", all keys are
            included in the output, but the value will be bytes or an exception
            instance.

        Returns
        -------
        dict(key, bytes|exception)
        """
        keys2 = [self._key_to_str(k) for k in keys]
        # For "omit" we still fetch with "return" and filter afterwards.
        oe = on_error if on_error == "raise" else "return"
        try:
            out = self.fs.cat(keys2, on_error=oe)
            if isinstance(out, bytes):
                # single-key fetch returns bare bytes; normalize to a dict
                out = {keys2[0]: out}
        except self.missing_exceptions as e:
            raise KeyError from e
        # translate backend "missing" exceptions into KeyError instances
        out = {
            k: (KeyError() if isinstance(v, self.missing_exceptions) else v)
            for k, v in out.items()
        }
        return {
            key: out[k2] if on_error == "raise" else out.get(k2, KeyError(k2))
            for key, k2 in zip(keys, keys2)
            if on_error == "return" or not isinstance(out[k2], BaseException)
        }

    def setitems(self, values_dict):
        """Set the values of multiple items in the store

        Parameters
        ----------
        values_dict: dict(str, bytes)
        """
        values = {self._key_to_str(k): maybe_convert(v) for k, v in values_dict.items()}
        self.fs.pipe(values)

    def delitems(self, keys):
        """Remove multiple keys from the store"""
        self.fs.rm([self._key_to_str(k) for k in keys])

    def _key_to_str(self, key):
        """Generate full path for the key"""
        if not isinstance(key, str):
            # raise TypeError("key must be of type `str`, got `{type(key).__name__}`"
            warnings.warn(
                "from fsspec 2023.5 onward FSMap non-str keys will raise TypeError",
                DeprecationWarning,
            )
            if isinstance(key, list):
                key = tuple(key)
            key = str(key)
        return f"{self._root_key_to_str}{key}".rstrip("/")

    def _str_to_key(self, s):
        """Strip path of to leave key name"""
        return s[len(self.root) :].lstrip("/")

    def __getitem__(self, key, default=None):
        """Retrieve data"""
        k = self._key_to_str(key)
        try:
            result = self.fs.cat(k)
        except self.missing_exceptions as exc:
            if default is not None:
                return default
            raise KeyError(key) from exc
        return result

    def pop(self, key, default=None):
        """Pop data"""
        result = self.__getitem__(key, default)
        try:
            del self[key]
        except KeyError:
            # value was already fetched; absence on delete is tolerated
            pass
        return result

    def __setitem__(self, key, value):
        """Store value in key"""
        key = self._key_to_str(key)
        self.fs.mkdirs(self.fs._parent(key), exist_ok=True)
        self.fs.pipe_file(key, maybe_convert(value))

    def __iter__(self):
        return (self._str_to_key(x) for x in self.fs.find(self.root))

    def __len__(self):
        return len(self.fs.find(self.root))

    def __delitem__(self, key):
        """Remove key"""
        try:
            self.fs.rm(self._key_to_str(key))
        except Exception as exc:
            raise KeyError from exc

    def __contains__(self, key):
        """Does key exist in mapping?"""
        path = self._key_to_str(key)
        return self.fs.isfile(path)

    def __reduce__(self):
        # pickle support: re-create without check/create side effects
        return FSMap, (self.root, self.fs, False, False, self.missing_exceptions)
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def maybe_convert(value):
    """Coerce array-like values to ``bytes``; pass other values through.

    ``array.array`` instances and objects exposing ``__array__`` (e.g.
    numpy arrays) are converted via the buffer protocol; datetime64 /
    timedelta64 arrays are first reinterpreted as int64, since the buffer
    interface does not support those dtypes.
    """
    array_like = isinstance(value, array.array) or hasattr(value, "__array__")
    if not array_like:
        return value
    if hasattr(value, "dtype") and value.dtype.kind in "Mm":
        # The buffer interface doesn't support datetime64/timdelta64 numpy
        # arrays
        value = value.view("int64")
    return bytes(memoryview(value))
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def get_mapper(
    url="",
    check=False,
    create=False,
    missing_exceptions=None,
    alternate_root=None,
    **kwargs,
):
    """Create key-value interface for given URL and options

    The URL will be of the form "protocol://location" and point to the root
    of the mapper required. All keys will be file-names below this location,
    and their values the contents of each key.

    Also accepts compound URLs like zip::s3://bucket/file.zip , see ``fsspec.open``.

    Parameters
    ----------
    url: str
        Root URL of mapping
    check: bool
        Whether to attempt to read from the location before instantiation, to
        check that the mapping does exist
    create: bool
        Whether to make the directory corresponding to the root before
        instantiating
    missing_exceptions: None or tuple
        If given, these exception types will be regarded as missing keys and
        return KeyError when trying to read data. By default, you get
        (FileNotFoundError, IsADirectoryError, NotADirectoryError)
    alternate_root: None or str
        In cases of complex URLs, the parser may fail to pick the correct part
        for the mapper root, so this arg can override

    Returns
    -------
    ``FSMap`` instance, the dict-like key-value store.
    """
    # Resolve the filesystem and the protocol-stripped path in one step;
    # each backend's open() then works with plain paths.
    fs, urlpath = url_to_fs(url, **kwargs)
    root = urlpath if alternate_root is None else alternate_root
    return FSMap(root, fs, check, create, missing_exceptions=missing_exceptions)
|
.venv/lib/python3.11/site-packages/fsspec/parquet.py
ADDED
|
@@ -0,0 +1,541 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
import json
|
| 3 |
+
import warnings
|
| 4 |
+
|
| 5 |
+
from .core import url_to_fs
|
| 6 |
+
from .utils import merge_offset_ranges
|
| 7 |
+
|
| 8 |
+
# Parquet-Specific Utilities for fsspec
|
| 9 |
+
#
|
| 10 |
+
# Most of the functions defined in this module are NOT
|
| 11 |
+
# intended for public consumption. The only exception
|
| 12 |
+
# to this is `open_parquet_file`, which should be used
|
| 13 |
+
# place of `fs.open()` to open parquet-formatted files
|
| 14 |
+
# on remote file systems.
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def open_parquet_file(
    path,
    mode="rb",
    fs=None,
    metadata=None,
    columns=None,
    row_groups=None,
    storage_options=None,
    strict=False,
    engine="auto",
    max_gap=64_000,
    max_block=256_000_000,
    footer_sample_size=1_000_000,
    **kwargs,
):
    """
    Return a file-like object for a single Parquet file.

    The specified parquet `engine` will be used to parse the
    footer metadata, and determine the required byte ranges
    from the file. The target path will then be opened with
    the "parts" (`KnownPartsOfAFile`) caching strategy.

    Note that this method is intended for usage with remote
    file systems, and is unlikely to improve parquet-read
    performance on local file systems.

    Parameters
    ----------
    path: str
        Target file path.
    mode: str, optional
        Mode option to be passed through to `fs.open`. Default is "rb".
    metadata: Any, optional
        Parquet metadata object. Object type must be supported
        by the backend parquet engine. For now, only the "fastparquet"
        engine supports an explicit `ParquetFile` metadata object.
        If a metadata object is supplied, the remote footer metadata
        will not need to be transferred into local memory.
    fs: AbstractFileSystem, optional
        Filesystem object to use for opening the file. If nothing is
        specified, an `AbstractFileSystem` object will be inferred.
    engine : str, default "auto"
        Parquet engine to use for metadata parsing. Allowed options
        include "fastparquet", "pyarrow", and "auto". The specified
        engine must be installed in the current environment. If
        "auto" is specified, and both engines are installed,
        "fastparquet" will take precedence over "pyarrow".
    columns: list, optional
        List of all column names that may be read from the file.
    row_groups : list, optional
        List of all row-groups that may be read from the file. This
        may be a list of row-group indices (integers), or it may be
        a list of `RowGroup` metadata objects (if the "fastparquet"
        engine is used).
    storage_options : dict, optional
        Used to generate an `AbstractFileSystem` object if `fs` was
        not specified.
    strict : bool, optional
        Whether the resulting `KnownPartsOfAFile` cache should
        fetch reads that go beyond a known byte-range boundary.
        If `False` (the default), any read that ends outside a
        known part will be zero padded. Note that using
        `strict=True` may be useful for debugging.
    max_gap : int, optional
        Neighboring byte ranges will only be merged when their
        inter-range gap is <= `max_gap`. Default is 64KB.
    max_block : int, optional
        Neighboring byte ranges will only be merged when the size of
        the aggregated range is <= `max_block`. Default is 256MB.
    footer_sample_size : int, optional
        Number of bytes to read from the end of the path to look
        for the footer metadata. If the sampled bytes do not contain
        the footer, a second read request will be required, and
        performance will suffer. Default is 1MB.
    **kwargs :
        Optional key-word arguments to pass to `fs.open`
    """

    # Make sure we have an `AbstractFileSystem` object
    # to work with
    if fs is None:
        fs = url_to_fs(path, **(storage_options or {}))[0]

    # For now, `columns == []` not supported. Just use
    # default `open` command with `path` input
    if columns is not None and len(columns) == 0:
        return fs.open(path, mode=mode)

    # Set the engine
    engine = _set_engine(engine)

    # Fetch the known byte ranges needed to read
    # `columns` and/or `row_groups`
    data = _get_parquet_byte_ranges(
        [path],
        fs,
        metadata=metadata,
        columns=columns,
        row_groups=row_groups,
        engine=engine,
        max_gap=max_gap,
        max_block=max_block,
        footer_sample_size=footer_sample_size,
    )

    # Extract file name from `data`
    # (a single path was passed, so `data` has at most one entry)
    fn = next(iter(data)) if data else path

    # Call self.open with "parts" caching
    # caller-provided cache_options are merged under the required
    # "data"/"strict" entries, which take precedence
    options = kwargs.pop("cache_options", {}).copy()
    return fs.open(
        fn,
        mode=mode,
        cache_type="parts",
        cache_options={
            **options,
            "data": data.get(fn, {}),
            "strict": strict,
        },
        **kwargs,
    )
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def _get_parquet_byte_ranges(
    paths,
    fs,
    metadata=None,
    columns=None,
    row_groups=None,
    max_gap=64_000,
    max_block=256_000_000,
    footer_sample_size=1_000_000,
    engine="auto",
):
    """Get a dictionary of the known byte ranges needed
    to read a specific column/row-group selection from a
    Parquet dataset. Each value in the output dictionary
    is intended for use as the `data` argument for the
    `KnownPartsOfAFile` caching strategy of a single path.

    Parameters
    ----------
    paths : list of str
        Parquet file paths to inspect.
    fs : AbstractFileSystem
        Filesystem used for `sizes` and `cat_ranges` calls.
    metadata : optional
        Engine-specific parquet metadata; when given, footer sampling is
        skipped entirely and the work is delegated to
        `_get_parquet_byte_ranges_from_metadata`.
    columns, row_groups : list, optional
        Target column names / row-group selection. When both are None,
        the entire file content is transferred.
    max_gap, max_block : int
        Range-merging parameters forwarded to `merge_offset_ranges`.
    footer_sample_size : int
        Number of trailing bytes sampled to find the footer metadata.
    engine : str or engine instance
        "auto", "fastparquet", "pyarrow", or an already-constructed engine.

    Returns
    -------
    dict
        ``{path: {(start, stop): bytes, ...}, ...}``
    """

    # Set engine if necessary
    if isinstance(engine, str):
        engine = _set_engine(engine)

    # Pass to specialized function if metadata is defined
    if metadata is not None:
        # Use the provided parquet metadata object
        # to avoid transferring/parsing footer metadata
        return _get_parquet_byte_ranges_from_metadata(
            metadata,
            fs,
            engine,
            columns=columns,
            row_groups=row_groups,
            max_gap=max_gap,
            max_block=max_block,
        )

    # Get file sizes asynchronously
    file_sizes = fs.sizes(paths)

    # Populate global paths, starts, & ends
    result = {}
    data_paths = []
    data_starts = []
    data_ends = []
    add_header_magic = True
    if columns is None and row_groups is None:
        # We are NOT selecting specific columns or row-groups.
        #
        # We can avoid sampling the footers, and just transfer
        # all file data with cat_ranges
        for i, path in enumerate(paths):
            result[path] = {}
            for b in range(0, file_sizes[i], max_block):
                data_paths.append(path)
                data_starts.append(b)
                data_ends.append(min(b + max_block, file_sizes[i]))
        add_header_magic = False  # "Magic" should already be included
    else:
        # We ARE selecting specific columns or row-groups.
        #
        # Gather file footers.
        # We just take the last `footer_sample_size` bytes of each
        # file (or the entire file if it is smaller than that)
        footer_starts = []
        footer_ends = []
        for i, path in enumerate(paths):
            footer_ends.append(file_sizes[i])
            sample_size = max(0, file_sizes[i] - footer_sample_size)
            footer_starts.append(sample_size)
        footer_samples = fs.cat_ranges(paths, footer_starts, footer_ends)

        # Check our footer samples and re-sample if necessary.
        # The parquet footer length lives in the 4 bytes just before the
        # trailing b"PAR1" magic, i.e. bytes [-8:-4] of the file.
        missing_footer_starts = footer_starts.copy()
        large_footer = 0
        for i, path in enumerate(paths):
            footer_size = int.from_bytes(footer_samples[i][-8:-4], "little")
            real_footer_start = file_sizes[i] - (footer_size + 8)
            if real_footer_start < footer_starts[i]:
                missing_footer_starts[i] = real_footer_start
                large_footer = max(large_footer, (footer_size + 8))
        if large_footer:
            warnings.warn(
                f"Not enough data was used to sample the parquet footer. "
                f"Try setting footer_sample_size >= {large_footer}."
            )
            # Fetch the portion of each footer that the first sample missed
            # and prepend it, so every sample now holds a complete footer.
            for i, block in enumerate(
                fs.cat_ranges(
                    paths,
                    missing_footer_starts,
                    footer_starts,
                )
            ):
                footer_samples[i] = block + footer_samples[i]
                footer_starts[i] = missing_footer_starts[i]

        # Calculate required byte ranges for each path
        for i, path in enumerate(paths):
            # Deal with small-file case.
            # Just include all remaining bytes of the file
            # in a single range.
            if file_sizes[i] < max_block:
                if footer_starts[i] > 0:
                    # Only need to transfer the data if the
                    # footer sample isn't already the whole file
                    data_paths.append(path)
                    data_starts.append(0)
                    data_ends.append(footer_starts[i])
                continue

            # Use "engine" to collect data byte ranges
            path_data_starts, path_data_ends = engine._parquet_byte_ranges(
                columns,
                row_groups=row_groups,
                footer=footer_samples[i],
                footer_start=footer_starts[i],
            )

            data_paths += [path] * len(path_data_starts)
            data_starts += path_data_starts
            data_ends += path_data_ends

        # Merge adjacent offset ranges
        data_paths, data_starts, data_ends = merge_offset_ranges(
            data_paths,
            data_starts,
            data_ends,
            max_gap=max_gap,
            max_block=max_block,
            sort=False,  # Should already be sorted
        )

        # Start by populating `result` with footer samples
        for i, path in enumerate(paths):
            result[path] = {(footer_starts[i], footer_ends[i]): footer_samples[i]}

    # Transfer the data byte-ranges into local memory
    _transfer_ranges(fs, result, data_paths, data_starts, data_ends)

    # Add b"PAR1" to header if necessary
    if add_header_magic:
        _add_header_magic(result)

    return result
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
def _get_parquet_byte_ranges_from_metadata(
    metadata,
    fs,
    engine,
    columns=None,
    row_groups=None,
    max_gap=64_000,
    max_block=256_000_000,
):
    """Compute known byte ranges when parquet metadata is already in hand.

    A lighter variant of `_get_parquet_byte_ranges`: because an
    engine-specific `metadata` object is supplied, no remote footer needs
    to be sampled before the required byte ranges can be calculated.
    Returns ``{path: {(start, stop): bytes}}``.
    """
    # Ask the engine for the (path, start, end) triplets it needs
    paths, starts, ends = engine._parquet_byte_ranges(
        columns,
        row_groups=row_groups,
        metadata=metadata,
    )

    # Coalesce nearby ranges; the engine output is expected pre-sorted
    paths, starts, ends = merge_offset_ranges(
        paths,
        starts,
        ends,
        max_gap=max_gap,
        max_block=max_block,
        sort=False,
    )

    # Pull every required range into local memory, grouped per file
    result = {}
    for fn in paths:
        result.setdefault(fn, {})
    _transfer_ranges(fs, result, paths, starts, ends)

    # Ensure each file's mapping starts with the b"PAR1" magic bytes
    _add_header_magic(result)

    return result
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
def _transfer_ranges(fs, blocks, paths, starts, ends):
|
| 329 |
+
# Use cat_ranges to gather the data byte_ranges
|
| 330 |
+
ranges = (paths, starts, ends)
|
| 331 |
+
for path, start, stop, data in zip(*ranges, fs.cat_ranges(*ranges)):
|
| 332 |
+
blocks[path][(start, stop)] = data
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
def _add_header_magic(data):
|
| 336 |
+
# Add b"PAR1" to file headers
|
| 337 |
+
for path in list(data.keys()):
|
| 338 |
+
add_magic = True
|
| 339 |
+
for k in data[path]:
|
| 340 |
+
if k[0] == 0 and k[1] >= 4:
|
| 341 |
+
add_magic = False
|
| 342 |
+
break
|
| 343 |
+
if add_magic:
|
| 344 |
+
data[path][(0, 4)] = b"PAR1"
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
def _set_engine(engine_str):
|
| 348 |
+
# Define a list of parquet engines to try
|
| 349 |
+
if engine_str == "auto":
|
| 350 |
+
try_engines = ("fastparquet", "pyarrow")
|
| 351 |
+
elif not isinstance(engine_str, str):
|
| 352 |
+
raise ValueError(
|
| 353 |
+
"Failed to set parquet engine! "
|
| 354 |
+
"Please pass 'fastparquet', 'pyarrow', or 'auto'"
|
| 355 |
+
)
|
| 356 |
+
elif engine_str not in ("fastparquet", "pyarrow"):
|
| 357 |
+
raise ValueError(f"{engine_str} engine not supported by `fsspec.parquet`")
|
| 358 |
+
else:
|
| 359 |
+
try_engines = [engine_str]
|
| 360 |
+
|
| 361 |
+
# Try importing the engines in `try_engines`,
|
| 362 |
+
# and choose the first one that succeeds
|
| 363 |
+
for engine in try_engines:
|
| 364 |
+
try:
|
| 365 |
+
if engine == "fastparquet":
|
| 366 |
+
return FastparquetEngine()
|
| 367 |
+
elif engine == "pyarrow":
|
| 368 |
+
return PyarrowEngine()
|
| 369 |
+
except ImportError:
|
| 370 |
+
pass
|
| 371 |
+
|
| 372 |
+
# Raise an error if a supported parquet engine
|
| 373 |
+
# was not found
|
| 374 |
+
raise ImportError(
|
| 375 |
+
f"The following parquet engines are not installed "
|
| 376 |
+
f"in your python environment: {try_engines}."
|
| 377 |
+
f"Please install 'fastparquert' or 'pyarrow' to "
|
| 378 |
+
f"utilize the `fsspec.parquet` module."
|
| 379 |
+
)
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
class FastparquetEngine:
    # The purpose of the FastparquetEngine class is
    # to check if fastparquet can be imported (on initialization)
    # and to define a `_parquet_byte_ranges` method. In the
    # future, this class may also be used to define other
    # methods/logic that are specific to fastparquet.

    def __init__(self):
        # Raises ImportError when fastparquet is absent; `_set_engine`
        # relies on that to fall through to the next candidate engine.
        import fastparquet as fp

        self.fp = fp

    def _row_group_filename(self, row_group, pf):
        # Resolve which physical file a row-group lives in (multi-file
        # datasets may spread row-groups across several paths).
        return pf.row_group_filename(row_group)

    def _parquet_byte_ranges(
        self,
        columns,
        row_groups=None,
        metadata=None,
        footer=None,
        footer_start=None,
    ):
        """Collect byte ranges for the selected columns/row-groups.

        Either `metadata` (a fastparquet ``ParquetFile``) or `footer`
        (raw footer bytes, with `footer_start` giving its file offset)
        must be provided. Returns ``(paths, starts, ends)`` when
        `metadata` was given, else ``(starts, ends)`` for a single file.
        """
        # Initialize offset ranges and define ParquetFile metadata
        pf = metadata
        data_paths, data_starts, data_ends = [], [], []
        if pf is None:
            pf = self.fp.ParquetFile(io.BytesIO(footer))

        # Convert columns to a set and add any index columns
        # specified in the pandas metadata (just in case)
        column_set = None if columns is None else set(columns)
        if column_set is not None and hasattr(pf, "pandas_metadata"):
            md_index = [
                ind
                for ind in pf.pandas_metadata.get("index_columns", [])
                # Ignore RangeIndex information
                if not isinstance(ind, dict)
            ]
            column_set |= set(md_index)

        # Check if row_groups is a list of integers
        # or a list of row-group metadata
        if row_groups and not isinstance(row_groups[0], int):
            # Input row_groups contains row-group metadata
            row_group_indices = None
        else:
            # Input row_groups contains row-group indices
            # (or is None/empty, meaning "all row-groups")
            row_group_indices = row_groups
            row_groups = pf.row_groups

        # Loop through column chunks to add required byte ranges
        for r, row_group in enumerate(row_groups):
            # Skip this row-group if we are targeting
            # specific row-groups
            if row_group_indices is None or r in row_group_indices:
                # Find the target parquet-file path for `row_group`
                fn = self._row_group_filename(row_group, pf)

                for column in row_group.columns:
                    name = column.meta_data.path_in_schema[0]
                    # Skip this column if we are targeting
                    # specific columns
                    if column_set is None or name in column_set:
                        # A chunk starts at its dictionary page when one
                        # exists, otherwise at its first data page.
                        file_offset0 = column.meta_data.dictionary_page_offset
                        if file_offset0 is None:
                            file_offset0 = column.meta_data.data_page_offset
                        num_bytes = column.meta_data.total_compressed_size
                        if footer_start is None or file_offset0 < footer_start:
                            data_paths.append(fn)
                            data_starts.append(file_offset0)
                            # Clamp the range at footer_start so it never
                            # overlaps the already-sampled footer bytes.
                            # NOTE: `footer_start or ...` also falls back
                            # when footer_start == 0 (whole-file sample).
                            data_ends.append(
                                min(
                                    file_offset0 + num_bytes,
                                    footer_start or (file_offset0 + num_bytes),
                                )
                            )

        if metadata:
            # The metadata in this call may map to multiple
            # file paths. Need to include `data_paths`
            return data_paths, data_starts, data_ends
        return data_starts, data_ends
|
| 465 |
+
|
| 466 |
+
|
| 467 |
+
class PyarrowEngine:
    # The purpose of the PyarrowEngine class is
    # to check if pyarrow can be imported (on initialization)
    # and to define a `_parquet_byte_ranges` method. In the
    # future, this class may also be used to define other
    # methods/logic that are specific to pyarrow.

    def __init__(self):
        # Raises ImportError when pyarrow is absent; `_set_engine`
        # relies on that to fall through to the next candidate engine.
        import pyarrow.parquet as pq

        self.pq = pq

    def _row_group_filename(self, row_group, metadata):
        # Multi-file metadata objects are not supported by this engine
        # (see the ValueError in `_parquet_byte_ranges`).
        raise NotImplementedError

    def _parquet_byte_ranges(
        self,
        columns,
        row_groups=None,
        metadata=None,
        footer=None,
        footer_start=None,
    ):
        """Collect byte ranges for the selected columns/row-groups of a
        single file, from raw `footer` bytes located at `footer_start`.

        Returns ``(starts, ends)``. `metadata` input is rejected.
        """
        if metadata is not None:
            raise ValueError("metadata input not supported for PyarrowEngine")

        data_starts, data_ends = [], []
        md = self.pq.ParquetFile(io.BytesIO(footer)).metadata

        # Convert columns to a set and add any index columns
        # specified in the pandas metadata (just in case)
        column_set = None if columns is None else set(columns)
        if column_set is not None:
            schema = md.schema.to_arrow_schema()
            has_pandas_metadata = (
                schema.metadata is not None and b"pandas" in schema.metadata
            )
            if has_pandas_metadata:
                md_index = [
                    ind
                    for ind in json.loads(
                        schema.metadata[b"pandas"].decode("utf8")
                    ).get("index_columns", [])
                    # Ignore RangeIndex information
                    if not isinstance(ind, dict)
                ]
                column_set |= set(md_index)

        # Loop through column chunks to add required byte ranges
        for r in range(md.num_row_groups):
            # Skip this row-group if we are targeting
            # specific row-groups
            if row_groups is None or r in row_groups:
                row_group = md.row_group(r)
                for c in range(row_group.num_columns):
                    column = row_group.column(c)
                    name = column.path_in_schema
                    # Skip this column if we are targeting
                    # specific columns. Nested fields are matched by
                    # their root name ("a.b.c" -> "a") as well.
                    split_name = name.split(".")[0]
                    if (
                        column_set is None
                        or name in column_set
                        or split_name in column_set
                    ):
                        # A chunk starts at its dictionary page when one
                        # exists, otherwise at its first data page.
                        file_offset0 = column.dictionary_page_offset
                        if file_offset0 is None:
                            file_offset0 = column.data_page_offset
                        num_bytes = column.total_compressed_size
                        # Clamp at footer_start so ranges never overlap
                        # the already-sampled footer bytes.
                        if file_offset0 < footer_start:
                            data_starts.append(file_offset0)
                            data_ends.append(
                                min(file_offset0 + num_bytes, footer_start)
                            )
        return data_starts, data_ends
|
.venv/lib/python3.11/site-packages/fsspec/registry.py
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import importlib
|
| 4 |
+
import types
|
| 5 |
+
import warnings
|
| 6 |
+
|
| 7 |
+
__all__ = ["registry", "get_filesystem_class", "default"]
|
| 8 |
+
|
| 9 |
+
# internal, mutable
|
| 10 |
+
_registry: dict[str, type] = {}
|
| 11 |
+
|
| 12 |
+
# external, immutable
|
| 13 |
+
registry = types.MappingProxyType(_registry)
|
| 14 |
+
default = "file"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def register_implementation(name, cls, clobber=False, errtxt=None):
    """Associate a protocol name with a filesystem implementation.

    Parameters
    ----------
    name: str
        Protocol name to associate with the class
    cls: class or str
        A concrete fsspec-compliant class goes straight into the live
        registry. A dotted-path string ("package.module.Class") is stored
        in ``known_implementations`` instead, deferring the import until
        the filesystem is first used.
    clobber: bool (optional)
        Whether an existing entry under the same name may be overwritten;
        when False, a conflicting registration raises ValueError.
    errtxt: str (optional)
        Custom message to show if importing a string-registered class
        later fails.
    """
    if isinstance(cls, str):
        # String target: record a deferred import in known_implementations
        blocked = name in known_implementations and clobber is False
        if not blocked:
            known_implementations[name] = {
                "class": cls,
                "err": errtxt or f"{cls} import failed for protocol {name}",
            }
        elif known_implementations[name]["class"] != cls:
            # Re-registering the identical path is a silent no-op;
            # a different path without clobber is an error.
            raise ValueError(
                f"Name ({name}) already in the known_implementations and clobber "
                f"is False"
            )
    else:
        # Concrete class: install it directly in the live registry
        blocked = name in registry and clobber is False
        if not blocked:
            _registry[name] = cls
        elif _registry[name] is not cls:
            # Same object again is a silent no-op; a different class
            # without clobber is an error.
            raise ValueError(
                f"Name ({name}) already in the registry and clobber is False"
            )
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# protocols mapped to the class which implements them. This dict can be
# updated with register_implementation
# Each entry maps a protocol name to {"class": dotted import path,
# "err": message shown when that deferred import fails}. Entries without
# "err" rely only on fsspec itself. Keys must stay alphabetically sorted
# (enforced by the assert below).
known_implementations = {
    "abfs": {
        "class": "adlfs.AzureBlobFileSystem",
        "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
    },
    "adl": {
        "class": "adlfs.AzureDatalakeFileSystem",
        "err": "Install adlfs to access Azure Datalake Gen1",
    },
    "arrow_hdfs": {
        "class": "fsspec.implementations.arrow.HadoopFileSystem",
        "err": "pyarrow and local java libraries required for HDFS",
    },
    "asynclocal": {
        "class": "morefs.asyn_local.AsyncLocalFileSystem",
        "err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem",
    },
    "az": {
        "class": "adlfs.AzureBlobFileSystem",
        "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
    },
    "blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"},
    "box": {
        "class": "boxfs.BoxFileSystem",
        "err": "Please install boxfs to access BoxFileSystem",
    },
    "cached": {"class": "fsspec.implementations.cached.CachingFileSystem"},
    "dask": {
        "class": "fsspec.implementations.dask.DaskWorkerFileSystem",
        "err": "Install dask distributed to access worker file system",
    },
    "data": {"class": "fsspec.implementations.data.DataFileSystem"},
    "dbfs": {
        "class": "fsspec.implementations.dbfs.DatabricksFileSystem",
        "err": "Install the requests package to use the DatabricksFileSystem",
    },
    "dir": {"class": "fsspec.implementations.dirfs.DirFileSystem"},
    "dropbox": {
        "class": "dropboxdrivefs.DropboxDriveFileSystem",
        "err": (
            'DropboxFileSystem requires "dropboxdrivefs","requests" and "'
            '"dropbox" to be installed'
        ),
    },
    "dvc": {
        "class": "dvc.api.DVCFileSystem",
        "err": "Install dvc to access DVCFileSystem",
    },
    "file": {"class": "fsspec.implementations.local.LocalFileSystem"},
    "filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"},
    "ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"},
    "gcs": {
        "class": "gcsfs.GCSFileSystem",
        "err": "Please install gcsfs to access Google Storage",
    },
    "gdrive": {
        "class": "gdrivefs.GoogleDriveFileSystem",
        "err": "Please install gdrivefs for access to Google Drive",
    },
    "generic": {"class": "fsspec.generic.GenericFileSystem"},
    "git": {
        "class": "fsspec.implementations.git.GitFileSystem",
        "err": "Install pygit2 to browse local git repos",
    },
    "github": {
        "class": "fsspec.implementations.github.GithubFileSystem",
        "err": "Install the requests package to use the github FS",
    },
    "gs": {
        "class": "gcsfs.GCSFileSystem",
        "err": "Please install gcsfs to access Google Storage",
    },
    "hdfs": {
        "class": "fsspec.implementations.arrow.HadoopFileSystem",
        "err": "pyarrow and local java libraries required for HDFS",
    },
    "hf": {
        "class": "huggingface_hub.HfFileSystem",
        "err": "Install huggingface_hub to access HfFileSystem",
    },
    "http": {
        "class": "fsspec.implementations.http.HTTPFileSystem",
        "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
    },
    "https": {
        "class": "fsspec.implementations.http.HTTPFileSystem",
        "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
    },
    "jlab": {
        "class": "fsspec.implementations.jupyter.JupyterFileSystem",
        "err": "Jupyter FS requires requests to be installed",
    },
    "jupyter": {
        "class": "fsspec.implementations.jupyter.JupyterFileSystem",
        "err": "Jupyter FS requires requests to be installed",
    },
    "lakefs": {
        "class": "lakefs_spec.LakeFSFileSystem",
        "err": "Please install lakefs-spec to access LakeFSFileSystem",
    },
    "libarchive": {
        "class": "fsspec.implementations.libarchive.LibArchiveFileSystem",
        "err": "LibArchive requires to be installed",
    },
    "local": {"class": "fsspec.implementations.local.LocalFileSystem"},
    "memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"},
    "oci": {
        "class": "ocifs.OCIFileSystem",
        "err": "Install ocifs to access OCI Object Storage",
    },
    "ocilake": {
        "class": "ocifs.OCIFileSystem",
        "err": "Install ocifs to access OCI Data Lake",
    },
    "oss": {
        "class": "ossfs.OSSFileSystem",
        "err": "Install ossfs to access Alibaba Object Storage System",
    },
    "reference": {"class": "fsspec.implementations.reference.ReferenceFileSystem"},
    "root": {
        "class": "fsspec_xrootd.XRootDFileSystem",
        "err": (
            "Install fsspec-xrootd to access xrootd storage system. "
            "Note: 'root' is the protocol name for xrootd storage systems, "
            "not referring to root directories"
        ),
    },
    "s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
    "s3a": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
    "sftp": {
        "class": "fsspec.implementations.sftp.SFTPFileSystem",
        "err": 'SFTPFileSystem requires "paramiko" to be installed',
    },
    "simplecache": {"class": "fsspec.implementations.cached.SimpleCacheFileSystem"},
    "smb": {
        "class": "fsspec.implementations.smb.SMBFileSystem",
        "err": 'SMB requires "smbprotocol" or "smbprotocol[kerberos]" installed',
    },
    "ssh": {
        "class": "fsspec.implementations.sftp.SFTPFileSystem",
        "err": 'SFTPFileSystem requires "paramiko" to be installed',
    },
    "tar": {"class": "fsspec.implementations.tar.TarFileSystem"},
    "tosfs": {
        "class": "tosfs.TosFileSystem",
        "err": "Install tosfs to access ByteDance volcano engine Tinder Object Storage",
    },
    "wandb": {"class": "wandbfs.WandbFS", "err": "Install wandbfs to access wandb"},
    "webdav": {
        "class": "webdav4.fsspec.WebdavFileSystem",
        "err": "Install webdav4 to access WebDAV",
    },
    "webhdfs": {
        "class": "fsspec.implementations.webhdfs.WebHDFS",
        "err": 'webHDFS access requires "requests" to be installed',
    },
    "zip": {"class": "fsspec.implementations.zip.ZipFileSystem"},
}

# Guard the sorted-keys invariant at import time.
assert list(known_implementations) == sorted(known_implementations), (
    "Not in alphabetical order"
)
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def get_filesystem_class(protocol):
    """Fetch the implementation class registered for *protocol*.

    ``known_implementations`` maps protocol names to dotted class paths.
    On first use of a protocol the class is imported and cached in the
    registry; later calls hit the registry directly. A falsy *protocol*
    resolves to the module-level ``default``.

    Raises
    ------
    ValueError
        If the protocol is not known at all.
    ImportError
        If the implementation's extra dependencies are missing; the
        message comes from the "err" field of ``known_implementations``.
    """
    protocol = protocol or default

    if protocol not in registry:
        if protocol not in known_implementations:
            raise ValueError(f"Protocol not known: {protocol}")
        bit = known_implementations[protocol]
        try:
            register_implementation(protocol, _import_class(bit["class"]))
        except ImportError as e:
            raise ImportError(bit["err"]) from e

    cls = registry[protocol]
    # Stamp the concrete protocol onto classes that declare none
    # (or only the "abstract" placeholder).
    if getattr(cls, "protocol", None) in ("abstract", None):
        cls.protocol = protocol
    return cls
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
s3_msg = """Your installed version of s3fs is very old and known to cause
|
| 257 |
+
severe performance issues, see also https://github.com/dask/dask/issues/10276
|
| 258 |
+
|
| 259 |
+
To fix, you should specify a lower version bound on s3fs, or
|
| 260 |
+
update the current installation.
|
| 261 |
+
"""
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def _import_class(fqp: str):
|
| 265 |
+
"""Take a fully-qualified path and return the imported class or identifier.
|
| 266 |
+
|
| 267 |
+
``fqp`` is of the form "package.module.klass" or
|
| 268 |
+
"package.module:subobject.klass".
|
| 269 |
+
|
| 270 |
+
Warnings
|
| 271 |
+
--------
|
| 272 |
+
This can import arbitrary modules. Make sure you haven't installed any modules
|
| 273 |
+
that may execute malicious code at import time.
|
| 274 |
+
"""
|
| 275 |
+
if ":" in fqp:
|
| 276 |
+
mod, name = fqp.rsplit(":", 1)
|
| 277 |
+
else:
|
| 278 |
+
mod, name = fqp.rsplit(".", 1)
|
| 279 |
+
|
| 280 |
+
is_s3 = mod == "s3fs"
|
| 281 |
+
mod = importlib.import_module(mod)
|
| 282 |
+
if is_s3 and mod.__version__.split(".") < ["0", "5"]:
|
| 283 |
+
warnings.warn(s3_msg)
|
| 284 |
+
for part in name.split("."):
|
| 285 |
+
mod = getattr(mod, part)
|
| 286 |
+
|
| 287 |
+
if not isinstance(mod, type):
|
| 288 |
+
raise TypeError(f"{fqp} is not a class")
|
| 289 |
+
|
| 290 |
+
return mod
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
def filesystem(protocol, **storage_options):
    """Instantiate a filesystem for the given protocol.

    ``storage_options`` are specific to the protocol being chosen, and are
    passed directly to the implementation class's constructor.
    """
    if protocol == "arrow_hdfs":
        warnings.warn(
            "The 'arrow_hdfs' protocol has been deprecated and will be "
            "removed in the future. Specify it as 'hdfs'.",
            DeprecationWarning,
        )

    implementation = get_filesystem_class(protocol)
    return implementation(**storage_options)
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
def available_protocols():
    """Return a list of the implemented protocols.

    Note that any given protocol may require extra packages to be importable.
    """
    return [*known_implementations]
|
.venv/lib/python3.11/site-packages/fsspec/spec.py
ADDED
|
@@ -0,0 +1,2242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import io
|
| 4 |
+
import json
|
| 5 |
+
import logging
|
| 6 |
+
import os
|
| 7 |
+
import threading
|
| 8 |
+
import warnings
|
| 9 |
+
import weakref
|
| 10 |
+
from errno import ESPIPE
|
| 11 |
+
from glob import has_magic
|
| 12 |
+
from hashlib import sha256
|
| 13 |
+
from typing import Any, ClassVar
|
| 14 |
+
|
| 15 |
+
from .callbacks import DEFAULT_CALLBACK
|
| 16 |
+
from .config import apply_config, conf
|
| 17 |
+
from .dircache import DirCache
|
| 18 |
+
from .transaction import Transaction
|
| 19 |
+
from .utils import (
|
| 20 |
+
_unstrip_protocol,
|
| 21 |
+
glob_translate,
|
| 22 |
+
isfilelike,
|
| 23 |
+
other_paths,
|
| 24 |
+
read_block,
|
| 25 |
+
stringify_path,
|
| 26 |
+
tokenize,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
# Module-wide logger; everything in this module logs under the "fsspec" name.
logger = logging.getLogger("fsspec")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def make_instance(cls, args, kwargs):
    """Construct ``cls(*args, **kwargs)``.

    Defined at module level so it can serve as a picklable constructor
    (see ``AbstractFileSystem.__reduce__``).
    """
    instance = cls(*args, **kwargs)
    return instance
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class _Cached(type):
    """
    Metaclass for caching file system instances.

    Notes
    -----
    Instances are cached according to

    * The values of the class attributes listed in `_extra_tokenize_attributes`
    * The arguments passed to ``__init__``.

    This creates an additional reference to the filesystem, which prevents the
    filesystem from being garbage collected when all *user* references go away.
    A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also*
    be made for a filesystem instance to be garbage collected.
    """

    def __init__(cls, *args, **kwargs):
        # Runs once per *class* (not per instance): give each filesystem class
        # its own instance cache and record the creating process id.
        super().__init__(*args, **kwargs)
        # Note: we intentionally create a reference here, to avoid garbage
        # collecting instances when all other references are gone. To really
        # delete a FileSystem, the cache must be cleared.
        if conf.get("weakref_instance_cache"):  # pragma: no cover
            # debug option for analysing fork/spawn conditions
            cls._cache = weakref.WeakValueDictionary()
        else:
            cls._cache = {}
        cls._pid = os.getpid()

    def __call__(cls, *args, **kwargs):
        # Instance creation: return a cached instance when an equivalent one
        # (same class, pid, thread ident, args/kwargs and extra class
        # attributes) already exists.
        kwargs = apply_config(cls, kwargs)
        extra_tokens = tuple(
            getattr(cls, attr, None) for attr in cls._extra_tokenize_attributes
        )
        token = tokenize(
            cls, cls._pid, threading.get_ident(), *args, *extra_tokens, **kwargs
        )
        # NOTE(review): "skip_instance_cache" is popped *after* tokenize above,
        # so it participates in the cache key — confirm this is intentional.
        skip = kwargs.pop("skip_instance_cache", False)
        if os.getpid() != cls._pid:
            # Process changed (fork): cached instances belong to the parent
            # and may hold unusable resources, so start with a fresh cache.
            cls._cache.clear()
            cls._pid = os.getpid()
        if not skip and cls.cachable and token in cls._cache:
            cls._latest = token
            return cls._cache[token]
        else:
            obj = super().__call__(*args, **kwargs)
            # Setting _fs_token here causes some static linters to complain.
            obj._fs_token_ = token
            obj.storage_args = args
            obj.storage_options = kwargs
            if obj.async_impl and obj.mirror_sync_methods:
                from .asyn import mirror_sync_methods

                mirror_sync_methods(obj)

            if cls.cachable and not skip:
                cls._latest = token
                cls._cache[token] = obj
            return obj
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class AbstractFileSystem(metaclass=_Cached):
|
| 98 |
+
"""
|
| 99 |
+
An abstract super-class for pythonic file-systems
|
| 100 |
+
|
| 101 |
+
Implementations are expected to be compatible with or, better, subclass
|
| 102 |
+
from here.
|
| 103 |
+
"""
|
| 104 |
+
|
| 105 |
+
cachable = True # this class can be cached, instances reused
|
| 106 |
+
_cached = False
|
| 107 |
+
blocksize = 2**22
|
| 108 |
+
sep = "/"
|
| 109 |
+
protocol: ClassVar[str | tuple[str, ...]] = "abstract"
|
| 110 |
+
_latest = None
|
| 111 |
+
async_impl = False
|
| 112 |
+
mirror_sync_methods = False
|
| 113 |
+
root_marker = "" # For some FSs, may require leading '/' or other character
|
| 114 |
+
transaction_type = Transaction
|
| 115 |
+
|
| 116 |
+
#: Extra *class attributes* that should be considered when hashing.
|
| 117 |
+
_extra_tokenize_attributes = ()
|
| 118 |
+
|
| 119 |
+
# Set by _Cached metaclass
|
| 120 |
+
storage_args: tuple[Any, ...]
|
| 121 |
+
storage_options: dict[str, Any]
|
| 122 |
+
|
| 123 |
+
    def __init__(self, *args, **storage_options):
        """Create and configure file-system instance

        Instances may be cachable, so if similar enough arguments are seen
        a new instance is not required. The token attribute exists to allow
        implementations to cache instances if they wish.

        A reasonable default should be provided if there are no arguments.

        Subclasses should call this method.

        Parameters
        ----------
        use_listings_cache, listings_expiry_time, max_paths:
            passed to ``DirCache``, if the implementation supports
            directory listing caching. Pass use_listings_cache=False
            to disable such caching.
        skip_instance_cache: bool
            If this is a cachable implementation, pass True here to force
            creating a new instance even if a matching instance exists, and prevent
            storing this instance.
        asynchronous: bool
        loop: asyncio-compatible IOLoop or None
        """
        if self._cached:
            # reusing instance, don't change
            return
        self._cached = True
        self._intrans = False
        self._transaction = None
        self._invalidated_caches_in_transaction = []
        # All storage_options are forwarded; DirCache presumably picks out the
        # listing-cache related ones — see dircache.py
        self.dircache = DirCache(**storage_options)

        if storage_options.pop("add_docs", None):
            warnings.warn("add_docs is no longer supported.", FutureWarning)

        if storage_options.pop("add_aliases", None):
            warnings.warn("add_aliases has been removed.", FutureWarning)
        # This is set in _Cached
        self._fs_token_ = None
|
| 163 |
+
|
| 164 |
+
    @property
    def fsid(self):
        """Persistent filesystem id that can be used to compare filesystems
        across sessions.
        """
        # Abstract: concrete implementations must override this property.
        raise NotImplementedError
|
| 170 |
+
|
| 171 |
+
    @property
    def _fs_token(self):
        # Cache token assigned by the _Cached metaclass at construction time.
        return self._fs_token_
|
| 174 |
+
|
| 175 |
+
    def __dask_tokenize__(self):
        # Dask hook: deterministically identify this filesystem by cache token.
        return self._fs_token
|
| 177 |
+
|
| 178 |
+
    def __hash__(self):
        # Assumes _fs_token is a hexadecimal digest string (produced by
        # tokenize in _Cached) — parsed base-16 for a stable integer hash.
        return int(self._fs_token, 16)
|
| 180 |
+
|
| 181 |
+
def __eq__(self, other):
|
| 182 |
+
return isinstance(other, type(self)) and self._fs_token == other._fs_token
|
| 183 |
+
|
| 184 |
+
def __reduce__(self):
|
| 185 |
+
return make_instance, (type(self), self.storage_args, self.storage_options)
|
| 186 |
+
|
| 187 |
+
    @classmethod
    def _strip_protocol(cls, path):
        """Turn path from fully-qualified to file-system-specific

        May require FS-specific handling, e.g., for relative paths or links.
        """
        if isinstance(path, list):
            # apply recursively to every element of a list of paths
            return [cls._strip_protocol(p) for p in path]
        path = stringify_path(path)
        # a filesystem class may answer to several protocol names
        protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol
        for protocol in protos:
            if path.startswith(protocol + "://"):
                path = path[len(protocol) + 3 :]
            elif path.startswith(protocol + "::"):
                # also accept the "protocol::" prefix form
                path = path[len(protocol) + 2 :]
        path = path.rstrip("/")
        # use of root_marker to make minimum required path, e.g., "/"
        return path or cls.root_marker
|
| 205 |
+
|
| 206 |
+
def unstrip_protocol(self, name: str) -> str:
|
| 207 |
+
"""Format FS-specific path to generic, including protocol"""
|
| 208 |
+
protos = (self.protocol,) if isinstance(self.protocol, str) else self.protocol
|
| 209 |
+
for protocol in protos:
|
| 210 |
+
if name.startswith(f"{protocol}://"):
|
| 211 |
+
return name
|
| 212 |
+
return f"{protos[0]}://{name}"
|
| 213 |
+
|
| 214 |
+
    @staticmethod
    def _get_kwargs_from_urls(path):
        """If kwargs can be encoded in the paths, extract them here

        This should happen before instantiation of the class; incoming paths
        then should be amended to strip the options in methods.

        Examples may look like an sftp path "sftp://user@host:/my/path", where
        the user and host should become kwargs and later get stripped.

        Returns
        -------
        dict of storage options inferred from the URL (empty by default).
        """
        # by default, nothing happens
        return {}
|
| 226 |
+
|
| 227 |
+
@classmethod
|
| 228 |
+
def current(cls):
|
| 229 |
+
"""Return the most recently instantiated FileSystem
|
| 230 |
+
|
| 231 |
+
If no instance has been created, then create one with defaults
|
| 232 |
+
"""
|
| 233 |
+
if cls._latest in cls._cache:
|
| 234 |
+
return cls._cache[cls._latest]
|
| 235 |
+
return cls()
|
| 236 |
+
|
| 237 |
+
    @property
    def transaction(self):
        """A context within which files are committed together upon exit

        Requires the file class to implement `.commit()` and `.discard()`
        for the normal and exception cases.
        """
        # lazily create a single transaction object per filesystem instance
        if self._transaction is None:
            self._transaction = self.transaction_type(self)
        return self._transaction
|
| 247 |
+
|
| 248 |
+
    def start_transaction(self):
        """Begin write transaction for deferring files, non-context version"""
        self._intrans = True
        # a fresh transaction object replaces any previous one
        self._transaction = self.transaction_type(self)
        return self.transaction
|
| 253 |
+
|
| 254 |
+
    def end_transaction(self):
        """Finish write transaction, non-context version"""
        self.transaction.complete()
        self._transaction = None
        # The invalid cache must be cleared after the transaction is completed.
        # (invalidate_cache calls made during the transaction were deferred
        # into _invalidated_caches_in_transaction; replay them now.)
        for path in self._invalidated_caches_in_transaction:
            self.invalidate_cache(path)
        self._invalidated_caches_in_transaction.clear()
|
| 262 |
+
|
| 263 |
+
    def invalidate_cache(self, path=None):
        """
        Discard any cached directory information

        Parameters
        ----------
        path: string or None
            If None, clear all listings cached else listings at or under given
            path.
        """
        # Not necessary to implement invalidation mechanism, may have no cache.
        # But if have, you should call this method of parent class from your
        # subclass to ensure expiring caches after transactions correctly.
        # See the implementation of FTPFileSystem in ftp.py
        if self._intrans:
            # defer the invalidation until end_transaction replays it
            self._invalidated_caches_in_transaction.append(path)
|
| 279 |
+
|
| 280 |
+
    def mkdir(self, path, create_parents=True, **kwargs):
        """
        Create directory entry at path

        For systems that don't have true directories, may create an entry for
        this instance only and not touch the real filesystem

        Parameters
        ----------
        path: str
            location
        create_parents: bool
            if True, this is equivalent to ``makedirs``
        kwargs:
            may be permissions, etc.
        """
        pass  # not necessary to implement, may not have directories
|
| 297 |
+
|
| 298 |
+
    def makedirs(self, path, exist_ok=False):
        """Recursively make directories

        Creates directory at path and any intervening required directories.
        Raises exception if, for instance, the path already exists but is a
        file.

        The base implementation is a no-op; backends with real directories
        override it.

        Parameters
        ----------
        path: str
            leaf directory name
        exist_ok: bool (False)
            If False, will error if the target already exists
        """
        pass  # not necessary to implement, may not have directories
|
| 313 |
+
|
| 314 |
+
    def rmdir(self, path):
        """Remove a directory, if empty"""
        # Base implementation is a no-op; backends with real directories
        # override it.
        pass  # not necessary to implement, may not have directories
|
| 317 |
+
|
| 318 |
+
    def ls(self, path, detail=True, **kwargs):
        """List objects at path.

        This should include subdirectories and files at that location. The
        difference between a file and a directory must be clear when details
        are requested.

        The specific keys, or perhaps a FileInfo class, or similar, is TBD,
        but must be consistent across implementations.
        Must include:

        - full path to the entry (without protocol)
        - size of the entry, in bytes. If the value cannot be determined, will
          be ``None``.
        - type of entry, "file", "directory" or other

        Additional information
        may be present, appropriate to the file-system, e.g., generation,
        checksum, etc.

        May use refresh=True|False to allow use of self._ls_from_cache to
        check for a saved listing and avoid calling the backend. This would be
        common where listing may be expensive.

        Parameters
        ----------
        path: str
        detail: bool
            if True, gives a list of dictionaries, where each is the same as
            the result of ``info(path)``. If False, gives a list of paths
            (str).
        kwargs: may have additional backend-specific options, such as version
            information

        Returns
        -------
        List of strings if detail is False, or list of directory information
        dicts if detail is True.
        """
        # Abstract: every concrete filesystem must implement this method.
        raise NotImplementedError
|
| 358 |
+
|
| 359 |
+
    def _ls_from_cache(self, path):
        """Check cache for listing

        Returns listing, if found (may be empty list for a directory that exists
        but contains nothing), None if not in cache.
        """
        parent = self._parent(path)
        # First, see if this exact path has a cached listing.
        try:
            return self.dircache[path.rstrip("/")]
        except KeyError:
            pass
        # Otherwise, see if the parent's listing mentions this path.
        try:
            files = [
                f
                for f in self.dircache[parent]
                if f["name"] == path
                or (f["name"] == path.rstrip("/") and f["type"] == "directory")
            ]
            if len(files) == 0:
                # parent dir was listed but did not contain this file
                raise FileNotFoundError(path)
            return files
        except KeyError:
            # parent not cached either: fall through, returning None implicitly
            pass
|
| 383 |
+
|
| 384 |
+
    def walk(self, path, maxdepth=None, topdown=True, on_error="omit", **kwargs):
        """Return all files under the given path.

        List all files, recursing into subdirectories; output is iterator-style,
        like ``os.walk()``. For a simple list of files, ``find()`` is available.

        When topdown is True, the caller can modify the dirnames list in-place (perhaps
        using del or slice assignment), and walk() will
        only recurse into the subdirectories whose names remain in dirnames;
        this can be used to prune the search, impose a specific order of visiting,
        or even to inform walk() about directories the caller creates or renames before
        it resumes walk() again.
        Modifying dirnames when topdown is False has no effect. (see os.walk)

        Note that the "files" outputted will include anything that is not
        a directory, such as links.

        Parameters
        ----------
        path: str
            Root to recurse into
        maxdepth: int
            Maximum recursion depth. None means limitless, but not recommended
            on link-based file-systems.
        topdown: bool (True)
            Whether to walk the directory tree from the top downwards or from
            the bottom upwards.
        on_error: "omit", "raise", a callable
            if omit (default), path with exception will simply be empty;
            If raise, an underlying exception will be raised;
            if callable, it will be called with a single OSError instance as argument
        kwargs: passed to ``ls``
        """
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        path = self._strip_protocol(path)
        full_dirs = {}
        dirs = {}
        files = {}

        # detail controls whether names (False) or info dicts (True) are yielded
        detail = kwargs.pop("detail", False)
        try:
            listing = self.ls(path, detail=True, **kwargs)
        except (FileNotFoundError, OSError) as e:
            # honour on_error policy for unlistable paths
            if on_error == "raise":
                raise
            if callable(on_error):
                on_error(e)
            return

        for info in listing:
            # each info name must be at least [path]/part , but here
            # we check also for names like [path]/part/
            pathname = info["name"].rstrip("/")
            name = pathname.rsplit("/", 1)[-1]
            if info["type"] == "directory" and pathname != path:
                # do not include "self" path
                full_dirs[name] = pathname
                dirs[name] = info
            elif pathname == path:
                # file-like with same name as given path
                files[""] = info
            else:
                files[name] = info

        if not detail:
            # reduce to plain name lists when detail is not requested
            dirs = list(dirs)
            files = list(files)

        if topdown:
            # Yield before recursion if walking top down
            yield path, dirs, files

        if maxdepth is not None:
            # one level consumed by this call; stop recursing at the limit
            maxdepth -= 1
            if maxdepth < 1:
                if not topdown:
                    yield path, dirs, files
                return

        for d in dirs:
            yield from self.walk(
                full_dirs[d],
                maxdepth=maxdepth,
                detail=detail,
                topdown=topdown,
                **kwargs,
            )

        if not topdown:
            # Yield after recursion if walking bottom up
            yield path, dirs, files
|
| 477 |
+
|
| 478 |
+
def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
    """List all files below path.

    Like posix ``find`` command without conditions

    Parameters
    ----------
    path : str
    maxdepth: int or None
        If not None, the maximum number of levels to descend
    withdirs: bool
        Whether to include directory paths in the output. This is True
        when used by glob, but users usually only want files.
    detail: bool
        If True, return a ``{path: info_dict}`` mapping instead of a
        sorted list of paths.
    kwargs are passed to ``ls``.
    """
    # TODO: allow equivalent of -name parameter
    path = self._strip_protocol(path)
    # accumulate results keyed by full path so duplicates collapse
    out = {}

    # Add the root directory if withdirs is requested
    # This is needed for posix glob compliance
    if withdirs and path != "" and self.isdir(path):
        out[path] = self.info(path)

    for _, dirs, files in self.walk(path, maxdepth, detail=True, **kwargs):
        if withdirs:
            # fold directory entries into the same result mapping
            files.update(dirs)
        out.update({info["name"]: info for name, info in files.items()})
    if not out and self.isfile(path):
        # walk works on directories, but find should also return [path]
        # when path happens to be a file
        # NOTE(review): the info dict here is empty, unlike other entries
        out[path] = {}
    names = sorted(out)
    if not detail:
        return names
    else:
        return {name: out[name] for name in names}
|
| 515 |
+
|
| 516 |
+
def du(self, path, total=True, maxdepth=None, withdirs=False, **kwargs):
    """Space used by files and optionally directories within a path

    Directory size does not include the size of its contents.

    Parameters
    ----------
    path: str
    total: bool
        Whether to sum all the file sizes
    maxdepth: int or None
        Maximum number of directory levels to descend, None for unlimited.
    withdirs: bool
        Whether to include directory paths in the output.
    kwargs: passed to ``find``

    Returns
    -------
    Dict of {path: size} if total=False, or int otherwise, where numbers
    refer to bytes used.
    """
    usage = {}
    # seed with the top-level directory itself when directories are wanted
    if withdirs and self.isdir(path):
        top = self.info(path)
        usage[top["name"]] = top["size"]
    for entry in self.find(path, maxdepth=maxdepth, withdirs=withdirs, **kwargs):
        meta = self.info(entry)
        usage[meta["name"]] = meta["size"]
    return sum(usage.values()) if total else usage
|
| 549 |
+
|
| 550 |
+
def glob(self, path, maxdepth=None, **kwargs):
    """
    Find files by glob-matching.

    If the path ends with '/', only folders are returned.

    We support ``"**"``,
    ``"?"`` and ``"[..]"``. We do not support ^ for pattern negation.

    The `maxdepth` option is applied on the first `**` found in the path.

    kwargs are passed to ``ls``.
    """
    if maxdepth is not None and maxdepth < 1:
        raise ValueError("maxdepth must be at least 1")

    import re

    seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
    ends_with_sep = path.endswith(seps)  # _strip_protocol strips trailing slash
    path = self._strip_protocol(path)
    # a trailing "/" or "/**" means directory matches need a trailing slash
    # appended before testing against the compiled pattern
    append_slash_to_dirname = ends_with_sep or path.endswith(
        tuple(sep + "**" for sep in seps)
    )
    # index of the first occurrence of each magic character (len(path) if absent);
    # everything before the earliest one is a literal prefix
    idx_star = path.find("*") if path.find("*") >= 0 else len(path)
    idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
    idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

    min_idx = min(idx_star, idx_qmark, idx_brace)

    detail = kwargs.pop("detail", False)

    if not has_magic(path):
        # literal path: return it if it exists, otherwise an empty result
        if self.exists(path, **kwargs):
            if not detail:
                return [path]
            else:
                return {path: self.info(path, **kwargs)}
        else:
            if not detail:
                return []  # glob of non-existent returns empty
            else:
                return {}
    elif "/" in path[:min_idx]:
        # literal prefix contains directories: search from the deepest one
        min_idx = path[:min_idx].rindex("/")
        root = path[: min_idx + 1]
        depth = path[min_idx + 1 :].count("/") + 1
    else:
        root = ""
        depth = path[min_idx + 1 :].count("/") + 1

    if "**" in path:
        if maxdepth is not None:
            # apply maxdepth at the position of the first "**"
            idx_double_stars = path.find("**")
            depth_double_stars = path[idx_double_stars:].count("/") + 1
            depth = depth - depth_double_stars + maxdepth
        else:
            # unbounded recursive glob
            depth = None

    allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)

    pattern = glob_translate(path + ("/" if ends_with_sep else ""))
    pattern = re.compile(pattern)

    out = {
        p: info
        for p, info in sorted(allpaths.items())
        if pattern.match(
            p + "/"
            if append_slash_to_dirname and info["type"] == "directory"
            else p
        )
    }

    if detail:
        return out
    else:
        return list(out)
|
| 628 |
+
|
| 629 |
+
def exists(self, path, **kwargs):
    """Is there a file at the given path"""
    try:
        self.info(path, **kwargs)
        return True
    except:  # noqa: E722
        # any exception allowed bar FileNotFoundError?
        # NOTE(review): deliberately swallows *every* exception (including
        # auth/network errors), reporting them all as "does not exist"
        return False
|
| 637 |
+
|
| 638 |
+
def lexists(self, path, **kwargs):
    """If there is a file at the given path (including
    broken links)"""
    # The base implementation cannot distinguish a broken symlink from a
    # missing file, so it simply defers to ``exists``. Previously ``kwargs``
    # were accepted but silently dropped; forward them so backend-specific
    # listing options take effect here too.
    return self.exists(path, **kwargs)
|
| 642 |
+
|
| 643 |
+
def info(self, path, **kwargs):
    """Give details of entry at path

    Returns a single dictionary, with exactly the same information as ``ls``
    would with ``detail=True``.

    The default implementation calls ls and could be overridden by a
    shortcut. kwargs are passed on to ```ls()``.

    Some file systems might not be able to measure the file's size, in
    which case, the returned dict will include ``'size': None``.

    Returns
    -------
    dict with keys: name (full path in the FS), size (in bytes), type (file,
    directory, or something else) and other FS-specific keys.
    """
    path = self._strip_protocol(path)
    # first try: list the parent and look for this entry among its children
    out = self.ls(self._parent(path), detail=True, **kwargs)
    out = [o for o in out if o["name"].rstrip("/") == path]
    if out:
        return out[0]
    # second try: list the path itself (works when path is a directory)
    out = self.ls(path, detail=True, **kwargs)
    path = path.rstrip("/")
    out1 = [o for o in out if o["name"].rstrip("/") == path]
    if len(out1) == 1:
        # listing contained the entry itself: it is file-like
        if "size" not in out1[0]:
            out1[0]["size"] = None
        return out1[0]
    elif len(out1) > 1 or out:
        # listing succeeded but returned children: treat path as a directory
        return {"name": path, "size": 0, "type": "directory"}
    else:
        raise FileNotFoundError(path)
|
| 676 |
+
|
| 677 |
+
def checksum(self, path):
    """Unique value for current version of file

    If the checksum is the same from one moment to another, the contents
    are guaranteed to be the same. If the checksum changes, the contents
    *might* have changed.

    This should normally be overridden; default will probably capture
    creation/modification timestamp (which would be good) or maybe
    access timestamp (which would be bad)
    """
    # hash the whole info dict, so any metadata change alters the checksum
    return int(tokenize(self.info(path)), 16)
|
| 689 |
+
|
| 690 |
+
def size(self, path):
    """Size in bytes of file"""
    info = self.info(path)
    # backends that cannot measure size omit the key; report None then
    return info.get("size")
|
| 693 |
+
|
| 694 |
+
def sizes(self, paths):
    """Size in bytes of each file in a list of paths"""
    return list(map(self.size, paths))
|
| 697 |
+
|
| 698 |
+
def isdir(self, path):
    """Is this entry directory-like?"""
    try:
        info = self.info(path)
    except OSError:
        # missing entries (FileNotFoundError is an OSError) are not dirs
        return False
    return info["type"] == "directory"
|
| 704 |
+
|
| 705 |
+
def isfile(self, path):
    """Is this entry file-like?"""
    try:
        # NOTE: the bare except also guards the ["type"] subscript, so an
        # info dict missing "type" reads as "not a file" rather than raising
        return self.info(path)["type"] == "file"
    except:  # noqa: E722
        return False
|
| 711 |
+
|
| 712 |
+
def read_text(self, path, encoding=None, errors=None, newline=None, **kwargs):
    """Get the contents of the file as a string.

    Parameters
    ----------
    path: str
        URL of file on this filesystems
    encoding, errors, newline: same as `open`.
    """
    text_opts = dict(encoding=encoding, errors=errors, newline=newline)
    with self.open(path, mode="r", **text_opts, **kwargs) as stream:
        return stream.read()
|
| 730 |
+
|
| 731 |
+
def write_text(
    self, path, value, encoding=None, errors=None, newline=None, **kwargs
):
    """Write the text to the given file.

    An existing file will be overwritten.

    Parameters
    ----------
    path: str
        URL of file on this filesystems
    value: str
        Text to write.
    encoding, errors, newline: same as `open`.
    """
    text_opts = dict(encoding=encoding, errors=errors, newline=newline)
    with self.open(path, mode="w", **text_opts, **kwargs) as stream:
        # return number of characters written, as io.TextIOBase.write does
        return stream.write(value)
|
| 755 |
+
|
| 756 |
+
def cat_file(self, path, start=None, end=None, **kwargs):
    """Get the content of a file

    Parameters
    ----------
    path: URL of file on this filesystems
    start, end: int
        Bytes limits of the read. If negative, backwards from end,
        like usual python slices. Either can be None for start or
        end of file, respectively
    kwargs: passed to ``open()``.
    """
    # explicitly set buffering off?
    with self.open(path, "rb", **kwargs) as f:
        if start is not None:
            if start >= 0:
                f.seek(start)
            else:
                # negative start counts back from EOF, clamped at 0
                f.seek(max(0, f.size + start))
        if end is not None:
            if end < 0:
                # negative end also counts back from EOF
                end = f.size + end
            # read from current position up to the resolved end offset
            return f.read(end - f.tell())
        return f.read()
|
| 780 |
+
|
| 781 |
+
def pipe_file(self, path, value, mode="overwrite", **kwargs):
    """Set the bytes of given file"""
    # "create" refuses to clobber an existing file; this check-then-write
    # is not atomic, but "xb" open mode is less widely supported
    if mode == "create" and self.exists(path):
        raise FileExistsError
    with self.open(path, "wb", **kwargs) as stream:
        stream.write(value)
|
| 789 |
+
|
| 790 |
+
def pipe(self, path, value=None, **kwargs):
    """Put value into path

    (counterpart to ``cat``)

    Parameters
    ----------
    path: string or dict(str, bytes)
        If a string, a single remote location to put ``value`` bytes; if a dict,
        a mapping of {path: bytesvalue}.
    value: bytes, optional
        If using a single path, these are the bytes to put there. Ignored if
        ``path`` is a dict
    """
    # normalize both call styles to a {path: bytes} mapping
    if isinstance(path, str):
        targets = {path: value}
    elif isinstance(path, dict):
        targets = path
    else:
        raise ValueError("path must be str or dict")
    for key, data in targets.items():
        self.pipe_file(self._strip_protocol(key), data, **kwargs)
|
| 811 |
+
|
| 812 |
+
def cat_ranges(
    self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
):
    """Get the contents of byte ranges from one or more files

    Parameters
    ----------
    paths: list
        A list of of filepaths on this filesystems
    starts, ends: int or list
        Bytes limits of the read. If using a single int, the same value will be
        used to read all the specified files.
    """
    if max_gap is not None:
        raise NotImplementedError
    if not isinstance(paths, list):
        raise TypeError
    n = len(paths)
    # broadcast scalar limits over every path
    if not isinstance(starts, list):
        starts = [starts] * n
    if not isinstance(ends, list):
        ends = [ends] * n
    if len(starts) != n or len(ends) != n:
        raise ValueError
    results = []
    for target, begin, stop in zip(paths, starts, ends):
        try:
            results.append(self.cat_file(target, begin, stop))
        except Exception as exc:
            if on_error != "return":
                raise
            # "return" policy: record the exception in place of the bytes
            results.append(exc)
    return results
|
| 845 |
+
|
| 846 |
+
def cat(self, path, recursive=False, on_error="raise", **kwargs):
    """Fetch (potentially multiple) paths' contents

    Parameters
    ----------
    recursive: bool
        If True, assume the path(s) are directories, and get all the
        contained files
    on_error : "raise", "omit", "return"
        If raise, an underlying exception will be raised (converted to KeyError
        if the type is in self.missing_exceptions); if omit, keys with exception
        will simply not be included in the output; if "return", all keys are
        included in the output, but the value will be bytes or an exception
        instance.
    kwargs: passed to cat_file

    Returns
    -------
    dict of {path: contents} if there are multiple paths
    or the path has been otherwise expanded
    """
    paths = self.expand_path(path, recursive=recursive)
    if (
        len(paths) > 1
        or isinstance(path, list)
        or paths[0] != self._strip_protocol(path)
    ):
        # multiple results (or caller passed a list/glob): return a mapping
        out = {}
        for path in paths:
            try:
                out[path] = self.cat_file(path, **kwargs)
            except Exception as e:
                if on_error == "raise":
                    raise
                if on_error == "return":
                    out[path] = e
                # "omit": fall through, key is simply not included
        return out
    else:
        # single literal path: return the bytes directly, not a dict
        return self.cat_file(paths[0], **kwargs)
|
| 885 |
+
|
| 886 |
+
def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, outfile=None, **kwargs):
    """Copy single remote file to local"""
    from .implementations.local import LocalFileSystem

    if isfilelike(lpath):
        # caller handed us an open file object; write into it directly
        outfile = lpath
    elif self.isdir(rpath):
        # remote "file" is actually a directory: just mirror the directory
        os.makedirs(lpath, exist_ok=True)
        return None

    fs = LocalFileSystem(auto_mkdir=True)
    fs.makedirs(fs._parent(lpath), exist_ok=True)

    with self.open(rpath, "rb", **kwargs) as f1:
        if outfile is None:
            outfile = open(lpath, "wb")

        try:
            callback.set_size(getattr(f1, "size", None))
            data = True
            while data:
                data = f1.read(self.blocksize)
                segment_len = outfile.write(data)
                if segment_len is None:
                    # some file-likes return None from write(); assume all written
                    segment_len = len(data)
                callback.relative_update(segment_len)
        finally:
            # only close what we opened; a caller-supplied file object
            # remains the caller's responsibility
            if not isfilelike(lpath):
                outfile.close()
|
| 915 |
+
|
| 916 |
+
def get(
    self,
    rpath,
    lpath,
    recursive=False,
    callback=DEFAULT_CALLBACK,
    maxdepth=None,
    **kwargs,
):
    """Copy file(s) to local.

    Copies a specific file or tree of files (if recursive=True). If lpath
    ends with a "/", it will be assumed to be a directory, and target files
    will go within. Can submit a list of paths, which may be glob-patterns
    and will be expanded.

    Calls get_file for each source.
    """
    if isinstance(lpath, list) and isinstance(rpath, list):
        # No need to expand paths when both source and destination
        # are provided as lists
        rpaths = rpath
        lpaths = lpath
    else:
        from .implementations.local import (
            LocalFileSystem,
            make_path_posix,
            trailing_sep,
        )

        source_is_str = isinstance(rpath, str)
        rpaths = self.expand_path(rpath, recursive=recursive, maxdepth=maxdepth)
        if source_is_str and (not recursive or maxdepth is not None):
            # Non-recursive glob does not copy directories
            rpaths = [p for p in rpaths if not (trailing_sep(p) or self.isdir(p))]
            if not rpaths:
                return

        if isinstance(lpath, str):
            lpath = make_path_posix(lpath)

        source_is_file = len(rpaths) == 1
        dest_is_dir = isinstance(lpath, str) and (
            trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
        )

        # `exists` tells other_paths whether the source root itself should
        # be reproduced as a directory under the destination
        exists = source_is_str and (
            (has_magic(rpath) and source_is_file)
            or (not has_magic(rpath) and dest_is_dir and not trailing_sep(rpath))
        )
        lpaths = other_paths(
            rpaths,
            lpath,
            exists=exists,
            flatten=not source_is_str,
        )

    callback.set_size(len(lpaths))
    for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
        # branched() yields a child callback scoped to this single transfer
        with callback.branched(rpath, lpath) as child:
            self.get_file(rpath, lpath, callback=child, **kwargs)
|
| 977 |
+
|
| 978 |
+
def put_file(
    self, lpath, rpath, callback=DEFAULT_CALLBACK, mode="overwrite", **kwargs
):
    """Copy single file to remote"""
    # "create" mode refuses to clobber; check-then-write, so not atomic
    if mode == "create" and self.exists(rpath):
        raise FileExistsError
    if os.path.isdir(lpath):
        # local "file" is a directory: mirror the directory remotely
        self.makedirs(rpath, exist_ok=True)
        return None

    with open(lpath, "rb") as f1:
        # seek to the end to learn the total size for progress reporting
        size = f1.seek(0, 2)
        callback.set_size(size)
        f1.seek(0)

        self.mkdirs(self._parent(os.fspath(rpath)), exist_ok=True)
        with self.open(rpath, "wb", **kwargs) as f2:
            while f1.tell() < size:
                data = f1.read(self.blocksize)
                segment_len = f2.write(data)
                if segment_len is None:
                    # some file-likes return None from write(); assume all written
                    segment_len = len(data)
                callback.relative_update(segment_len)
|
| 1001 |
+
|
| 1002 |
+
def put(
    self,
    lpath,
    rpath,
    recursive=False,
    callback=DEFAULT_CALLBACK,
    maxdepth=None,
    **kwargs,
):
    """Copy file(s) from local.

    Copies a specific file or tree of files (if recursive=True). If rpath
    ends with a "/", it will be assumed to be a directory, and target files
    will go within.

    Calls put_file for each source.
    """
    if isinstance(lpath, list) and isinstance(rpath, list):
        # No need to expand paths when both source and destination
        # are provided as lists
        rpaths = rpath
        lpaths = lpath
    else:
        from .implementations.local import (
            LocalFileSystem,
            make_path_posix,
            trailing_sep,
        )

        source_is_str = isinstance(lpath, str)
        if source_is_str:
            lpath = make_path_posix(lpath)
        fs = LocalFileSystem()
        lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
        if source_is_str and (not recursive or maxdepth is not None):
            # Non-recursive glob does not copy directories
            lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
            if not lpaths:
                return

        source_is_file = len(lpaths) == 1
        dest_is_dir = isinstance(rpath, str) and (
            trailing_sep(rpath) or self.isdir(rpath)
        )

        rpath = (
            self._strip_protocol(rpath)
            if isinstance(rpath, str)
            else [self._strip_protocol(p) for p in rpath]
        )
        # `exists` tells other_paths whether the source root itself should
        # be reproduced as a directory under the destination
        exists = source_is_str and (
            (has_magic(lpath) and source_is_file)
            or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
        )
        rpaths = other_paths(
            lpaths,
            rpath,
            exists=exists,
            flatten=not source_is_str,
        )

    callback.set_size(len(rpaths))
    for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
        # branched() yields a child callback scoped to this single transfer
        with callback.branched(lpath, rpath) as child:
            self.put_file(lpath, rpath, callback=child, **kwargs)
|
| 1067 |
+
|
| 1068 |
+
def head(self, path, size=1024):
    """Get the first ``size`` bytes from file"""
    with self.open(path, "rb") as stream:
        data = stream.read(size)
    return data
|
| 1072 |
+
|
| 1073 |
+
def tail(self, path, size=1024):
    """Get the last ``size`` bytes from file"""
    with self.open(path, "rb") as stream:
        # seek back from EOF, but never before the start of the file
        offset = max(-size, -stream.size)
        stream.seek(offset, 2)
        return stream.read()
|
| 1078 |
+
|
| 1079 |
+
def cp_file(self, path1, path2, **kwargs):
    # Single-file copy primitive; concrete backends must override.
    # ``copy`` and ``mv`` call this once per expanded source/target pair.
    raise NotImplementedError
|
| 1081 |
+
|
| 1082 |
+
def copy(
    self, path1, path2, recursive=False, maxdepth=None, on_error=None, **kwargs
):
    """Copy within two locations in the filesystem

    on_error : "raise", "ignore"
        If raise, any not-found exceptions will be raised; if ignore any
        not-found exceptions will cause the path to be skipped; defaults to
        raise unless recursive is true, where the default is ignore
    """
    if on_error is None and recursive:
        on_error = "ignore"
    elif on_error is None:
        on_error = "raise"

    if isinstance(path1, list) and isinstance(path2, list):
        # No need to expand paths when both source and destination
        # are provided as lists
        paths1 = path1
        paths2 = path2
    else:
        from .implementations.local import trailing_sep

        source_is_str = isinstance(path1, str)
        paths1 = self.expand_path(path1, recursive=recursive, maxdepth=maxdepth)
        if source_is_str and (not recursive or maxdepth is not None):
            # Non-recursive glob does not copy directories
            paths1 = [p for p in paths1 if not (trailing_sep(p) or self.isdir(p))]
            if not paths1:
                return

        source_is_file = len(paths1) == 1
        dest_is_dir = isinstance(path2, str) and (
            trailing_sep(path2) or self.isdir(path2)
        )

        # `exists` tells other_paths whether the source root itself should
        # be reproduced as a directory under the destination
        exists = source_is_str and (
            (has_magic(path1) and source_is_file)
            or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
        )
        paths2 = other_paths(
            paths1,
            path2,
            exists=exists,
            flatten=not source_is_str,
        )

    for p1, p2 in zip(paths1, paths2):
        try:
            self.cp_file(p1, p2, **kwargs)
        except FileNotFoundError:
            # "ignore": skip sources that vanished between listing and copy
            if on_error == "raise":
                raise
|
| 1135 |
+
|
| 1136 |
+
def expand_path(self, path, recursive=False, maxdepth=None, **kwargs):
    """Turn one or more globs or directories into a list of all matching paths
    to files or directories.

    kwargs are passed to ``glob`` or ``find``, which may in turn call ``ls``
    """

    if maxdepth is not None and maxdepth < 1:
        raise ValueError("maxdepth must be at least 1")

    if isinstance(path, (str, os.PathLike)):
        # normalize: handle the single-path case via the list case
        out = self.expand_path([path], recursive, maxdepth)
    else:
        out = set()
        path = [self._strip_protocol(p) for p in path]
        for p in path:
            if has_magic(p):
                bit = set(self.glob(p, maxdepth=maxdepth, **kwargs))
                out |= bit
                if recursive:
                    # glob call above expanded one depth so if maxdepth is defined
                    # then decrement it in expand_path call below. If it is zero
                    # after decrementing then avoid expand_path call.
                    if maxdepth is not None and maxdepth <= 1:
                        continue
                    out |= set(
                        self.expand_path(
                            list(bit),
                            recursive=recursive,
                            maxdepth=maxdepth - 1 if maxdepth is not None else None,
                            **kwargs,
                        )
                    )
                continue
            elif recursive:
                # non-glob path: recurse with find (dirs included)
                rec = set(
                    self.find(
                        p, maxdepth=maxdepth, withdirs=True, detail=False, **kwargs
                    )
                )
                out |= rec
            if p not in out and (recursive is False or self.exists(p)):
                # should only check once, for the root
                out.add(p)
    if not out:
        raise FileNotFoundError(path)
    return sorted(out)
|
| 1183 |
+
|
| 1184 |
+
def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
    """Move file(s) from one location to another.

    Implemented as copy-then-delete; if the copy fails, the source is
    left in place.
    """
    if path1 == path2:
        # moving onto itself would delete the data after the no-op copy
        logger.debug("%s mv: The paths are the same, so no files were moved.", self)
    else:
        # explicitly raise exception to prevent data corruption.
        # BUGFIX: the keyword is ``on_error`` — the previous misspelling
        # ``onerror`` fell into **kwargs and was forwarded to cp_file,
        # so the intended "raise" policy never reached copy().
        self.copy(
            path1, path2, recursive=recursive, maxdepth=maxdepth, on_error="raise"
        )
        self.rm(path1, recursive=recursive)
|
| 1194 |
+
|
| 1195 |
+
def rm_file(self, path):
    """Delete a file"""
    # delegate to the legacy method name, so subclasses that still
    # override ``_rm`` keep working
    self._rm(path)
|
| 1198 |
+
|
| 1199 |
+
def _rm(self, path):
    """Delete one file"""
    # this is the old name for the method, prefer rm_file
    raise NotImplementedError
|
| 1203 |
+
|
| 1204 |
+
def rm(self, path, recursive=False, maxdepth=None):
    """Delete files.

    Parameters
    ----------
    path: str or list of str
        File(s) to delete.
    recursive: bool
        If file(s) are directories, recursively delete contents and then
        also remove the directory
    maxdepth: int or None
        Depth to pass to walk for finding files to delete, if recursive.
        If None, there will be no limit and infinite recursion may be
        possible.
    """
    targets = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
    # expand_path returns sorted order; delete deepest entries first so
    # directories are emptied before they themselves are removed
    for target in reversed(targets):
        self.rm_file(target)
|
| 1222 |
+
|
| 1223 |
+
@classmethod
def _parent(cls, path):
    """Return the parent path of ``path`` (protocol stripped),
    anchored at ``cls.root_marker``."""
    path = cls._strip_protocol(path)
    if "/" in path:
        # drop the last component, then re-anchor at the root marker
        parent = path.rsplit("/", 1)[0].lstrip(cls.root_marker)
        return cls.root_marker + parent
    else:
        # no separator at all: parent is the filesystem root
        return cls.root_marker
|
| 1231 |
+
|
| 1232 |
+
def _open(
    self,
    path,
    mode="rb",
    block_size=None,
    autocommit=True,
    cache_options=None,
    **kwargs,
):
    """Return raw bytes-mode file-like from the file-system

    Default implementation wraps the path in an ``AbstractBufferedFile``;
    backends normally override this with their own file class.
    """
    return AbstractBufferedFile(
        self,
        path,
        mode,
        block_size,
        autocommit,
        cache_options=cache_options,
        **kwargs,
    )
|
| 1251 |
+
|
| 1252 |
+
def open(
    self,
    path,
    mode="rb",
    block_size=None,
    cache_options=None,
    compression=None,
    **kwargs,
):
    """
    Return a file-like object from the filesystem

    The resultant instance must function correctly in a context ``with``
    block.

    Parameters
    ----------
    path: str
        Target file
    mode: str like 'rb', 'w'
        See builtin ``open()``
        Mode "x" (exclusive write) may be implemented by the backend. Even if
        it is, whether it is checked up front or on commit, and whether it is
        atomic is implementation-dependent.
    block_size: int
        Some indication of buffering - this is a value in bytes
    cache_options : dict, optional
        Extra arguments to pass through to the cache.
    compression: string or None
        If given, open file using compression codec. Can either be a compression
        name (a key in ``fsspec.compression.compr``) or "infer" to guess the
        compression from the filename suffix.
    encoding, errors, newline: passed on to TextIOWrapper for text mode
    """
    import io

    path = self._strip_protocol(path)
    if "b" not in mode:
        # text mode: open the binary stream recursively, then wrap it
        mode = mode.replace("t", "") + "b"

        text_kwargs = {
            k: kwargs.pop(k)
            for k in ["encoding", "errors", "newline"]
            if k in kwargs
        }
        return io.TextIOWrapper(
            self.open(
                path,
                mode,
                block_size=block_size,
                cache_options=cache_options,
                compression=compression,
                **kwargs,
            ),
            **text_kwargs,
        )
    else:
        # inside a transaction, files default to deferred (non-auto) commit
        ac = kwargs.pop("autocommit", not self._intrans)
        f = self._open(
            path,
            mode=mode,
            block_size=block_size,
            autocommit=ac,
            cache_options=cache_options,
            **kwargs,
        )
        if compression is not None:
            from fsspec.compression import compr
            from fsspec.core import get_compression

            # "infer" resolves the codec from the filename suffix
            compression = get_compression(path, compression)
            compress = compr[compression]
            f = compress(f, mode=mode[0])

        if not ac and "r" not in mode:
            # register write handles with the transaction for later commit
            self.transaction.files.append(f)
        return f
|
| 1329 |
+
|
| 1330 |
+
def touch(self, path, truncate=True, **kwargs):
    """Create empty file, or update timestamp

    Parameters
    ----------
    path: str
        file location
    truncate: bool
        If True, always set file size to 0; if False, update timestamp and
        leave file unchanged, if backend allows this
    """
    if not truncate and self.exists(path):
        # A timestamp-only update is backend-specific and not provided here.
        raise NotImplementedError
    # Opening for write and immediately closing creates/empties the file.
    with self.open(path, "wb", **kwargs):
        pass
|
| 1346 |
+
|
| 1347 |
+
def ukey(self, path):
    """Hash of file properties, to tell if it has changed"""
    # Any change in the info dict (size, mtime, etag, ...) changes the digest.
    serialized = str(self.info(path)).encode()
    return sha256(serialized).hexdigest()
|
| 1350 |
+
|
| 1351 |
+
def read_block(self, fn, offset, length, delimiter=None):
    """Read a block of bytes from

    Starting at ``offset`` of the file, read ``length`` bytes. If
    ``delimiter`` is set then we ensure that the read starts and stops at
    delimiter boundaries that follow the locations ``offset`` and ``offset
    + length``. If ``offset`` is zero then we start at zero. The
    bytestring returned WILL include the end delimiter string.

    If offset+length is beyond the eof, reads to eof.

    Parameters
    ----------
    fn: string
        Path to filename
    offset: int
        Byte offset to start read
    length: int
        Number of bytes to read. If None, read to end.
    delimiter: bytes (optional)
        Ensure reading starts and stops at delimiter bytestring

    Examples
    --------
    >>> fs.read_block('data/file.csv', 0, 13)  # doctest: +SKIP
    b'Alice, 100\\nBo'
    >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n')  # doctest: +SKIP
    b'Alice, 100\\nBob, 200\\n'

    Use ``length=None`` to read to the end of the file.
    >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n')  # doctest: +SKIP
    b'Alice, 100\\nBob, 200\\nCharlie, 300'

    See Also
    --------
    :func:`fsspec.utils.read_block`
    """
    with self.open(fn, "rb") as f:
        size = f.size
        if length is None:
            # None means "to the end"; substitute the file size.
            length = size
        if size is not None and offset + length > size:
            # Clamp the request so we never read past EOF.
            length = size - offset
        # Delegates to the module-level fsspec.utils.read_block helper,
        # which handles the delimiter-boundary logic.
        return read_block(f, offset, length, delimiter)
|
| 1395 |
+
|
| 1396 |
+
def to_json(self, *, include_password: bool = True) -> str:
    """
    JSON representation of this filesystem instance.

    Parameters
    ----------
    include_password: bool, default True
        Whether to include the password (if any) in the output.

    Returns
    -------
    JSON string with keys ``cls`` (the python location of this class),
    protocol (text name of this class's protocol, first one in case of
    multiple), ``args`` (positional args, usually empty), and all other
    keyword arguments as their own keys.

    Warnings
    --------
    Serialized filesystems may contain sensitive information which have been
    passed to the constructor, such as passwords and tokens. Make sure you
    store and send them in a secure environment!
    """
    from .json import FilesystemJSONEncoder

    # Build a throwaway encoder subclass so the ``include_password`` flag
    # reaches the encoder as a class attribute, since json.dumps only
    # accepts an encoder *class*, not an instance.
    return json.dumps(
        self,
        cls=type(
            "_FilesystemJSONEncoder",
            (FilesystemJSONEncoder,),
            {"include_password": include_password},
        ),
    )
|
| 1428 |
+
|
| 1429 |
+
@staticmethod
def from_json(blob: str) -> AbstractFileSystem:
    """
    Recreate a filesystem instance from JSON representation.

    See ``.to_json()`` for the expected structure of the input.

    Parameters
    ----------
    blob: str
        JSON string as produced by ``.to_json()``.

    Returns
    -------
    file system instance, not necessarily of this particular class.

    Warnings
    --------
    This can import arbitrary modules (as determined by the ``cls`` key).
    Make sure you haven't installed any modules that may execute malicious code
    at import time.
    """
    from .json import FilesystemJSONDecoder

    # The decoder resolves the ``cls`` key and instantiates the filesystem.
    return json.loads(blob, cls=FilesystemJSONDecoder)
|
| 1453 |
+
|
| 1454 |
+
def to_dict(self, *, include_password: bool = True) -> dict[str, Any]:
    """
    JSON-serializable dictionary representation of this filesystem instance.

    Parameters
    ----------
    include_password: bool, default True
        Whether to include the password (if any) in the output.

    Returns
    -------
    Dictionary with keys ``cls`` (the python location of this class),
    protocol (text name of this class's protocol, first one in case of
    multiple), ``args`` (positional args, usually empty), and all other
    keyword arguments as their own keys.

    Warnings
    --------
    Serialized filesystems may contain sensitive information which have been
    passed to the constructor, such as passwords and tokens. Make sure you
    store and send them in a secure environment!
    """
    from .json import FilesystemJSONEncoder

    json_encoder = FilesystemJSONEncoder()

    cls = type(self)
    proto = self.protocol

    # Copy before mutating so the live instance's options stay untouched.
    storage_options = dict(self.storage_options)
    if not include_password:
        storage_options.pop("password", None)

    return dict(
        cls=f"{cls.__module__}:{cls.__name__}",
        # protocol may be a single string or a tuple/list of aliases;
        # serialize only the first one.
        protocol=proto[0] if isinstance(proto, (tuple, list)) else proto,
        args=json_encoder.make_serializable(self.storage_args),
        **json_encoder.make_serializable(storage_options),
    )
|
| 1493 |
+
|
| 1494 |
+
@staticmethod
def from_dict(dct: dict[str, Any]) -> AbstractFileSystem:
    """
    Recreate a filesystem instance from dictionary representation.

    See ``.to_dict()`` for the expected structure of the input.

    Parameters
    ----------
    dct: Dict[str, Any]
        Dictionary as produced by ``.to_dict()``.

    Returns
    -------
    file system instance, not necessarily of this particular class.

    Warnings
    --------
    This can import arbitrary modules (as determined by the ``cls`` key).
    Make sure you haven't installed any modules that may execute malicious code
    at import time.
    """
    from .json import FilesystemJSONDecoder

    json_decoder = FilesystemJSONDecoder()

    dct = dict(dct)  # Defensive copy

    cls = FilesystemJSONDecoder.try_resolve_fs_cls(dct)
    if cls is None:
        raise ValueError("Not a serialized AbstractFileSystem")

    # These keys identify the class; they are not constructor arguments.
    dct.pop("cls", None)
    dct.pop("protocol", None)

    return cls(
        *json_decoder.unmake_serializable(dct.pop("args", ())),
        **json_decoder.unmake_serializable(dct),
    )
|
| 1532 |
+
|
| 1533 |
+
def _get_pyarrow_filesystem(self):
    """
    Make a version of the FS instance which will be acceptable to pyarrow
    """
    # all instances already also derive from pyarrow
    return self
|
| 1539 |
+
|
| 1540 |
+
def get_mapper(self, root="", check=False, create=False, missing_exceptions=None):
    """Create key/value store based on this file-system

    Makes a MutableMapping interface to the FS at the given root path.
    See ``fsspec.mapping.FSMap`` for further details.
    """
    from .mapping import FSMap

    # Collect the pass-through options in one place for readability.
    mapper_options = {
        "check": check,
        "create": create,
        "missing_exceptions": missing_exceptions,
    }
    return FSMap(root, self, **mapper_options)
|
| 1555 |
+
|
| 1556 |
+
@classmethod
def clear_instance_cache(cls):
    """
    Clear the cache of filesystem instances.

    Notes
    -----
    Unless overridden by setting the ``cachable`` class attribute to False,
    the filesystem class stores a reference to newly created instances. This
    prevents Python's normal rules around garbage collection from working,
    since the instances refcount will not drop to zero until
    ``clear_instance_cache`` is called.
    """
    # NOTE(review): if ``_cache`` is defined on a base class and shared,
    # this clears cached instances for every subclass too — confirm.
    cls._cache.clear()
|
| 1570 |
+
|
| 1571 |
+
def created(self, path):
    """Return the created timestamp of a file as a datetime.datetime"""
    # Stub: backends that track creation time override this.
    raise NotImplementedError
|
| 1574 |
+
|
| 1575 |
+
def modified(self, path):
    """Return the modified timestamp of a file as a datetime.datetime"""
    # Stub: backends that track modification time override this.
    raise NotImplementedError
|
| 1578 |
+
|
| 1579 |
+
def tree(
    self,
    path: str = "/",
    recursion_limit: int = 2,
    max_display: int = 25,
    display_size: bool = False,
    prefix: str = "",
    is_last: bool = True,
    first: bool = True,
    indent_size: int = 4,
) -> str:
    """
    Return a tree-like structure of the filesystem starting from the given path as a string.

    Parameters
    ----------
    path: Root path to start traversal from
    recursion_limit: Maximum depth of directory traversal
    max_display: Maximum number of items to display per directory
    display_size: Whether to display file sizes
    prefix: Current line prefix for visual tree structure
    is_last: Whether current item is last in its level
    first: Whether this is the first call (displays root path)
    indent_size: Number of spaces by indent

    Returns
    -------
    str: A string representing the tree structure.

    Example
    -------
    >>> from fsspec import filesystem

    >>> fs = filesystem('ftp', host='test.rebex.net', user='demo', password='password')
    >>> tree = fs.tree(display_size=True, recursion_limit=3, indent_size=8, max_display=10)
    >>> print(tree)
    """

    def format_bytes(n: int) -> str:
        """Format bytes as text."""
        # 0.9*k threshold switches units slightly before the exact boundary,
        # e.g. 950 kB displays as "0.93 Mb".
        for prefix, k in (
            ("P", 2**50),
            ("T", 2**40),
            ("G", 2**30),
            ("M", 2**20),
            ("k", 2**10),
        ):
            if n >= 0.9 * k:
                return f"{n / k:.2f} {prefix}b"
        return f"{n}B"

    result = []

    if first:
        # Only the outermost call prints the root path itself.
        result.append(path)

    if recursion_limit:
        indent = " " * indent_size
        contents = self.ls(path, detail=True)
        # Directories first, then alphabetical by name.
        contents.sort(
            key=lambda x: (x.get("type") != "directory", x.get("name", ""))
        )

        if max_display is not None and len(contents) > max_display:
            displayed_contents = contents[:max_display]
            remaining_count = len(contents) - max_display
        else:
            displayed_contents = contents
            remaining_count = 0

        for i, item in enumerate(displayed_contents):
            # The "last" branch glyph is only used when there is no
            # truncation message following the final displayed item.
            is_last_item = (i == len(displayed_contents) - 1) and (
                remaining_count == 0
            )

            branch = (
                "└" + ("─" * (indent_size - 2))
                if is_last_item
                else "├" + ("─" * (indent_size - 2))
            )
            branch += " "
            # Children of a non-last directory keep a vertical rule ("│")
            # in their prefix so the tree lines stay connected.
            new_prefix = prefix + (
                indent if is_last_item else "│" + " " * (indent_size - 1)
            )

            name = os.path.basename(item.get("name", ""))

            if display_size and item.get("type") == "directory":
                # One extra listing per directory to count its children.
                sub_contents = self.ls(item.get("name", ""), detail=True)
                num_files = sum(
                    1 for sub_item in sub_contents if sub_item.get("type") == "file"
                )
                num_folders = sum(
                    1
                    for sub_item in sub_contents
                    if sub_item.get("type") == "directory"
                )

                if num_files == 0 and num_folders == 0:
                    size = " (empty folder)"
                elif num_files == 0:
                    size = f" ({num_folders} subfolder{'s' if num_folders > 1 else ''})"
                elif num_folders == 0:
                    size = f" ({num_files} file{'s' if num_files > 1 else ''})"
                else:
                    size = f" ({num_files} file{'s' if num_files > 1 else ''}, {num_folders} subfolder{'s' if num_folders > 1 else ''})"
            elif display_size and item.get("type") == "file":
                size = f" ({format_bytes(item.get('size', 0))})"
            else:
                size = ""

            result.append(f"{prefix}{branch}{name}{size}")

            if item.get("type") == "directory" and recursion_limit > 0:
                # Recurse with a decremented limit; the child call returns
                # its own joined sub-tree string.
                result.append(
                    self.tree(
                        path=item.get("name", ""),
                        recursion_limit=recursion_limit - 1,
                        max_display=max_display,
                        display_size=display_size,
                        prefix=new_prefix,
                        is_last=is_last_item,
                        first=False,
                        indent_size=indent_size,
                    )
                )

        if remaining_count > 0:
            more_message = f"{remaining_count} more item(s) not displayed."
            result.append(
                f"{prefix}{'└' + ('─' * (indent_size - 2))} {more_message}"
            )

    # Drop empty strings (e.g. from recursion bottoming out) before joining.
    return "\n".join(_ for _ in result if _)
|
| 1713 |
+
|
| 1714 |
+
# ------------------------------------------------------------------------
|
| 1715 |
+
# Aliases
|
| 1716 |
+
|
| 1717 |
+
def read_bytes(self, path, start=None, end=None, **kwargs):
    """Alias of `AbstractFileSystem.cat_file`."""
    # Thin delegation; see cat_file for parameter semantics.
    return self.cat_file(path, start=start, end=end, **kwargs)
|
| 1720 |
+
|
| 1721 |
+
def write_bytes(self, path, value, **kwargs):
    """Alias of `AbstractFileSystem.pipe_file`."""
    # Note: returns None, matching pipe_file's side-effect-only contract.
    self.pipe_file(path, value, **kwargs)
|
| 1724 |
+
|
| 1725 |
+
def makedir(self, path, create_parents=True, **kwargs):
    """Alias of `AbstractFileSystem.mkdir`."""
    return self.mkdir(path, create_parents=create_parents, **kwargs)
|
| 1728 |
+
|
| 1729 |
+
def mkdirs(self, path, exist_ok=False):
    """Alias of `AbstractFileSystem.makedirs`."""
    return self.makedirs(path, exist_ok=exist_ok)
|
| 1732 |
+
|
| 1733 |
+
def listdir(self, path, detail=True, **kwargs):
    """Alias of `AbstractFileSystem.ls`."""
    return self.ls(path, detail=detail, **kwargs)
|
| 1736 |
+
|
| 1737 |
+
def cp(self, path1, path2, **kwargs):
    """Alias of `AbstractFileSystem.copy`."""
    return self.copy(path1, path2, **kwargs)
|
| 1740 |
+
|
| 1741 |
+
def move(self, path1, path2, **kwargs):
    """Alias of `AbstractFileSystem.mv`."""
    return self.mv(path1, path2, **kwargs)
|
| 1744 |
+
|
| 1745 |
+
def stat(self, path, **kwargs):
    """Alias of `AbstractFileSystem.info`."""
    return self.info(path, **kwargs)
|
| 1748 |
+
|
| 1749 |
+
def disk_usage(self, path, total=True, maxdepth=None, **kwargs):
    """Alias of `AbstractFileSystem.du`."""
    return self.du(path, total=total, maxdepth=maxdepth, **kwargs)
|
| 1752 |
+
|
| 1753 |
+
def rename(self, path1, path2, **kwargs):
    """Alias of `AbstractFileSystem.mv`."""
    return self.mv(path1, path2, **kwargs)
|
| 1756 |
+
|
| 1757 |
+
def delete(self, path, recursive=False, maxdepth=None):
    """Alias of `AbstractFileSystem.rm`."""
    return self.rm(path, recursive=recursive, maxdepth=maxdepth)
|
| 1760 |
+
|
| 1761 |
+
def upload(self, lpath, rpath, recursive=False, **kwargs):
    """Alias of `AbstractFileSystem.put`."""
    # local -> remote direction.
    return self.put(lpath, rpath, recursive=recursive, **kwargs)
|
| 1764 |
+
|
| 1765 |
+
def download(self, rpath, lpath, recursive=False, **kwargs):
    """Alias of `AbstractFileSystem.get`."""
    # remote -> local direction.
    return self.get(rpath, lpath, recursive=recursive, **kwargs)
|
| 1768 |
+
|
| 1769 |
+
def sign(self, path, expiration=100, **kwargs):
    """Create a signed URL representing the given path

    Some implementations allow temporary URLs to be generated, as a
    way of delegating credentials.

    Parameters
    ----------
    path : str
        The path on the filesystem
    expiration : int
        Number of seconds to enable the URL for (if supported)

    Returns
    -------
    URL : str
        The signed URL

    Raises
    ------
    NotImplementedError : if method is not implemented for a filesystem
    """
    # Default stub; backends with temporary-URL support override this.
    raise NotImplementedError("Sign is not implemented for this filesystem")
|
| 1792 |
+
|
| 1793 |
+
def _isfilestore(self):
    """Whether this instance is a local file store (legacy pyarrow hook)."""
    # Originally inherited from pyarrow DaskFileSystem. Keeping this
    # here for backwards compatibility as long as pyarrow uses its
    # legacy fsspec-compatible filesystems and thus accepts fsspec
    # filesystems as well
    return False
|
| 1799 |
+
|
| 1800 |
+
|
| 1801 |
+
class AbstractBufferedFile(io.IOBase):
|
| 1802 |
+
"""Convenient class to derive from to provide buffering
|
| 1803 |
+
|
| 1804 |
+
In the case that the backend does not provide a pythonic file-like object
|
| 1805 |
+
already, this class contains much of the logic to build one. The only
|
| 1806 |
+
methods that need to be overridden are ``_upload_chunk``,
|
| 1807 |
+
``_initiate_upload`` and ``_fetch_range``.
|
| 1808 |
+
"""
|
| 1809 |
+
|
| 1810 |
+
DEFAULT_BLOCK_SIZE = 5 * 2**20
|
| 1811 |
+
_details = None
|
| 1812 |
+
|
| 1813 |
+
def __init__(
|
| 1814 |
+
self,
|
| 1815 |
+
fs,
|
| 1816 |
+
path,
|
| 1817 |
+
mode="rb",
|
| 1818 |
+
block_size="default",
|
| 1819 |
+
autocommit=True,
|
| 1820 |
+
cache_type="readahead",
|
| 1821 |
+
cache_options=None,
|
| 1822 |
+
size=None,
|
| 1823 |
+
**kwargs,
|
| 1824 |
+
):
|
| 1825 |
+
"""
|
| 1826 |
+
Template for files with buffered reading and writing
|
| 1827 |
+
|
| 1828 |
+
Parameters
|
| 1829 |
+
----------
|
| 1830 |
+
fs: instance of FileSystem
|
| 1831 |
+
path: str
|
| 1832 |
+
location in file-system
|
| 1833 |
+
mode: str
|
| 1834 |
+
Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file
|
| 1835 |
+
systems may be read-only, and some may not support append.
|
| 1836 |
+
block_size: int
|
| 1837 |
+
Buffer size for reading or writing, 'default' for class default
|
| 1838 |
+
autocommit: bool
|
| 1839 |
+
Whether to write to final destination; may only impact what
|
| 1840 |
+
happens when file is being closed.
|
| 1841 |
+
cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead"
|
| 1842 |
+
Caching policy in read mode. See the definitions in ``core``.
|
| 1843 |
+
cache_options : dict
|
| 1844 |
+
Additional options passed to the constructor for the cache specified
|
| 1845 |
+
by `cache_type`.
|
| 1846 |
+
size: int
|
| 1847 |
+
If given and in read mode, suppressed having to look up the file size
|
| 1848 |
+
kwargs:
|
| 1849 |
+
Gets stored as self.kwargs
|
| 1850 |
+
"""
|
| 1851 |
+
from .core import caches
|
| 1852 |
+
|
| 1853 |
+
self.path = path
|
| 1854 |
+
self.fs = fs
|
| 1855 |
+
self.mode = mode
|
| 1856 |
+
self.blocksize = (
|
| 1857 |
+
self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size
|
| 1858 |
+
)
|
| 1859 |
+
self.loc = 0
|
| 1860 |
+
self.autocommit = autocommit
|
| 1861 |
+
self.end = None
|
| 1862 |
+
self.start = None
|
| 1863 |
+
self.closed = False
|
| 1864 |
+
|
| 1865 |
+
if cache_options is None:
|
| 1866 |
+
cache_options = {}
|
| 1867 |
+
|
| 1868 |
+
if "trim" in kwargs:
|
| 1869 |
+
warnings.warn(
|
| 1870 |
+
"Passing 'trim' to control the cache behavior has been deprecated. "
|
| 1871 |
+
"Specify it within the 'cache_options' argument instead.",
|
| 1872 |
+
FutureWarning,
|
| 1873 |
+
)
|
| 1874 |
+
cache_options["trim"] = kwargs.pop("trim")
|
| 1875 |
+
|
| 1876 |
+
self.kwargs = kwargs
|
| 1877 |
+
|
| 1878 |
+
if mode not in {"ab", "rb", "wb", "xb"}:
|
| 1879 |
+
raise NotImplementedError("File mode not supported")
|
| 1880 |
+
if mode == "rb":
|
| 1881 |
+
if size is not None:
|
| 1882 |
+
self.size = size
|
| 1883 |
+
else:
|
| 1884 |
+
self.size = self.details["size"]
|
| 1885 |
+
self.cache = caches[cache_type](
|
| 1886 |
+
self.blocksize, self._fetch_range, self.size, **cache_options
|
| 1887 |
+
)
|
| 1888 |
+
else:
|
| 1889 |
+
self.buffer = io.BytesIO()
|
| 1890 |
+
self.offset = None
|
| 1891 |
+
self.forced = False
|
| 1892 |
+
self.location = None
|
| 1893 |
+
|
| 1894 |
+
@property
|
| 1895 |
+
def details(self):
|
| 1896 |
+
if self._details is None:
|
| 1897 |
+
self._details = self.fs.info(self.path)
|
| 1898 |
+
return self._details
|
| 1899 |
+
|
| 1900 |
+
@details.setter
|
| 1901 |
+
def details(self, value):
|
| 1902 |
+
self._details = value
|
| 1903 |
+
self.size = value["size"]
|
| 1904 |
+
|
| 1905 |
+
@property
|
| 1906 |
+
def full_name(self):
|
| 1907 |
+
return _unstrip_protocol(self.path, self.fs)
|
| 1908 |
+
|
| 1909 |
+
@property
|
| 1910 |
+
def closed(self):
|
| 1911 |
+
# get around this attr being read-only in IOBase
|
| 1912 |
+
# use getattr here, since this can be called during del
|
| 1913 |
+
return getattr(self, "_closed", True)
|
| 1914 |
+
|
| 1915 |
+
@closed.setter
|
| 1916 |
+
def closed(self, c):
|
| 1917 |
+
self._closed = c
|
| 1918 |
+
|
| 1919 |
+
def __hash__(self):
|
| 1920 |
+
if "w" in self.mode:
|
| 1921 |
+
return id(self)
|
| 1922 |
+
else:
|
| 1923 |
+
return int(tokenize(self.details), 16)
|
| 1924 |
+
|
| 1925 |
+
def __eq__(self, other):
|
| 1926 |
+
"""Files are equal if they have the same checksum, only in read mode"""
|
| 1927 |
+
if self is other:
|
| 1928 |
+
return True
|
| 1929 |
+
return (
|
| 1930 |
+
isinstance(other, type(self))
|
| 1931 |
+
and self.mode == "rb"
|
| 1932 |
+
and other.mode == "rb"
|
| 1933 |
+
and hash(self) == hash(other)
|
| 1934 |
+
)
|
| 1935 |
+
|
| 1936 |
+
def commit(self):
|
| 1937 |
+
"""Move from temp to final destination"""
|
| 1938 |
+
|
| 1939 |
+
def discard(self):
|
| 1940 |
+
"""Throw away temporary file"""
|
| 1941 |
+
|
| 1942 |
+
def info(self):
|
| 1943 |
+
"""File information about this path"""
|
| 1944 |
+
if self.readable():
|
| 1945 |
+
return self.details
|
| 1946 |
+
else:
|
| 1947 |
+
raise ValueError("Info not available while writing")
|
| 1948 |
+
|
| 1949 |
+
def tell(self):
|
| 1950 |
+
"""Current file location"""
|
| 1951 |
+
return self.loc
|
| 1952 |
+
|
| 1953 |
+
def seek(self, loc, whence=0):
|
| 1954 |
+
"""Set current file location
|
| 1955 |
+
|
| 1956 |
+
Parameters
|
| 1957 |
+
----------
|
| 1958 |
+
loc: int
|
| 1959 |
+
byte location
|
| 1960 |
+
whence: {0, 1, 2}
|
| 1961 |
+
from start of file, current location or end of file, resp.
|
| 1962 |
+
"""
|
| 1963 |
+
loc = int(loc)
|
| 1964 |
+
if not self.mode == "rb":
|
| 1965 |
+
raise OSError(ESPIPE, "Seek only available in read mode")
|
| 1966 |
+
if whence == 0:
|
| 1967 |
+
nloc = loc
|
| 1968 |
+
elif whence == 1:
|
| 1969 |
+
nloc = self.loc + loc
|
| 1970 |
+
elif whence == 2:
|
| 1971 |
+
nloc = self.size + loc
|
| 1972 |
+
else:
|
| 1973 |
+
raise ValueError(f"invalid whence ({whence}, should be 0, 1 or 2)")
|
| 1974 |
+
if nloc < 0:
|
| 1975 |
+
raise ValueError("Seek before start of file")
|
| 1976 |
+
self.loc = nloc
|
| 1977 |
+
return self.loc
|
| 1978 |
+
|
| 1979 |
+
def write(self, data):
|
| 1980 |
+
"""
|
| 1981 |
+
Write data to buffer.
|
| 1982 |
+
|
| 1983 |
+
Buffer only sent on flush() or if buffer is greater than
|
| 1984 |
+
or equal to blocksize.
|
| 1985 |
+
|
| 1986 |
+
Parameters
|
| 1987 |
+
----------
|
| 1988 |
+
data: bytes
|
| 1989 |
+
Set of bytes to be written.
|
| 1990 |
+
"""
|
| 1991 |
+
if not self.writable():
|
| 1992 |
+
raise ValueError("File not in write mode")
|
| 1993 |
+
if self.closed:
|
| 1994 |
+
raise ValueError("I/O operation on closed file.")
|
| 1995 |
+
if self.forced:
|
| 1996 |
+
raise ValueError("This file has been force-flushed, can only close")
|
| 1997 |
+
out = self.buffer.write(data)
|
| 1998 |
+
self.loc += out
|
| 1999 |
+
if self.buffer.tell() >= self.blocksize:
|
| 2000 |
+
self.flush()
|
| 2001 |
+
return out
|
| 2002 |
+
|
| 2003 |
+
def flush(self, force=False):
|
| 2004 |
+
"""
|
| 2005 |
+
Write buffered data to backend store.
|
| 2006 |
+
|
| 2007 |
+
Writes the current buffer, if it is larger than the block-size, or if
|
| 2008 |
+
the file is being closed.
|
| 2009 |
+
|
| 2010 |
+
Parameters
|
| 2011 |
+
----------
|
| 2012 |
+
force: bool
|
| 2013 |
+
When closing, write the last block even if it is smaller than
|
| 2014 |
+
blocks are allowed to be. Disallows further writing to this file.
|
| 2015 |
+
"""
|
| 2016 |
+
|
| 2017 |
+
if self.closed:
|
| 2018 |
+
raise ValueError("Flush on closed file")
|
| 2019 |
+
if force and self.forced:
|
| 2020 |
+
raise ValueError("Force flush cannot be called more than once")
|
| 2021 |
+
if force:
|
| 2022 |
+
self.forced = True
|
| 2023 |
+
|
| 2024 |
+
if self.readable():
|
| 2025 |
+
# no-op to flush on read-mode
|
| 2026 |
+
return
|
| 2027 |
+
|
| 2028 |
+
if not force and self.buffer.tell() < self.blocksize:
|
| 2029 |
+
# Defer write on small block
|
| 2030 |
+
return
|
| 2031 |
+
|
| 2032 |
+
if self.offset is None:
|
| 2033 |
+
# Initialize a multipart upload
|
| 2034 |
+
self.offset = 0
|
| 2035 |
+
try:
|
| 2036 |
+
self._initiate_upload()
|
| 2037 |
+
except:
|
| 2038 |
+
self.closed = True
|
| 2039 |
+
raise
|
| 2040 |
+
|
| 2041 |
+
if self._upload_chunk(final=force) is not False:
|
| 2042 |
+
self.offset += self.buffer.seek(0, 2)
|
| 2043 |
+
self.buffer = io.BytesIO()
|
| 2044 |
+
|
| 2045 |
+
def _upload_chunk(self, final=False):
|
| 2046 |
+
"""Write one part of a multi-block file upload
|
| 2047 |
+
|
| 2048 |
+
Parameters
|
| 2049 |
+
==========
|
| 2050 |
+
final: bool
|
| 2051 |
+
This is the last block, so should complete file, if
|
| 2052 |
+
self.autocommit is True.
|
| 2053 |
+
"""
|
| 2054 |
+
# may not yet have been initialized, may need to call _initialize_upload
|
| 2055 |
+
|
| 2056 |
+
def _initiate_upload(self):
|
| 2057 |
+
"""Create remote file/upload"""
|
| 2058 |
+
pass
|
| 2059 |
+
|
| 2060 |
+
def _fetch_range(self, start, end):
|
| 2061 |
+
"""Get the specified set of bytes from remote"""
|
| 2062 |
+
return self.fs.cat_file(self.path, start=start, end=end)
|
| 2063 |
+
|
| 2064 |
+
def read(self, length=-1):
|
| 2065 |
+
"""
|
| 2066 |
+
Return data from cache, or fetch pieces as necessary
|
| 2067 |
+
|
| 2068 |
+
Parameters
|
| 2069 |
+
----------
|
| 2070 |
+
length: int (-1)
|
| 2071 |
+
Number of bytes to read; if <0, all remaining bytes.
|
| 2072 |
+
"""
|
| 2073 |
+
length = -1 if length is None else int(length)
|
| 2074 |
+
if self.mode != "rb":
|
| 2075 |
+
raise ValueError("File not in read mode")
|
| 2076 |
+
if length < 0:
|
| 2077 |
+
length = self.size - self.loc
|
| 2078 |
+
if self.closed:
|
| 2079 |
+
raise ValueError("I/O operation on closed file.")
|
| 2080 |
+
if length == 0:
|
| 2081 |
+
# don't even bother calling fetch
|
| 2082 |
+
return b""
|
| 2083 |
+
out = self.cache._fetch(self.loc, self.loc + length)
|
| 2084 |
+
|
| 2085 |
+
logger.debug(
|
| 2086 |
+
"%s read: %i - %i %s",
|
| 2087 |
+
self,
|
| 2088 |
+
self.loc,
|
| 2089 |
+
self.loc + length,
|
| 2090 |
+
self.cache._log_stats(),
|
| 2091 |
+
)
|
| 2092 |
+
self.loc += len(out)
|
| 2093 |
+
return out
|
| 2094 |
+
|
| 2095 |
+
def readinto(self, b):
|
| 2096 |
+
"""mirrors builtin file's readinto method
|
| 2097 |
+
|
| 2098 |
+
https://docs.python.org/3/library/io.html#io.RawIOBase.readinto
|
| 2099 |
+
"""
|
| 2100 |
+
out = memoryview(b).cast("B")
|
| 2101 |
+
data = self.read(out.nbytes)
|
| 2102 |
+
out[: len(data)] = data
|
| 2103 |
+
return len(data)
|
| 2104 |
+
|
| 2105 |
+
def readuntil(self, char=b"\n", blocks=None):
|
| 2106 |
+
"""Return data between current position and first occurrence of char
|
| 2107 |
+
|
| 2108 |
+
char is included in the output, except if the end of the tile is
|
| 2109 |
+
encountered first.
|
| 2110 |
+
|
| 2111 |
+
Parameters
|
| 2112 |
+
----------
|
| 2113 |
+
char: bytes
|
| 2114 |
+
Thing to find
|
| 2115 |
+
blocks: None or int
|
| 2116 |
+
How much to read in each go. Defaults to file blocksize - which may
|
| 2117 |
+
mean a new read on every call.
|
| 2118 |
+
"""
|
| 2119 |
+
out = []
|
| 2120 |
+
while True:
|
| 2121 |
+
start = self.tell()
|
| 2122 |
+
part = self.read(blocks or self.blocksize)
|
| 2123 |
+
if len(part) == 0:
|
| 2124 |
+
break
|
| 2125 |
+
found = part.find(char)
|
| 2126 |
+
if found > -1:
|
| 2127 |
+
out.append(part[: found + len(char)])
|
| 2128 |
+
self.seek(start + found + len(char))
|
| 2129 |
+
break
|
| 2130 |
+
out.append(part)
|
| 2131 |
+
return b"".join(out)
|
| 2132 |
+
|
| 2133 |
+
def readline(self):
|
| 2134 |
+
"""Read until and including the first occurrence of newline character
|
| 2135 |
+
|
| 2136 |
+
Note that, because of character encoding, this is not necessarily a
|
| 2137 |
+
true line ending.
|
| 2138 |
+
"""
|
| 2139 |
+
return self.readuntil(b"\n")
|
| 2140 |
+
|
| 2141 |
+
def __next__(self):
|
| 2142 |
+
out = self.readline()
|
| 2143 |
+
if out:
|
| 2144 |
+
return out
|
| 2145 |
+
raise StopIteration
|
| 2146 |
+
|
| 2147 |
+
def __iter__(self):
|
| 2148 |
+
return self
|
| 2149 |
+
|
| 2150 |
+
def readlines(self):
|
| 2151 |
+
"""Return all data, split by the newline character, including the newline character"""
|
| 2152 |
+
data = self.read()
|
| 2153 |
+
lines = data.split(b"\n")
|
| 2154 |
+
out = [l + b"\n" for l in lines[:-1]]
|
| 2155 |
+
if data.endswith(b"\n"):
|
| 2156 |
+
return out
|
| 2157 |
+
else:
|
| 2158 |
+
return out + [lines[-1]]
|
| 2159 |
+
# return list(self) ???
|
| 2160 |
+
|
| 2161 |
+
def readinto1(self, b):
|
| 2162 |
+
return self.readinto(b)
|
| 2163 |
+
|
| 2164 |
+
def close(self):
|
| 2165 |
+
"""Close file
|
| 2166 |
+
|
| 2167 |
+
Finalizes writes, discards cache
|
| 2168 |
+
"""
|
| 2169 |
+
if getattr(self, "_unclosable", False):
|
| 2170 |
+
return
|
| 2171 |
+
if self.closed:
|
| 2172 |
+
return
|
| 2173 |
+
try:
|
| 2174 |
+
if self.mode == "rb":
|
| 2175 |
+
self.cache = None
|
| 2176 |
+
else:
|
| 2177 |
+
if not self.forced:
|
| 2178 |
+
self.flush(force=True)
|
| 2179 |
+
|
| 2180 |
+
if self.fs is not None:
|
| 2181 |
+
self.fs.invalidate_cache(self.path)
|
| 2182 |
+
self.fs.invalidate_cache(self.fs._parent(self.path))
|
| 2183 |
+
finally:
|
| 2184 |
+
self.closed = True
|
| 2185 |
+
|
| 2186 |
+
def readable(self):
    """Whether opened for reading"""
    if self.closed:
        return False
    return "r" in self.mode
|
| 2189 |
+
|
| 2190 |
+
def seekable(self):
    """Whether is seekable (only in read mode)"""
    # write-mode files only support sequential append, so seekability
    # coincides with readability
    return self.readable()
|
| 2193 |
+
|
| 2194 |
+
def writable(self):
    """Whether opened for writing"""
    return not self.closed and self.mode in {"wb", "ab", "xb"}
|
| 2197 |
+
|
| 2198 |
+
def __reduce__(self):
    # Only read-mode files can be pickled; buffered write state cannot
    # travel to another process.
    if self.mode != "rb":
        raise RuntimeError("Pickling a writeable file is not supported")

    # Reconstruct on unpickle via module-level reopen(), carrying enough
    # state to restore an equivalent file object and its seek position.
    return reopen, (
        self.fs,
        self.path,
        self.mode,
        self.blocksize,
        self.loc,
        self.size,
        self.autocommit,
        self.cache.name if self.cache else "none",
        self.kwargs,
    )
|
| 2213 |
+
|
| 2214 |
+
def __del__(self):
    # Best-effort close on garbage collection so pending state is finalized.
    if not self.closed:
        self.close()
|
| 2217 |
+
|
| 2218 |
+
def __str__(self):
    # Human-readable identification: filesystem class name plus remote path.
    return f"<File-like object {type(self.fs).__name__}, {self.path}>"

# repr intentionally identical to str
__repr__ = __str__
|
| 2222 |
+
|
| 2223 |
+
def __enter__(self):
    # Context-manager support: the file itself is the managed resource.
    return self
|
| 2225 |
+
|
| 2226 |
+
def __exit__(self, *args):
    # Always close on context exit, regardless of any exception in the body.
    self.close()
|
| 2228 |
+
|
| 2229 |
+
|
| 2230 |
+
def reopen(fs, path, mode, blocksize, loc, size, autocommit, cache_type, kwargs):
    """Recreate a file object after unpickling and restore its position.

    Counterpart of the file's ``__reduce__``: opens ``path`` on ``fs`` with
    the recorded options, then seeks to the remembered location.
    """
    restored = fs.open(
        path,
        mode=mode,
        block_size=blocksize,
        autocommit=autocommit,
        cache_type=cache_type,
        size=size,
        **kwargs,
    )
    if loc > 0:
        restored.seek(loc)
    return restored
|
.venv/lib/python3.11/site-packages/fsspec/transaction.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import deque
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class Transaction:
    """Filesystem transaction write context

    Collects files whose writes are deferred, so that several write
    operations can be committed or discarded together, semi-atomically.
    This works by installing the instance as the filesystem's
    ``.transaction`` attribute.
    """

    def __init__(self, fs, **kwargs):
        """
        Parameters
        ----------
        fs: FileSystem instance
        """
        self.fs = fs
        self.files = deque()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """End transaction and commit, if exit is not due to exception"""
        # commit only on a clean exit
        self.complete(commit=exc_type is None)
        if self.fs:
            self.fs._intrans = False
            self.fs._transaction = None
            self.fs = None

    def start(self):
        """Start a transaction on this FileSystem"""
        # discard any leftovers from a previously failed completion
        self.files = deque()
        self.fs._intrans = True

    def complete(self, commit=True):
        """Finish transaction: commit or discard all deferred files"""
        while self.files:
            pending = self.files.popleft()
            (pending.commit if commit else pending.discard)()
        self.fs._intrans = False
        self.fs._transaction = None
        self.fs = None
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class FileActor:
    """Simple container of deferred files, committed/discarded as a group.

    Used as a remote (dask) actor so all files finalize on one worker.
    """

    def __init__(self):
        self.files = []

    def append(self, f):
        self.files.append(f)

    def commit(self):
        for pending in self.files:
            pending.commit()
        self.files.clear()

    def discard(self):
        for pending in self.files:
            pending.discard()
        self.files.clear()
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class DaskTransaction(Transaction):
    """Transaction whose deferred files live in a remote dask actor."""

    def __init__(self, fs):
        """
        Parameters
        ----------
        fs: FileSystem instance
        """
        import distributed

        super().__init__(fs)
        dask_client = distributed.default_client()
        # keep the pending files in a FileActor on a worker, so commit and
        # discard run where the files were produced
        self.files = dask_client.submit(FileActor, actor=True).result()

    def complete(self, commit=True):
        """Finish transaction: commit or discard all deferred files"""
        action = self.files.commit if commit else self.files.discard
        action().result()
        self.fs._intrans = False
        self.fs = None
|
.venv/lib/python3.11/site-packages/fsspec/utils.py
ADDED
|
@@ -0,0 +1,739 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import contextlib
|
| 4 |
+
import logging
|
| 5 |
+
import math
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
import sys
|
| 9 |
+
import tempfile
|
| 10 |
+
from functools import partial
|
| 11 |
+
from hashlib import md5
|
| 12 |
+
from importlib.metadata import version
|
| 13 |
+
from typing import (
|
| 14 |
+
IO,
|
| 15 |
+
TYPE_CHECKING,
|
| 16 |
+
Any,
|
| 17 |
+
Callable,
|
| 18 |
+
Iterable,
|
| 19 |
+
Iterator,
|
| 20 |
+
Sequence,
|
| 21 |
+
TypeVar,
|
| 22 |
+
)
|
| 23 |
+
from urllib.parse import urlsplit
|
| 24 |
+
|
| 25 |
+
if TYPE_CHECKING:
|
| 26 |
+
import pathlib
|
| 27 |
+
|
| 28 |
+
from typing_extensions import TypeGuard
|
| 29 |
+
|
| 30 |
+
from fsspec.spec import AbstractFileSystem
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
DEFAULT_BLOCK_SIZE = 5 * 2**20
|
| 34 |
+
|
| 35 |
+
T = TypeVar("T")
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def infer_storage_options(
    urlpath: str, inherit_storage_options: dict[str, Any] | None = None
) -> dict[str, Any]:
    """Infer storage options from URL path and merge it with existing storage
    options.

    Parameters
    ----------
    urlpath: str or unicode
        Either local absolute file path or URL (hdfs://namenode:8020/file.csv)
    inherit_storage_options: dict (optional)
        Its contents will get merged with the inferred information from the
        given path

    Returns
    -------
    Storage options dict.

    Raises
    ------
    KeyError
        If an inherited option collides with an inferred one (via
        ``update_storage_options``).

    Examples
    --------
    >>> infer_storage_options('/mnt/datasets/test.csv')  # doctest: +SKIP
    {"protocol": "file", "path", "/mnt/datasets/test.csv"}
    >>> infer_storage_options(
    ...     'hdfs://username:pwd@node:123/mnt/datasets/test.csv?q=1',
    ...     inherit_storage_options={'extra': 'value'},
    ... )  # doctest: +SKIP
    {"protocol": "hdfs", "username": "username", "password": "pwd",
    "host": "node", "port": 123, "path": "/mnt/datasets/test.csv",
    "url_query": "q=1", "extra": "value"}
    """
    # Handle Windows paths including disk name in this special case
    if (
        re.match(r"^[a-zA-Z]:[\\/]", urlpath)
        or re.match(r"^[a-zA-Z0-9]+://", urlpath) is None
    ):
        return {"protocol": "file", "path": urlpath}

    parsed_path = urlsplit(urlpath)
    protocol = parsed_path.scheme or "file"
    # keep any fragment as part of the path -- these are not web URLs
    if parsed_path.fragment:
        path = "#".join([parsed_path.path, parsed_path.fragment])
    else:
        path = parsed_path.path
    if protocol == "file":
        # Special case parsing file protocol URL on Windows according to:
        # https://msdn.microsoft.com/en-us/library/jj710207.aspx
        windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path)
        if windows_path:
            drive, path = windows_path.groups()
            path = f"{drive}:{path}"

    if protocol in ["http", "https"]:
        # for HTTP, we don't want to parse, as requests will anyway
        return {"protocol": protocol, "path": urlpath}

    options: dict[str, Any] = {"protocol": protocol, "path": path}

    if parsed_path.netloc:
        # Parse `hostname` from netloc manually because `parsed_path.hostname`
        # lowercases the hostname which is not always desirable (e.g. in S3):
        # https://github.com/dask/dask/issues/1417
        options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0]

        if protocol in ("s3", "s3a", "gcs", "gs"):
            # bucket-style protocols fold the "host" (bucket name) into the
            # path.  (The previous no-op ``options["host"] = options["host"]``
            # else-branch has been removed.)
            options["path"] = options["host"] + options["path"]
        if parsed_path.port:
            options["port"] = parsed_path.port
        if parsed_path.username:
            options["username"] = parsed_path.username
        if parsed_path.password:
            options["password"] = parsed_path.password

    if parsed_path.query:
        options["url_query"] = parsed_path.query
    if parsed_path.fragment:
        options["url_fragment"] = parsed_path.fragment

    if inherit_storage_options:
        update_storage_options(options, inherit_storage_options)

    return options
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def update_storage_options(
    options: dict[str, Any], inherited: dict[str, Any] | None = None
) -> None:
    """Merge *inherited* into *options* in place.

    A key present in both dicts with differing values raises ``KeyError``;
    identical values may overlap freely.
    """
    if not inherited:
        inherited = {}
    for collision in set(options) & set(inherited):
        if options.get(collision) != inherited.get(collision):
            raise KeyError(
                f"Collision between inferred and specified storage "
                f"option:\n{collision}"
            )
    options.update(inherited)
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
# Compression extensions registered via fsspec.compression.register_compression
compressions: dict[str, str] = {}


def infer_compression(filename: str) -> str | None:
    """Infer compression, if available, from filename.

    Looks up the (lower-cased) filename extension in the registry of named
    compressions, which covers the builtin ones (gz, bz2, zip) as well as
    optional ones.  See fsspec.compression.register_compression.
    Returns None for unregistered or missing extensions.
    """
    extension = os.path.splitext(filename)[-1].strip(".").lower()
    return compressions.get(extension)
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def build_name_function(max_int: float) -> Callable[[int], str]:
    """Return a function mapping an integer to a zero-padded string.

    The padding width is chosen so that the largest expected integer fits.

    >>> name_f = build_name_function(57)

    >>> name_f(7)
    '07'
    >>> name_f(31)
    '31'
    >>> build_name_function(1000)(42)
    '0042'
    >>> build_name_function(999)(42)
    '042'
    >>> build_name_function(0)(0)
    '0'
    """
    # tiny nudge handles the corner cases of 0 and exact powers of 10
    width = int(math.ceil(math.log10(max_int + 1e-8)))

    def name_function(i: int) -> str:
        return str(i).zfill(width)

    return name_function
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def seek_delimiter(file: IO[bytes], delimiter: bytes, blocksize: int) -> bool:
    r"""Seek current file to file start, file end, or byte after delimiter seq.

    Seeks file to next chunk delimiter, where chunks are defined on file start,
    a delimiting sequence, and file end. Use file.tell() to see location afterwards.
    Note that file start is a valid split, so must be at offset > 0 to seek for
    delimiter.

    Parameters
    ----------
    file: a file
    delimiter: bytes
        a delimiter like ``b'\n'`` or message sentinel, matching file .read() type
    blocksize: int
        Number of bytes to read from the file at once.


    Returns
    -------
    Returns True if a delimiter was found, False if at file start or end.

    """

    if file.tell() == 0:
        # beginning-of-file, return without seek
        return False

    # Interface is for binary IO, with delimiter as bytes, but initialize last
    # with result of file.read to preserve compatibility with text IO.
    last: bytes | None = None
    while True:
        current = file.read(blocksize)
        if not current:
            # end-of-file without delimiter
            return False
        # prepend the tail of the previous block so a delimiter spanning
        # the block boundary is still found
        full = last + current if last else current
        try:
            if delimiter in full:
                i = full.index(delimiter)
                # rewind to the byte just after the first delimiter occurrence
                file.seek(file.tell() - (len(full) - i) + len(delimiter))
                return True
            elif len(current) < blocksize:
                # end-of-file without delimiter
                return False
        except (OSError, ValueError):
            # presumably guards bytes-vs-str mismatches for text IO -- the
            # loop just keeps scanning; TODO confirm the intended cases
            pass
        # keep just enough tail to detect a boundary-spanning delimiter
        last = full[-len(delimiter) :]
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def read_block(
    f: IO[bytes],
    offset: int,
    length: int | None,
    delimiter: bytes | None = None,
    split_before: bool = False,
) -> bytes:
    """Read a block of bytes from a file

    Parameters
    ----------
    f: File
        Open file
    offset: int
        Byte offset to start read
    length: int
        Number of bytes to read, read through end of file if None
    delimiter: bytes (optional)
        Ensure reading starts and stops at delimiter bytestring
    split_before: bool (optional)
        Start/stop read *before* delimiter bytestring.


    If using the ``delimiter=`` keyword argument we ensure that the read
    starts and stops at delimiter boundaries that follow the locations
    ``offset`` and ``offset + length``. If ``offset`` is zero then we
    start at zero, regardless of delimiter. The bytestring returned WILL
    include the terminating delimiter string.

    Examples
    --------

    >>> from io import BytesIO  # doctest: +SKIP
    >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300')  # doctest: +SKIP
    >>> read_block(f, 0, 13)  # doctest: +SKIP
    b'Alice, 100\\nBo'

    >>> read_block(f, 0, 13, delimiter=b'\\n')  # doctest: +SKIP
    b'Alice, 100\\nBob, 200\\n'

    >>> read_block(f, 10, 10, delimiter=b'\\n')  # doctest: +SKIP
    b'Bob, 200\\nCharlie, 300'
    """
    if delimiter:
        # advance to the first delimiter at or after `offset` (no-op at
        # offset 0, since file start is itself a valid split point)
        f.seek(offset)
        found_start_delim = seek_delimiter(f, delimiter, 2**16)
        if length is None:
            return f.read()
        start = f.tell()
        # shrink the requested window by however far the start moved
        length -= start - offset

        # likewise extend the end of the window to the next delimiter
        f.seek(start + length)
        found_end_delim = seek_delimiter(f, delimiter, 2**16)
        end = f.tell()

        # Adjust split location to before delimiter if seek found the
        # delimiter sequence, not start or end of file.
        if found_start_delim and split_before:
            start -= len(delimiter)

        if found_end_delim and split_before:
            end -= len(delimiter)

        offset = start
        length = end - start

    f.seek(offset)

    # TODO: allow length to be None and read to the end of the file?
    assert length is not None
    b = f.read(length)
    return b
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
def tokenize(*args: Any, **kwargs: Any) -> str:
    """Deterministic token

    (modified from dask.base)

    >>> tokenize([1, 2, '3'])
    '9d71491b50023b06fc76928e6eddb952'

    >>> tokenize('Hello') == tokenize('Hello')
    True
    """
    if kwargs:
        args = args + (kwargs,)
    payload = str(args).encode()
    try:
        digest = md5(payload)
    except ValueError:
        # FIPS systems: https://github.com/fsspec/filesystem_spec/issues/380
        digest = md5(payload, usedforsecurity=False)
    return digest.hexdigest()
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def stringify_path(filepath: str | os.PathLike[str] | pathlib.Path) -> str:
    """Attempt to convert a path-like object to a string.

    Parameters
    ----------
    filepath: object to be converted

    Returns
    -------
    filepath_str: maybe a string version of the object

    Notes
    -----
    Objects supporting the fspath protocol are coerced according to its
    __fspath__ method.

    For backwards compatibility with older Python version, pathlib.Path
    objects are specially coerced.

    Any other object is passed through unchanged, which includes bytes,
    strings, buffers, or anything else that's not even path-like.
    """
    if isinstance(filepath, str):
        return filepath
    fspath = getattr(filepath, "__fspath__", None)
    if fspath is not None:
        return fspath()
    if hasattr(filepath, "path"):
        return filepath.path
    return filepath  # type: ignore[return-value]
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
def make_instance(
    cls: Callable[..., T], args: Sequence[Any], kwargs: dict[str, Any]
) -> T:
    """Instantiate *cls* with the given arguments, then let the new object
    run its worker-side setup hook before returning it."""
    instance = cls(*args, **kwargs)
    instance._determine_worker()  # type: ignore[attr-defined]
    return instance
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
def common_prefix(paths: Iterable[str]) -> str:
    """For a list of paths, find the shortest prefix common to all"""
    split_paths = [p.split("/") for p in paths]
    first = split_paths[0]
    # only components up to the shortest path can possibly be shared
    depth = min(len(sp) for sp in split_paths)
    matched = 0
    for idx in range(depth):
        if any(sp[idx] != first[idx] for sp in split_paths):
            break
        matched = idx + 1
    return "/".join(first[:matched])
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
def other_paths(
    paths: list[str],
    path2: str | list[str],
    exists: bool = False,
    flatten: bool = False,
) -> list[str]:
    """In bulk file operations, construct a new file tree from a list of files

    Parameters
    ----------
    paths: list of str
        The input file tree
    path2: str or list of str
        Root to construct the new list in. If this is already a list of str, we just
        assert it has the right number of elements.
    exists: bool (optional)
        For a str destination, it is already exists (and is a dir), files should
        end up inside.
    flatten: bool (optional)
        Whether to flatten the input directory tree structure so that the output files
        are in the same directory.

    Returns
    -------
    list of str
    """

    if isinstance(path2, str):
        path2 = path2.rstrip("/")

        if flatten:
            # drop directory structure: keep only each file's basename
            path2 = ["/".join((path2, p.split("/")[-1])) for p in paths]
        else:
            cp = common_prefix(paths)
            if exists:
                # destination dir already exists: keep the last shared
                # component of the sources so files nest inside it
                cp = cp.rsplit("/", 1)[0]
            if not cp and all(not s.startswith("/") for s in paths):
                # relative sources with nothing in common: nest the full
                # relative paths under the destination root
                path2 = ["/".join([path2, p]) for p in paths]
            else:
                # swap the shared prefix for the destination root
                path2 = [p.replace(cp, path2, 1) for p in paths]
    else:
        # explicit destination list: must pair one-to-one with sources
        assert len(paths) == len(path2)
    return path2
|
| 425 |
+
|
| 426 |
+
|
| 427 |
+
def is_exception(obj: Any) -> bool:
    # True for any exception instance, including BaseException subclasses
    # such as KeyboardInterrupt and SystemExit.
    return isinstance(obj, BaseException)
|
| 429 |
+
|
| 430 |
+
|
| 431 |
+
def isfilelike(f: Any) -> TypeGuard[IO[bytes]]:
    """True when *f* exposes the minimal file interface (read/close/tell)."""
    return hasattr(f, "read") and hasattr(f, "close") and hasattr(f, "tell")
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
def get_protocol(url: str) -> str:
    """Return the protocol prefix of *url* ("file" when none is present).

    Splits on the first ``://`` or chained-filesystem ``::`` marker.
    """
    url = stringify_path(url)
    head = re.split(r"(\:\:|\://)", url, maxsplit=1)
    if len(head) > 1:
        return head[0]
    return "file"
|
| 441 |
+
|
| 442 |
+
|
| 443 |
+
def can_be_local(path: str) -> bool:
    """Can the given URL be used with open_local?"""
    from fsspec import get_filesystem_class

    try:
        fs_cls = get_filesystem_class(get_protocol(path))
    except (ValueError, ImportError):
        # not in registry or import failed
        return False
    return getattr(fs_cls, "local_file", False)
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
def get_package_version_without_import(name: str) -> str | None:
    """For given package name, try to find the version without importing it

    Import and package.__version__ is still the backup here, so an import
    *might* happen.

    Returns either the version string, or None if the package
    or the version was not readily found.

    Lookup order:
    1. an already-imported module's ``__version__``
    2. installed distribution metadata via ``importlib.metadata.version``
    3. import the module and read its ``__version__``
    """
    if name in sys.modules:
        mod = sys.modules[name]
        if hasattr(mod, "__version__"):
            return mod.__version__
    try:
        return version(name)
    except Exception:
        # was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; Exception is broad enough for any metadata failure
        pass
    try:
        import importlib

        mod = importlib.import_module(name)
        return mod.__version__
    except (ImportError, AttributeError):
        return None
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
def setup_logging(
    logger: logging.Logger | None = None,
    logger_name: str | None = None,
    level: str = "DEBUG",
    clear: bool = True,
) -> logging.Logger:
    """Attach a stderr stream handler with a standard format to a logger.

    Exactly one of *logger* (an existing Logger) or *logger_name* must be
    given.  With ``clear=True`` any previously attached handlers are removed
    first, so repeated calls do not duplicate output.
    """
    if logger is None and logger_name is None:
        raise ValueError("Provide either logger object or logger name")
    target = logger or logging.getLogger(logger_name)
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s -- %(message)s"
        )
    )
    if clear:
        target.handlers.clear()
    target.addHandler(handler)
    target.setLevel(level)
    return target
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
def _unstrip_protocol(name: str, fs: AbstractFileSystem) -> str:
    # Module-level convenience wrapper delegating to the filesystem's own
    # unstrip_protocol method.
    return fs.unstrip_protocol(name)
|
| 503 |
+
|
| 504 |
+
|
| 505 |
+
def mirror_from(
    origin_name: str, methods: Iterable[str]
) -> Callable[[type[T]], type[T]]:
    """Class decorator: expose each name in *methods* as a read-only property
    that forwards to the instance attribute named *origin_name*."""

    def _forward(attr: str, self: Any) -> Any:
        return getattr(getattr(self, origin_name), attr)

    def wrapper(cls: type[T]) -> type[T]:
        for attr in methods:
            setattr(cls, attr, property(partial(_forward, attr)))
        return cls

    return wrapper
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
@contextlib.contextmanager
def nullcontext(obj: T) -> Iterator[T]:
    # No-op context manager: yields *obj* unchanged and performs no cleanup
    # (cf. contextlib.nullcontext).
    yield obj
|
| 528 |
+
|
| 529 |
+
|
| 530 |
+
def merge_offset_ranges(
    paths: list[str],
    starts: list[int] | int,
    ends: list[int] | int,
    max_gap: int = 0,
    max_block: int | None = None,
    sort: bool = True,
) -> tuple[list[str], list[int], list[int]]:
    """Merge adjacent byte-offset ranges when the inter-range
    gap is <= `max_gap`, and when the merged byte range does not
    exceed `max_block` (if specified). By default, this function
    will re-order the input paths and byte ranges to ensure sorted
    order. If the user can guarantee that the inputs are already
    sorted, passing `sort=False` will skip the re-ordering.

    Raises TypeError if `paths` is not a list, and ValueError if the
    three lists do not have matching lengths.
    """
    # Check input
    if not isinstance(paths, list):
        raise TypeError
    if not isinstance(starts, list):
        # scalar start applies to every path
        starts = [starts] * len(paths)
    if not isinstance(ends, list):
        # scalar end applies to every path
        ends = [ends] * len(paths)
    if len(starts) != len(paths) or len(ends) != len(paths):
        raise ValueError

    # Early Return
    if len(starts) <= 1:
        return paths, starts, ends

    # normalize missing/None starts to 0 before comparing
    starts = [s or 0 for s in starts]
    # Sort by paths and then ranges if `sort=True`
    if sort:
        paths, starts, ends = (
            list(v)
            for v in zip(
                *sorted(
                    zip(paths, starts, ends),
                )
            )
        )

    if paths:
        # Loop through the coupled `paths`, `starts`, and
        # `ends`, and merge adjacent blocks when appropriate
        new_paths = paths[:1]
        new_starts = starts[:1]
        new_ends = ends[:1]
        for i in range(1, len(paths)):
            if paths[i] == paths[i - 1] and new_ends[-1] is None:
                # a None end for the same path presumably already covers
                # everything after its start -- TODO confirm; skip this range
                continue
            elif (
                paths[i] != paths[i - 1]
                or ((starts[i] - new_ends[-1]) > max_gap)
                or (max_block is not None and (ends[i] - new_starts[-1]) > max_block)
            ):
                # Cannot merge with previous block.
                # Add new `paths`, `starts`, and `ends` elements
                new_paths.append(paths[i])
                new_starts.append(starts[i])
                new_ends.append(ends[i])
            else:
                # Merge with previous block by updating the
                # last element of `ends`
                new_ends[-1] = ends[i]
        return new_paths, new_starts, new_ends

    # `paths` is empty. Just return input lists
    return paths, starts, ends
|
| 598 |
+
|
| 599 |
+
|
| 600 |
+
def file_size(filelike: IO[bytes]) -> int:
|
| 601 |
+
"""Find length of any open read-mode file-like"""
|
| 602 |
+
pos = filelike.tell()
|
| 603 |
+
try:
|
| 604 |
+
return filelike.seek(0, 2)
|
| 605 |
+
finally:
|
| 606 |
+
filelike.seek(pos)
|
| 607 |
+
|
| 608 |
+
|
| 609 |
+
@contextlib.contextmanager
|
| 610 |
+
def atomic_write(path: str, mode: str = "wb"):
|
| 611 |
+
"""
|
| 612 |
+
A context manager that opens a temporary file next to `path` and, on exit,
|
| 613 |
+
replaces `path` with the temporary file, thereby updating `path`
|
| 614 |
+
atomically.
|
| 615 |
+
"""
|
| 616 |
+
fd, fn = tempfile.mkstemp(
|
| 617 |
+
dir=os.path.dirname(path), prefix=os.path.basename(path) + "-"
|
| 618 |
+
)
|
| 619 |
+
try:
|
| 620 |
+
with open(fd, mode) as fp:
|
| 621 |
+
yield fp
|
| 622 |
+
except BaseException:
|
| 623 |
+
with contextlib.suppress(FileNotFoundError):
|
| 624 |
+
os.unlink(fn)
|
| 625 |
+
raise
|
| 626 |
+
else:
|
| 627 |
+
os.replace(fn, path)
|
| 628 |
+
|
| 629 |
+
|
| 630 |
+
def _translate(pat, STAR, QUESTION_MARK):
|
| 631 |
+
# Copied from: https://github.com/python/cpython/pull/106703.
|
| 632 |
+
res: list[str] = []
|
| 633 |
+
add = res.append
|
| 634 |
+
i, n = 0, len(pat)
|
| 635 |
+
while i < n:
|
| 636 |
+
c = pat[i]
|
| 637 |
+
i = i + 1
|
| 638 |
+
if c == "*":
|
| 639 |
+
# compress consecutive `*` into one
|
| 640 |
+
if (not res) or res[-1] is not STAR:
|
| 641 |
+
add(STAR)
|
| 642 |
+
elif c == "?":
|
| 643 |
+
add(QUESTION_MARK)
|
| 644 |
+
elif c == "[":
|
| 645 |
+
j = i
|
| 646 |
+
if j < n and pat[j] == "!":
|
| 647 |
+
j = j + 1
|
| 648 |
+
if j < n and pat[j] == "]":
|
| 649 |
+
j = j + 1
|
| 650 |
+
while j < n and pat[j] != "]":
|
| 651 |
+
j = j + 1
|
| 652 |
+
if j >= n:
|
| 653 |
+
add("\\[")
|
| 654 |
+
else:
|
| 655 |
+
stuff = pat[i:j]
|
| 656 |
+
if "-" not in stuff:
|
| 657 |
+
stuff = stuff.replace("\\", r"\\")
|
| 658 |
+
else:
|
| 659 |
+
chunks = []
|
| 660 |
+
k = i + 2 if pat[i] == "!" else i + 1
|
| 661 |
+
while True:
|
| 662 |
+
k = pat.find("-", k, j)
|
| 663 |
+
if k < 0:
|
| 664 |
+
break
|
| 665 |
+
chunks.append(pat[i:k])
|
| 666 |
+
i = k + 1
|
| 667 |
+
k = k + 3
|
| 668 |
+
chunk = pat[i:j]
|
| 669 |
+
if chunk:
|
| 670 |
+
chunks.append(chunk)
|
| 671 |
+
else:
|
| 672 |
+
chunks[-1] += "-"
|
| 673 |
+
# Remove empty ranges -- invalid in RE.
|
| 674 |
+
for k in range(len(chunks) - 1, 0, -1):
|
| 675 |
+
if chunks[k - 1][-1] > chunks[k][0]:
|
| 676 |
+
chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:]
|
| 677 |
+
del chunks[k]
|
| 678 |
+
# Escape backslashes and hyphens for set difference (--).
|
| 679 |
+
# Hyphens that create ranges shouldn't be escaped.
|
| 680 |
+
stuff = "-".join(
|
| 681 |
+
s.replace("\\", r"\\").replace("-", r"\-") for s in chunks
|
| 682 |
+
)
|
| 683 |
+
# Escape set operations (&&, ~~ and ||).
|
| 684 |
+
stuff = re.sub(r"([&~|])", r"\\\1", stuff)
|
| 685 |
+
i = j + 1
|
| 686 |
+
if not stuff:
|
| 687 |
+
# Empty range: never match.
|
| 688 |
+
add("(?!)")
|
| 689 |
+
elif stuff == "!":
|
| 690 |
+
# Negated empty range: match any character.
|
| 691 |
+
add(".")
|
| 692 |
+
else:
|
| 693 |
+
if stuff[0] == "!":
|
| 694 |
+
stuff = "^" + stuff[1:]
|
| 695 |
+
elif stuff[0] in ("^", "["):
|
| 696 |
+
stuff = "\\" + stuff
|
| 697 |
+
add(f"[{stuff}]")
|
| 698 |
+
else:
|
| 699 |
+
add(re.escape(c))
|
| 700 |
+
assert i == n
|
| 701 |
+
return res
|
| 702 |
+
|
| 703 |
+
|
| 704 |
+
def glob_translate(pat):
|
| 705 |
+
# Copied from: https://github.com/python/cpython/pull/106703.
|
| 706 |
+
# The keyword parameters' values are fixed to:
|
| 707 |
+
# recursive=True, include_hidden=True, seps=None
|
| 708 |
+
"""Translate a pathname with shell wildcards to a regular expression."""
|
| 709 |
+
if os.path.altsep:
|
| 710 |
+
seps = os.path.sep + os.path.altsep
|
| 711 |
+
else:
|
| 712 |
+
seps = os.path.sep
|
| 713 |
+
escaped_seps = "".join(map(re.escape, seps))
|
| 714 |
+
any_sep = f"[{escaped_seps}]" if len(seps) > 1 else escaped_seps
|
| 715 |
+
not_sep = f"[^{escaped_seps}]"
|
| 716 |
+
one_last_segment = f"{not_sep}+"
|
| 717 |
+
one_segment = f"{one_last_segment}{any_sep}"
|
| 718 |
+
any_segments = f"(?:.+{any_sep})?"
|
| 719 |
+
any_last_segments = ".*"
|
| 720 |
+
results = []
|
| 721 |
+
parts = re.split(any_sep, pat)
|
| 722 |
+
last_part_idx = len(parts) - 1
|
| 723 |
+
for idx, part in enumerate(parts):
|
| 724 |
+
if part == "*":
|
| 725 |
+
results.append(one_segment if idx < last_part_idx else one_last_segment)
|
| 726 |
+
continue
|
| 727 |
+
if part == "**":
|
| 728 |
+
results.append(any_segments if idx < last_part_idx else any_last_segments)
|
| 729 |
+
continue
|
| 730 |
+
elif "**" in part:
|
| 731 |
+
raise ValueError(
|
| 732 |
+
"Invalid pattern: '**' can only be an entire path component"
|
| 733 |
+
)
|
| 734 |
+
if part:
|
| 735 |
+
results.extend(_translate(part, f"{not_sep}*", not_sep))
|
| 736 |
+
if idx < last_part_idx:
|
| 737 |
+
results.append(any_sep)
|
| 738 |
+
res = "".join(results)
|
| 739 |
+
return rf"(?s:{res})\Z"
|
.venv/lib/python3.11/site-packages/functorch/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (907 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/functorch/_src/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/functorch/_src/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (187 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file has moved to under torch/_functorch. It is not public API.
|
| 2 |
+
# If you are not a PyTorch developer and you are relying on the following
|
| 3 |
+
# imports, please file an issue.
|
| 4 |
+
from torch._functorch.aot_autograd import (
|
| 5 |
+
aot_autograd_decompositions,
|
| 6 |
+
KNOWN_TYPES,
|
| 7 |
+
PytreeThunk,
|
| 8 |
+
)
|
.venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (375 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file has moved to under torch/_functorch. It is not public API.
|
| 2 |
+
# If you are not a PyTorch developer and you are relying on the following
|
| 3 |
+
# imports, please file an issue.
|
| 4 |
+
from torch._functorch.eager_transforms import (
|
| 5 |
+
_assert_wrapped_functional,
|
| 6 |
+
_unwrap_functional_tensor,
|
| 7 |
+
)
|
.venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (364 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/functorch/_src/make_functional/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file has moved to under torch/_functorch. It is not public API.
|
| 2 |
+
# If you are not a PyTorch developer and you are relying on the following
|
| 3 |
+
# imports, please file an issue.
|
| 4 |
+
from torch._functorch.make_functional import _swap_state
|
.venv/lib/python3.11/site-packages/functorch/_src/make_functional/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (287 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/functorch/_src/vmap/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file has moved to under torch/_functorch. It is not public API.
|
| 2 |
+
# If you are not a PyTorch developer and you are relying on the following
|
| 3 |
+
# imports, please file an issue.
|
| 4 |
+
from torch._functorch.vmap import (
|
| 5 |
+
_add_batch_dim,
|
| 6 |
+
_broadcast_to_and_flatten,
|
| 7 |
+
_create_batched_inputs,
|
| 8 |
+
_get_name,
|
| 9 |
+
_process_batched_inputs,
|
| 10 |
+
_remove_batch_dim,
|
| 11 |
+
_unwrap_batched,
|
| 12 |
+
_validate_and_get_batch_size,
|
| 13 |
+
Tensor,
|
| 14 |
+
tree_flatten,
|
| 15 |
+
tree_unflatten,
|
| 16 |
+
)
|
.venv/lib/python3.11/site-packages/functorch/_src/vmap/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (663 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/functorch/compile/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from torch._functorch import config
|
| 2 |
+
from torch._functorch.aot_autograd import (
|
| 3 |
+
aot_function,
|
| 4 |
+
aot_module,
|
| 5 |
+
aot_module_simplified,
|
| 6 |
+
compiled_function,
|
| 7 |
+
compiled_module,
|
| 8 |
+
get_aot_compilation_context,
|
| 9 |
+
get_aot_graph_name,
|
| 10 |
+
get_graph_being_compiled,
|
| 11 |
+
make_boxed_compiler,
|
| 12 |
+
make_boxed_func,
|
| 13 |
+
)
|
| 14 |
+
from torch._functorch.compilers import (
|
| 15 |
+
debug_compile,
|
| 16 |
+
default_decompositions,
|
| 17 |
+
draw_graph_compile,
|
| 18 |
+
memory_efficient_fusion,
|
| 19 |
+
nnc_jit,
|
| 20 |
+
nop,
|
| 21 |
+
print_compile,
|
| 22 |
+
ts_compile,
|
| 23 |
+
)
|
| 24 |
+
from torch._functorch.fx_minifier import minifier
|
| 25 |
+
from torch._functorch.partitioners import (
|
| 26 |
+
default_partition,
|
| 27 |
+
draw_graph,
|
| 28 |
+
min_cut_rematerialization_partition,
|
| 29 |
+
)
|
| 30 |
+
from torch._functorch.python_key import pythonkey_decompose
|
.venv/lib/python3.11/site-packages/functorch/compile/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (1.39 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/functorch/dim/__init__.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import dis
|
| 2 |
+
import inspect
|
| 3 |
+
from typing import Sequence, Union
|
| 4 |
+
|
| 5 |
+
import functorch._C
|
| 6 |
+
import torch
|
| 7 |
+
from functorch._C import dim as _C
|
| 8 |
+
|
| 9 |
+
from .tree_map import tree_flatten, tree_map
|
| 10 |
+
from .wrap_type import wrap_type
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
_C._patch_tensor_class()
|
| 14 |
+
dims, DimList, dimlists = _C.dims, _C.DimList, _C.dimlists
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class DimensionMismatchError(Exception):
|
| 18 |
+
pass
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class DimensionBindError(Exception):
|
| 22 |
+
pass
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
from . import op_properties
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# use dict to avoid writing C++ bindings for set
|
| 29 |
+
pointwise = dict.fromkeys(op_properties.pointwise, True)
|
| 30 |
+
|
| 31 |
+
use_c = True
|
| 32 |
+
if not use_c:
|
| 33 |
+
from . import reference
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class _Tensor:
|
| 37 |
+
# fast path around slow wrapping/unwrapping logic for simply queries used
|
| 38 |
+
# by the implementation...
|
| 39 |
+
|
| 40 |
+
@property
|
| 41 |
+
def dims(self):
|
| 42 |
+
return tuple(d for d in self._levels if isinstance(d, Dim))
|
| 43 |
+
|
| 44 |
+
def dim(self):
|
| 45 |
+
return self.ndim
|
| 46 |
+
|
| 47 |
+
if use_c:
|
| 48 |
+
__torch_function__ = classmethod(_C.__torch_function__)
|
| 49 |
+
expand = _C._instancemethod(_C.expand)
|
| 50 |
+
else:
|
| 51 |
+
__torch_function__ = reference.__torch_function__
|
| 52 |
+
expand = reference.expand
|
| 53 |
+
|
| 54 |
+
index = _C._instancemethod(_C.index)
|
| 55 |
+
|
| 56 |
+
def __repr__(self):
|
| 57 |
+
tensor, levels, ndim = self._tensor, self._levels, self.ndim
|
| 58 |
+
return f"{tensor}\nwith dims={tuple(l + ndim if isinstance(l, int) else l for l in levels)} sizes={tuple(tensor.size())}"
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
TensorLike = (_Tensor, torch.Tensor)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class Dim(_C.Dim, _Tensor):
|
| 65 |
+
# note that _C.Dim comes before tensor because we want the Dim API for things like size to take precendence.
|
| 66 |
+
# Tensor defines format, but we want to print Dims with special formatting
|
| 67 |
+
__format__ = object.__format__
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class Tensor(_Tensor, _C.Tensor):
|
| 71 |
+
if not use_c:
|
| 72 |
+
from_batched = staticmethod(_C.Tensor_from_batched)
|
| 73 |
+
from_positional = staticmethod(_C.Tensor_from_positional)
|
| 74 |
+
sum = _C._instancemethod(_C.Tensor_sum)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def cat(tensors, dim, new_dim):
|
| 78 |
+
n = dims()
|
| 79 |
+
return stack(tensors, n, dim).index([n, dim], new_dim)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
if use_c:
|
| 83 |
+
_wrap = _C._wrap
|
| 84 |
+
|
| 85 |
+
def _def(name, *args, **kwargs):
|
| 86 |
+
orig = getattr(torch.Tensor, name)
|
| 87 |
+
setattr(_Tensor, name, _C._instancemethod(_wrap(orig, *args, **kwargs)))
|
| 88 |
+
|
| 89 |
+
t__getitem__ = _C._instancemethod(_C.__getitem__)
|
| 90 |
+
stack = _C.stack
|
| 91 |
+
split = _C._instancemethod(_C.split)
|
| 92 |
+
else:
|
| 93 |
+
_wrap, _def = reference._wrap, reference._def
|
| 94 |
+
t__getitem__ = reference.t__getitem__
|
| 95 |
+
stack = reference.stack
|
| 96 |
+
split = reference.split
|
| 97 |
+
|
| 98 |
+
# note: there is no python reference
|
| 99 |
+
t__setitem__ = _C._instancemethod(_C.__setitem__)
|
| 100 |
+
# this is patched in the C API because otherwise torch.Tensor will
|
| 101 |
+
# no longer be considered a sequence and things will break
|
| 102 |
+
# torch.Tensor.__getitem__ = t__getitem__
|
| 103 |
+
|
| 104 |
+
_Tensor.__getitem__ = t__getitem__
|
| 105 |
+
# torch.Tensor.__setitem__ = t__setitem__
|
| 106 |
+
_Tensor.__setitem__ = t__setitem__
|
| 107 |
+
|
| 108 |
+
torch.Tensor.split = split
|
| 109 |
+
_Tensor.split = split
|
| 110 |
+
torch.Tensor.expand = _C._instancemethod(_C.expand)
|
| 111 |
+
torch.Tensor.index = _C._instancemethod(_C.index)
|
| 112 |
+
wrap_type(use_c, _Tensor, torch.Tensor, _Tensor.__torch_function__)
|
| 113 |
+
del _Tensor.ndim
|
| 114 |
+
|
| 115 |
+
if use_c:
|
| 116 |
+
_Tensor.order = _C._instancemethod(_C.order)
|
| 117 |
+
else:
|
| 118 |
+
_Tensor.order = reference.positional
|
| 119 |
+
|
| 120 |
+
_def("mean")
|
| 121 |
+
_def("sum")
|
| 122 |
+
_def("all")
|
| 123 |
+
_def("amax")
|
| 124 |
+
_def("amin")
|
| 125 |
+
_def("aminmax")
|
| 126 |
+
_def("any")
|
| 127 |
+
_def("count_nonzero")
|
| 128 |
+
_def("logsumexp")
|
| 129 |
+
_def("nanmean")
|
| 130 |
+
_def("nansum")
|
| 131 |
+
_def("prod")
|
| 132 |
+
_def("std", keepdim_offset=2)
|
| 133 |
+
_def("var", keepdim_offset=2)
|
| 134 |
+
_def("max", single_dim=True)
|
| 135 |
+
_def("min", single_dim=True)
|
| 136 |
+
_def("argmax", single_dim=True)
|
| 137 |
+
_def("argmin", single_dim=True)
|
| 138 |
+
_def("kthvalue", single_dim=True)
|
| 139 |
+
_def("median", single_dim=True)
|
| 140 |
+
_def("nanmedian", single_dim=True)
|
| 141 |
+
_def("mode", single_dim=True)
|
| 142 |
+
_def("sort", reduce=False)
|
| 143 |
+
_def("argsort", reduce=False)
|
| 144 |
+
_def("unbind", single_dim=True)
|
| 145 |
+
_def("chunk", dim_offset=1, reduce=False)
|
| 146 |
+
_def("cummax", single_dim=True, reduce=False)
|
| 147 |
+
_def("cummin", single_dim=True, reduce=False)
|
| 148 |
+
_def("cumprod", single_dim=True, reduce=False)
|
| 149 |
+
_def("cumprod_", single_dim=True, reduce=False)
|
| 150 |
+
_def("cumsum", single_dim=True, reduce=False)
|
| 151 |
+
_def("cumsum_", single_dim=True, reduce=False)
|
| 152 |
+
_def("logcumsumexp", single_dim=True, reduce=False)
|
| 153 |
+
_def("renorm", dim_offset=1, single_dim=True, reduce=False)
|
| 154 |
+
_def("softmax", single_dim=True, reduce=False)
|
| 155 |
+
softmax = _wrap(torch.nn.functional.softmax, single_dim=True, reduce=False)
|
| 156 |
+
|
| 157 |
+
# stuff to handle in the future, because they require special
|
| 158 |
+
# binding logic for dims
|
| 159 |
+
# cross
|
| 160 |
+
# diag_embed
|
| 161 |
+
# diagonal
|
| 162 |
+
# diagonal_scatter
|
| 163 |
+
# diff
|
| 164 |
+
# nanquantile
|
| 165 |
+
# quantile
|
| 166 |
+
# roll
|
| 167 |
+
# rot90
|
| 168 |
+
# topk (new dimes on output)
|
| 169 |
+
# should these all be subsumed by inplace indexing?
|
| 170 |
+
# index_add_
|
| 171 |
+
# index_add
|
| 172 |
+
# index_copy
|
| 173 |
+
# index_copy_
|
| 174 |
+
# index_fill
|
| 175 |
+
# index_fill_
|
| 176 |
+
# index_select
|
| 177 |
+
# scatter
|
| 178 |
+
# scatter_
|
| 179 |
+
# scatter_add
|
| 180 |
+
# scatter_add_
|
| 181 |
+
# scatter_reduce
|
.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (7.86 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/batch_tensor.cpython-311.pyc
ADDED
|
Binary file (1.25 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/delayed_mul_tensor.cpython-311.pyc
ADDED
|
Binary file (5.57 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/dim.cpython-311.pyc
ADDED
|
Binary file (7.06 kB). View file
|
|
|