koichi12 commited on
Commit
3914b7f
·
verified ·
1 Parent(s): 2311118

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/fsspec/__init__.py +69 -0
  2. .venv/lib/python3.11/site-packages/fsspec/_version.py +16 -0
  3. .venv/lib/python3.11/site-packages/fsspec/archive.py +75 -0
  4. .venv/lib/python3.11/site-packages/fsspec/asyn.py +1098 -0
  5. .venv/lib/python3.11/site-packages/fsspec/caching.py +966 -0
  6. .venv/lib/python3.11/site-packages/fsspec/callbacks.py +324 -0
  7. .venv/lib/python3.11/site-packages/fsspec/compression.py +175 -0
  8. .venv/lib/python3.11/site-packages/fsspec/config.py +131 -0
  9. .venv/lib/python3.11/site-packages/fsspec/conftest.py +55 -0
  10. .venv/lib/python3.11/site-packages/fsspec/core.py +743 -0
  11. .venv/lib/python3.11/site-packages/fsspec/dircache.py +98 -0
  12. .venv/lib/python3.11/site-packages/fsspec/exceptions.py +18 -0
  13. .venv/lib/python3.11/site-packages/fsspec/fuse.py +324 -0
  14. .venv/lib/python3.11/site-packages/fsspec/generic.py +411 -0
  15. .venv/lib/python3.11/site-packages/fsspec/gui.py +416 -0
  16. .venv/lib/python3.11/site-packages/fsspec/implementations/arrow.py +304 -0
  17. .venv/lib/python3.11/site-packages/fsspec/implementations/dask.py +152 -0
  18. .venv/lib/python3.11/site-packages/fsspec/implementations/dbfs.py +467 -0
  19. .venv/lib/python3.11/site-packages/fsspec/implementations/dirfs.py +384 -0
  20. .venv/lib/python3.11/site-packages/fsspec/implementations/jupyter.py +124 -0
  21. .venv/lib/python3.11/site-packages/fsspec/implementations/local.py +476 -0
  22. .venv/lib/python3.11/site-packages/fsspec/implementations/reference.py +1306 -0
  23. .venv/lib/python3.11/site-packages/fsspec/implementations/sftp.py +180 -0
  24. .venv/lib/python3.11/site-packages/fsspec/implementations/tar.py +124 -0
  25. .venv/lib/python3.11/site-packages/fsspec/implementations/webhdfs.py +485 -0
  26. .venv/lib/python3.11/site-packages/fsspec/json.py +121 -0
  27. .venv/lib/python3.11/site-packages/fsspec/mapping.py +251 -0
  28. .venv/lib/python3.11/site-packages/fsspec/parquet.py +541 -0
  29. .venv/lib/python3.11/site-packages/fsspec/registry.py +315 -0
  30. .venv/lib/python3.11/site-packages/fsspec/spec.py +2242 -0
  31. .venv/lib/python3.11/site-packages/fsspec/transaction.py +90 -0
  32. .venv/lib/python3.11/site-packages/fsspec/utils.py +739 -0
  33. .venv/lib/python3.11/site-packages/functorch/__pycache__/__init__.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/functorch/_src/__init__.py +0 -0
  35. .venv/lib/python3.11/site-packages/functorch/_src/__pycache__/__init__.cpython-311.pyc +0 -0
  36. .venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__init__.py +8 -0
  37. .venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__pycache__/__init__.cpython-311.pyc +0 -0
  38. .venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__init__.py +7 -0
  39. .venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__pycache__/__init__.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/functorch/_src/make_functional/__init__.py +4 -0
  41. .venv/lib/python3.11/site-packages/functorch/_src/make_functional/__pycache__/__init__.cpython-311.pyc +0 -0
  42. .venv/lib/python3.11/site-packages/functorch/_src/vmap/__init__.py +16 -0
  43. .venv/lib/python3.11/site-packages/functorch/_src/vmap/__pycache__/__init__.cpython-311.pyc +0 -0
  44. .venv/lib/python3.11/site-packages/functorch/compile/__init__.py +30 -0
  45. .venv/lib/python3.11/site-packages/functorch/compile/__pycache__/__init__.cpython-311.pyc +0 -0
  46. .venv/lib/python3.11/site-packages/functorch/dim/__init__.py +181 -0
  47. .venv/lib/python3.11/site-packages/functorch/dim/__pycache__/__init__.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/functorch/dim/__pycache__/batch_tensor.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/functorch/dim/__pycache__/delayed_mul_tensor.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/functorch/dim/__pycache__/dim.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/fsspec/__init__.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from importlib.metadata import entry_points
2
+
3
+ from . import caching
4
+ from ._version import __version__ # noqa: F401
5
+ from .callbacks import Callback
6
+ from .compression import available_compressions
7
+ from .core import get_fs_token_paths, open, open_files, open_local, url_to_fs
8
+ from .exceptions import FSTimeoutError
9
+ from .mapping import FSMap, get_mapper
10
+ from .registry import (
11
+ available_protocols,
12
+ filesystem,
13
+ get_filesystem_class,
14
+ register_implementation,
15
+ registry,
16
+ )
17
+ from .spec import AbstractFileSystem
18
+
19
+ __all__ = [
20
+ "AbstractFileSystem",
21
+ "FSTimeoutError",
22
+ "FSMap",
23
+ "filesystem",
24
+ "register_implementation",
25
+ "get_filesystem_class",
26
+ "get_fs_token_paths",
27
+ "get_mapper",
28
+ "open",
29
+ "open_files",
30
+ "open_local",
31
+ "registry",
32
+ "caching",
33
+ "Callback",
34
+ "available_protocols",
35
+ "available_compressions",
36
+ "url_to_fs",
37
+ ]
38
+
39
+
40
def process_entries():
    """Register filesystem implementations advertised via the
    ``fsspec.specs`` entry-point group.

    The first entry seen for a given name wins; later duplicates are
    skipped. Registration passes clobber=True so these implementations
    override any same-named ones already in the registry.
    """
    if entry_points is None:
        return
    try:
        eps = entry_points()
    except TypeError:
        return  # importlib-metadata < 0.8
    if hasattr(eps, "select"):  # Python 3.10+ / importlib_metadata >= 3.9.0
        specs = eps.select(group="fsspec.specs")
    else:
        specs = eps.get("fsspec.specs", [])
    seen = set()
    for spec in specs:
        name = spec.name
        if name in seen:
            continue
        seen.add(name)
        register_implementation(
            name,
            spec.value.replace(":", "."),
            errtxt=f"Unable to load filesystem from {spec}",
            # We take our implementations as the ones to overload with if
            # for some reason we encounter some, may be the same, already
            # registered
            clobber=True,
        )


process_entries()
.venv/lib/python3.11/site-packages/fsspec/_version.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# file generated by setuptools_scm
# don't change, don't track in version control

# TYPE_CHECKING is hard-coded False so the typing import never runs at
# runtime; static type checkers still see VERSION_TUPLE as a real alias.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple, Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
else:
    VERSION_TUPLE = object

# Annotations only; the actual values are assigned below.
version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE

__version__ = version = '2025.2.0'
__version_tuple__ = version_tuple = (2025, 2, 0)
.venv/lib/python3.11/site-packages/fsspec/archive.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import operator
2
+
3
+ from fsspec import AbstractFileSystem
4
+ from fsspec.utils import tokenize
5
+
6
+
7
class AbstractArchiveFileSystem(AbstractFileSystem):
    """
    A generic superclass for implementing Archive-based filesystems.

    Currently, it is shared amongst
    :class:`~fsspec.implementations.zip.ZipFileSystem`,
    :class:`~fsspec.implementations.libarchive.LibArchiveFileSystem` and
    :class:`~fsspec.implementations.tar.TarFileSystem`.

    Subclasses are expected to provide ``self.fo``, ``self.dir_cache`` and
    ``self._get_dirs()`` (not defined here) — TODO confirm against the
    concrete implementations.
    """

    def __str__(self):
        # Identity-based representation; archive FSs are not meaningfully
        # comparable by path alone.
        return f"<Archive-like object {type(self).__name__} at {id(self)}>"

    __repr__ = __str__

    def ukey(self, path):
        # Unique key combines the path with the archive handle and protocol,
        # so the same member in two archives hashes differently.
        return tokenize(path, self.fo, self.protocol)

    def _all_dirnames(self, paths):
        """Returns *all* directory names for each path in paths, including intermediate
        ones.

        Parameters
        ----------
        paths: Iterable of path strings
        """
        if len(paths) == 0:
            return set()

        # Parents of the given paths, excluding the archive root, then
        # recurse to collect every ancestor level.
        dirnames = {self._parent(path) for path in paths} - {self.root_marker}
        return dirnames | self._all_dirnames(dirnames)

    def info(self, path, **kwargs):
        """Return the cached entry for ``path``.

        Raises FileNotFoundError when the path is in neither plain nor
        trailing-slash form in the directory cache.
        """
        self._get_dirs()
        path = self._strip_protocol(path)
        if path in {"", "/"} and self.dir_cache:
            # Synthetic root entry; the archive itself has no record for it.
            return {"name": "", "type": "directory", "size": 0}
        if path in self.dir_cache:
            return self.dir_cache[path]
        elif path + "/" in self.dir_cache:
            # Some archive formats store directories with a trailing slash.
            return self.dir_cache[path + "/"]
        else:
            raise FileNotFoundError(path)

    def ls(self, path, detail=True, **kwargs):
        """List the direct children of ``path`` from the directory cache.

        Returns detail dicts sorted by name when ``detail`` is true,
        otherwise a sorted list of names.
        """
        self._get_dirs()
        paths = {}
        for p, f in self.dir_cache.items():
            p = p.rstrip("/")
            if "/" in p:
                root = p.rsplit("/", 1)[0]
            else:
                root = ""
            if root == path.rstrip("/"):
                # Direct child: keep its full info record.
                paths[p] = f
            elif all(
                (a == b)
                for a, b in zip(path.split("/"), [""] + p.strip("/").split("/"))
            ):
                # root directory entry: p lies under path but deeper; emit a
                # synthetic directory entry for the first path component.
                ppath = p.rstrip("/").split("/", 1)[0]
                if ppath not in paths:
                    out = {"name": ppath, "size": 0, "type": "directory"}
                    paths[ppath] = out
        if detail:
            out = sorted(paths.values(), key=operator.itemgetter("name"))
            return out
        else:
            return sorted(paths)
.venv/lib/python3.11/site-packages/fsspec/asyn.py ADDED
@@ -0,0 +1,1098 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import asyncio.events
3
+ import functools
4
+ import inspect
5
+ import io
6
+ import numbers
7
+ import os
8
+ import re
9
+ import threading
10
+ from contextlib import contextmanager
11
+ from glob import has_magic
12
+ from typing import TYPE_CHECKING, Iterable
13
+
14
+ from .callbacks import DEFAULT_CALLBACK
15
+ from .exceptions import FSTimeoutError
16
+ from .implementations.local import LocalFileSystem, make_path_posix, trailing_sep
17
+ from .spec import AbstractBufferedFile, AbstractFileSystem
18
+ from .utils import glob_translate, is_exception, other_paths
19
+
20
# Matches single-underscore (non-dunder) attribute names.
private = re.compile("_[^_]")
iothread = [None]  # dedicated fsspec IO thread (single-element mutable cell)
loop = [None]  # global event loop for any non-async instance
_lock = None  # global lock placeholder; allocated lazily by get_lock()
get_running_loop = asyncio.get_running_loop  # module-level alias
25
+
26
+
27
def get_lock():
    """Allocate or return a threading lock.

    The lock is allocated on first use to allow setting one lock per
    forked process.
    """
    global _lock
    if _lock is None:
        _lock = threading.Lock()
    return _lock
36
+
37
+
38
def reset_lock():
    """Reset the global lock.

    This should be called only on the init of a forked process to reset the
    lock to None, enabling the new forked process to get a new lock.
    Also clears the cached IO loop and its thread for the same reason.
    """
    global _lock
    _lock = None
    loop[0] = None
    iothread[0] = None
49
+
50
+
51
+ async def _runner(event, coro, result, timeout=None):
52
+ timeout = timeout if timeout else None # convert 0 or 0.0 to None
53
+ if timeout is not None:
54
+ coro = asyncio.wait_for(coro, timeout=timeout)
55
+ try:
56
+ result[0] = await coro
57
+ except Exception as ex:
58
+ result[0] = ex
59
+ finally:
60
+ event.set()
61
+
62
+
63
def sync(loop, func, *args, timeout=None, **kwargs):
    """
    Make loop run coroutine until it returns. Runs in other thread

    Examples
    --------
    >>> fsspec.asyn.sync(fsspec.asyn.get_loop(), func, *args,
    timeout=timeout, **kwargs)
    """
    timeout = timeout if timeout else None  # convert 0 or 0.0 to None
    # NB: if the loop is not running *yet*, it is OK to submit work
    # and we will wait for it
    if loop is None or loop.is_closed():
        raise RuntimeError("Loop is not running")
    try:
        # Guard against deadlock: submitting to the loop we are currently
        # running on and then blocking would never complete.
        loop0 = asyncio.events.get_running_loop()
        if loop0 is loop:
            raise NotImplementedError("Calling sync() from within a running loop")
    except NotImplementedError:
        raise
    except RuntimeError:
        # No running loop in this thread — the normal, safe case.
        pass
    coro = func(*args, **kwargs)
    result = [None]
    event = threading.Event()
    asyncio.run_coroutine_threadsafe(_runner(event, coro, result, timeout), loop)
    while True:
        # this loops allows thread to get interrupted
        if event.wait(1):
            break
        if timeout is not None:
            # Coarse 1-second countdown mirroring the event.wait(1) above.
            timeout -= 1
            if timeout < 0:
                raise FSTimeoutError

    return_result = result[0]
    if isinstance(return_result, asyncio.TimeoutError):
        # suppress asyncio.TimeoutError, raise FSTimeoutError
        raise FSTimeoutError from return_result
    elif isinstance(return_result, BaseException):
        raise return_result
    else:
        return return_result
106
+
107
+
108
def sync_wrapper(func, obj=None):
    """Given a function, make so can be called in blocking contexts

    Leave obj=None if defining within a class. Pass the instance if attaching
    as an attribute of the instance.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Either the explicitly bound instance, or the implicit `self`
        # when used as a method.
        target = obj or args[0]
        return sync(target.loop, func, *args, **kwargs)

    return wrapper
121
+
122
+
123
+ @contextmanager
124
+ def _selector_policy():
125
+ original_policy = asyncio.get_event_loop_policy()
126
+ try:
127
+ if os.name == "nt" and hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
128
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
129
+
130
+ yield
131
+ finally:
132
+ asyncio.set_event_loop_policy(original_policy)
133
+
134
+
135
def get_loop():
    """Create or return the default fsspec IO loop

    The loop will be running on a separate thread.
    """
    # Double-checked locking: cheap unlocked read first, then re-check
    # under the lock before creating the loop.
    if loop[0] is None:
        with get_lock():
            # repeat the check just in case the loop got filled between the
            # previous two calls from another thread
            if loop[0] is None:
                with _selector_policy():
                    loop[0] = asyncio.new_event_loop()
                th = threading.Thread(target=loop[0].run_forever, name="fsspecIO")
                # Daemon thread so the interpreter can exit without joining it.
                th.daemon = True
                th.start()
                iothread[0] = th
    return loop[0]
152
+
153
+
154
# `resource` is POSIX-only; fall back gracefully elsewhere so the
# batch-size heuristic below still works.
if TYPE_CHECKING:
    import resource

    ResourceError = resource.error
else:
    try:
        import resource
    except ImportError:
        resource = None
        ResourceError = OSError
    else:
        # Older Pythons expose resource.error; OSError is its modern alias.
        ResourceError = getattr(resource, "error", OSError)

# Default concurrency limits for _run_coros_in_chunks.
_DEFAULT_BATCH_SIZE = 128
_NOFILES_DEFAULT_BATCH_SIZE = 1280
169
+
170
+
171
def _get_batch_size(nofiles=False):
    """Infer a default batch size for concurrent coroutine execution.

    Config keys take precedence; operations that open no local files get a
    larger fixed default, otherwise the size is derived from the process's
    open-file soft limit (-1 meaning unlimited).
    """
    from fsspec.config import conf

    config_key = "nofiles_gather_batch_size" if nofiles else "gather_batch_size"
    if config_key in conf:
        return conf[config_key]
    if nofiles:
        return _NOFILES_DEFAULT_BATCH_SIZE
    if resource is None:
        return _DEFAULT_BATCH_SIZE

    try:
        soft_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
    except (ImportError, ValueError, ResourceError):
        return _DEFAULT_BATCH_SIZE

    # Keep well below the descriptor limit; unlimited means no throttling.
    return -1 if soft_limit == resource.RLIM_INFINITY else soft_limit // 8
194
+
195
+
196
def running_async() -> bool:
    """Being executed by an event loop?"""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread.
        return False
    return True
203
+
204
+
205
async def _run_coros_in_chunks(
    coros,
    batch_size=None,
    callback=DEFAULT_CALLBACK,
    timeout=None,
    return_exceptions=False,
    nofiles=False,
):
    """Run the given coroutines in chunks.

    Parameters
    ----------
    coros: list of coroutines to run
    batch_size: int or None
        Number of coroutines to submit/wait on simultaneously.
        If -1, then it will not be any throttling. If
        None, it will be inferred from _get_batch_size()
    callback: fsspec.callbacks.Callback instance
        Gets a relative_update when each coroutine completes
    timeout: number or None
        If given, each coroutine times out after this time. Note that, since
        there are multiple batches, the total run time of this function will in
        general be longer
    return_exceptions: bool
        Same meaning as in asyncio.gather
    nofiles: bool
        If inferring the batch_size, does this operation involve local files?
        If yes, you normally expect smaller batches.
    """

    if batch_size is None:
        batch_size = _get_batch_size(nofiles=nofiles)

    if batch_size == -1:
        # No throttling: submit everything at once.
        batch_size = len(coros)

    assert batch_size > 0

    async def _run_coro(coro, i):
        # Wrap each coroutine to carry its original index, so results can be
        # placed back in input order even though completion order differs.
        try:
            return await asyncio.wait_for(coro, timeout=timeout), i
        except Exception as e:
            if not return_exceptions:
                raise
            return e, i
        finally:
            callback.relative_update(1)

    i = 0
    n = len(coros)
    results = [None] * n
    pending = set()

    # Sliding-window scheduler: keep at most batch_size tasks in flight,
    # topping the window up as each one completes.
    while pending or i < n:
        while len(pending) < batch_size and i < n:
            pending.add(asyncio.ensure_future(_run_coro(coros[i], i)))
            i += 1

        if not pending:
            break

        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        while done:
            # Awaiting a finished future just retrieves its (result, index).
            result, k = await done.pop()
            results[k] = result

    return results
272
+
273
+
274
# these methods should be implemented as async by any async-able backend
# (the sync mirrors without the leading underscore are generated from them).
async_methods = [
    "_ls",
    "_cat_file",
    "_get_file",
    "_put_file",
    "_rm_file",
    "_cp_file",
    "_pipe_file",
    "_expand_path",
    "_info",
    "_isfile",
    "_isdir",
    "_exists",
    "_walk",
    "_glob",
    "_find",
    "_du",
    "_size",
    "_mkdir",
    "_makedirs",
]
296
+
297
+
298
class AsyncFileSystem(AbstractFileSystem):
    """Async file operations, default implementations

    Passes bulk operations to asyncio.gather for concurrent operation.

    Implementations that have concurrent batch operations and/or async methods
    should inherit from this class instead of AbstractFileSystem. Docstrings are
    copied from the un-underscored method in AbstractFileSystem, if not given.
    """

    # note that methods do not have docstring here; they will be copied
    # for _* methods and inferred for overridden methods.

    async_impl = True  # marker: this backend's _-methods are coroutines
    mirror_sync_methods = True  # generate blocking wrappers for async methods
    disable_throttling = False
314
+
315
    def __init__(self, *args, asynchronous=False, loop=None, batch_size=None, **kwargs):
        # asynchronous=True means the caller is already inside an event loop
        # and will use the coroutine API directly; no dedicated loop is kept.
        self.asynchronous = asynchronous
        # Remember the creating process so the `loop` property can refuse
        # use after fork.
        self._pid = os.getpid()
        if not asynchronous:
            self._loop = loop or get_loop()
        else:
            self._loop = None
        self.batch_size = batch_size
        super().__init__(*args, **kwargs)
324
+
325
+ @property
326
+ def loop(self):
327
+ if self._pid != os.getpid():
328
+ raise RuntimeError("This class is not fork-safe")
329
+ return self._loop
330
+
331
    async def _rm_file(self, path, **kwargs):
        # Abstract hook: delete a single file; concrete backends override.
        raise NotImplementedError
333
+
334
    async def _rm(self, path, recursive=False, batch_size=None, **kwargs):
        # TODO: implement on_error
        batch_size = batch_size or self.batch_size
        path = await self._expand_path(path, recursive=recursive)
        # Reversed so deeper entries are removed before their parents.
        return await _run_coros_in_chunks(
            [self._rm_file(p, **kwargs) for p in reversed(path)],
            batch_size=batch_size,
            nofiles=True,
        )
343
+
344
    async def _cp_file(self, path1, path2, **kwargs):
        # Abstract hook: copy one remote file to another remote path;
        # concrete backends override.
        raise NotImplementedError
346
+
347
    async def _mv_file(self, path1, path2):
        # Move implemented as copy-then-delete; not atomic.
        await self._cp_file(path1, path2)
        await self._rm_file(path1)
350
+
351
    async def _copy(
        self,
        path1,
        path2,
        recursive=False,
        on_error=None,
        maxdepth=None,
        batch_size=None,
        **kwargs,
    ):
        """Copy file(s) within this filesystem, concurrently in batches.

        ``on_error`` defaults to "ignore" for recursive copies (missing
        sources are skipped) and "raise" otherwise.
        """
        if on_error is None and recursive:
            on_error = "ignore"
        elif on_error is None:
            on_error = "raise"

        if isinstance(path1, list) and isinstance(path2, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            paths1 = path1
            paths2 = path2
        else:
            source_is_str = isinstance(path1, str)
            paths1 = await self._expand_path(
                path1, maxdepth=maxdepth, recursive=recursive
            )
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                paths1 = [
                    p for p in paths1 if not (trailing_sep(p) or await self._isdir(p))
                ]
                if not paths1:
                    return

            source_is_file = len(paths1) == 1
            dest_is_dir = isinstance(path2, str) and (
                trailing_sep(path2) or await self._isdir(path2)
            )

            # Whether destination paths should be treated as already-existing
            # directories when computing target names.
            exists = source_is_str and (
                (has_magic(path1) and source_is_file)
                or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
            )
            paths2 = other_paths(
                paths1,
                path2,
                exists=exists,
                flatten=not source_is_str,
            )

        batch_size = batch_size or self.batch_size
        coros = [self._cp_file(p1, p2, **kwargs) for p1, p2 in zip(paths1, paths2)]
        result = await _run_coros_in_chunks(
            coros, batch_size=batch_size, return_exceptions=True, nofiles=True
        )

        for ex in filter(is_exception, result):
            # Only FileNotFoundError is suppressible; anything else raises.
            if on_error == "ignore" and isinstance(ex, FileNotFoundError):
                continue
            raise ex
410
+
411
    async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
        # Abstract hook: write bytes ``value`` to ``path``; backends override.
        raise NotImplementedError
413
+
414
    async def _pipe(self, path, value=None, batch_size=None, **kwargs):
        # Accept either a single path+value or a mapping {path: bytes}.
        if isinstance(path, str):
            path = {path: value}
        batch_size = batch_size or self.batch_size
        return await _run_coros_in_chunks(
            [self._pipe_file(k, v, **kwargs) for k, v in path.items()],
            batch_size=batch_size,
            nofiles=True,
        )
423
+
424
    async def _process_limits(self, url, start, end):
        """Helper for "Range"-based _cat_file

        Converts (start, end) byte offsets — possibly negative or None —
        into an HTTP Range header value such as ``bytes=0-99`` or the
        suffix form ``bytes=-100``.
        """
        size = None
        suff = False
        if start is not None and start < 0:
            # if start is negative and end None, end is the "suffix length"
            if end is None:
                end = -start
                start = ""  # empty start yields the "bytes=-N" suffix form
                suff = True
            else:
                # Negative start with explicit end: resolve against file size.
                size = size or (await self._info(url))["size"]
                start = size + start
        elif start is None:
            start = 0
        if not suff:
            if end is not None and end < 0:
                if start is not None:
                    size = size or (await self._info(url))["size"]
                    end = size + end
            elif end is None:
                end = ""  # open-ended range "bytes=N-"
            if isinstance(end, numbers.Integral):
                end -= 1  # bytes range is inclusive
        return f"bytes={start}-{end}"
449
+
450
    async def _cat_file(self, path, start=None, end=None, **kwargs):
        # Abstract hook: return the (ranged) contents of one file as bytes;
        # concrete backends override.
        raise NotImplementedError
452
+
453
    async def _cat(
        self, path, recursive=False, on_error="raise", batch_size=None, **kwargs
    ):
        """Fetch contents of one or many paths concurrently.

        Returns bytes for a single literal path, otherwise a dict of
        ``{path: bytes-or-exception}`` depending on ``on_error``
        ("raise" / "omit" / return the exception object).
        """
        paths = await self._expand_path(path, recursive=recursive)
        coros = [self._cat_file(path, **kwargs) for path in paths]
        batch_size = batch_size or self.batch_size
        out = await _run_coros_in_chunks(
            coros, batch_size=batch_size, nofiles=True, return_exceptions=True
        )
        if on_error == "raise":
            # Re-raise the first failure, if any.
            ex = next(filter(is_exception, out), False)
            if ex:
                raise ex
        if (
            len(paths) > 1
            or isinstance(path, list)
            or paths[0] != self._strip_protocol(path)
        ):
            # Input expanded to multiple entries (list/glob/dir): dict result.
            return {
                k: v
                for k, v in zip(paths, out)
                if on_error != "omit" or not is_exception(v)
            }
        else:
            return out[0]
478
+
479
    async def _cat_ranges(
        self,
        paths,
        starts,
        ends,
        max_gap=None,
        batch_size=None,
        on_error="return",
        **kwargs,
    ):
        """Get the contents of byte ranges from one or more files

        Parameters
        ----------
        paths: list
            A list of of filepaths on this filesystems
        starts, ends: int or list
            Bytes limits of the read. If using a single int, the same value will be
            used to read all the specified files.
        """
        # TODO: on_error
        if max_gap is not None:
            # use utils.merge_offset_ranges
            raise NotImplementedError
        if not isinstance(paths, list):
            raise TypeError
        # Broadcast scalar start/end values across all paths.
        if not isinstance(starts, Iterable):
            starts = [starts] * len(paths)
        if not isinstance(ends, Iterable):
            ends = [ends] * len(paths)
        if len(starts) != len(paths) or len(ends) != len(paths):
            raise ValueError
        coros = [
            self._cat_file(p, start=s, end=e, **kwargs)
            for p, s, e in zip(paths, starts, ends)
        ]
        batch_size = batch_size or self.batch_size
        # return_exceptions=True: failed ranges come back as exception objects.
        return await _run_coros_in_chunks(
            coros, batch_size=batch_size, nofiles=True, return_exceptions=True
        )
519
+
520
    async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs):
        # Abstract hook: upload one local file to the remote store;
        # concrete backends override.
        raise NotImplementedError
522
+
523
    async def _put(
        self,
        lpath,
        rpath,
        recursive=False,
        callback=DEFAULT_CALLBACK,
        batch_size=None,
        maxdepth=None,
        **kwargs,
    ):
        """Copy file(s) from local.

        Copies a specific file or tree of files (if recursive=True). If rpath
        ends with a "/", it will be assumed to be a directory, and target files
        will go within.

        The put_file method will be called concurrently on a batch of files. The
        batch_size option can configure the amount of futures that can be executed
        at the same time. If it is -1, then all the files will be uploaded concurrently.
        The default can be set for this instance by passing "batch_size" in the
        constructor, or for all instances by setting the "gather_batch_size" key
        in ``fsspec.config.conf``, falling back to 1/8th of the system limit .
        """
        if isinstance(lpath, list) and isinstance(rpath, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            rpaths = rpath
            lpaths = lpath
        else:
            source_is_str = isinstance(lpath, str)
            if source_is_str:
                lpath = make_path_posix(lpath)
            fs = LocalFileSystem()
            lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
                if not lpaths:
                    return

            source_is_file = len(lpaths) == 1
            dest_is_dir = isinstance(rpath, str) and (
                trailing_sep(rpath) or await self._isdir(rpath)
            )

            rpath = self._strip_protocol(rpath)
            # Whether rpath should be treated as an existing directory when
            # computing per-file destination names.
            exists = source_is_str and (
                (has_magic(lpath) and source_is_file)
                or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
            )
            rpaths = other_paths(
                lpaths,
                rpath,
                exists=exists,
                flatten=not source_is_str,
            )

        # Split sources into directories (created remotely first) and files.
        is_dir = {l: os.path.isdir(l) for l in lpaths}
        rdirs = [r for l, r in zip(lpaths, rpaths) if is_dir[l]]
        file_pairs = [(l, r) for l, r in zip(lpaths, rpaths) if not is_dir[l]]

        await asyncio.gather(*[self._makedirs(d, exist_ok=True) for d in rdirs])
        batch_size = batch_size or self.batch_size

        coros = []
        callback.set_size(len(file_pairs))
        for lfile, rfile in file_pairs:
            # branch_coro wires per-file progress into the parent callback.
            put_file = callback.branch_coro(self._put_file)
            coros.append(put_file(lfile, rfile, **kwargs))

        return await _run_coros_in_chunks(
            coros, batch_size=batch_size, callback=callback
        )
596
+
597
    async def _get_file(self, rpath, lpath, **kwargs):
        # Abstract hook: download one remote file to a local path;
        # concrete backends override.
        raise NotImplementedError
599
+
600
    async def _get(
        self,
        rpath,
        lpath,
        recursive=False,
        callback=DEFAULT_CALLBACK,
        maxdepth=None,
        **kwargs,
    ):
        """Copy file(s) to local.

        Copies a specific file or tree of files (if recursive=True). If lpath
        ends with a "/", it will be assumed to be a directory, and target files
        will go within. Can submit a list of paths, which may be glob-patterns
        and will be expanded.

        The get_file method will be called concurrently on a batch of files. The
        batch_size option can configure the amount of futures that can be executed
        at the same time. If it is -1, then all the files will be uploaded concurrently.
        The default can be set for this instance by passing "batch_size" in the
        constructor, or for all instances by setting the "gather_batch_size" key
        in ``fsspec.config.conf``, falling back to 1/8th of the system limit .
        """
        if isinstance(lpath, list) and isinstance(rpath, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            rpaths = rpath
            lpaths = lpath
        else:
            source_is_str = isinstance(rpath, str)
            # First check for rpath trailing slash as _strip_protocol removes it.
            source_not_trailing_sep = source_is_str and not trailing_sep(rpath)
            rpath = self._strip_protocol(rpath)
            rpaths = await self._expand_path(
                rpath, recursive=recursive, maxdepth=maxdepth
            )
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                rpaths = [
                    p for p in rpaths if not (trailing_sep(p) or await self._isdir(p))
                ]
                if not rpaths:
                    return

            lpath = make_path_posix(lpath)
            source_is_file = len(rpaths) == 1
            dest_is_dir = isinstance(lpath, str) and (
                trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
            )

            # Whether lpath should be treated as an existing directory when
            # computing per-file destination names.
            exists = source_is_str and (
                (has_magic(rpath) and source_is_file)
                or (not has_magic(rpath) and dest_is_dir and source_not_trailing_sep)
            )
            lpaths = other_paths(
                rpaths,
                lpath,
                exists=exists,
                flatten=not source_is_str,
            )

        # Side-effect comprehension: ensure each local parent directory exists.
        [os.makedirs(os.path.dirname(lp), exist_ok=True) for lp in lpaths]
        # NOTE(review): batch_size is taken from kwargs here, unlike _put
        # which accepts it as a named parameter.
        batch_size = kwargs.pop("batch_size", self.batch_size)

        coros = []
        callback.set_size(len(lpaths))
        for lpath, rpath in zip(lpaths, rpaths):
            # branch_coro wires per-file progress into the parent callback.
            get_file = callback.branch_coro(self._get_file)
            coros.append(get_file(rpath, lpath, **kwargs))
        return await _run_coros_in_chunks(
            coros, batch_size=batch_size, callback=callback
        )
672
+
673
+ async def _isfile(self, path):
674
+ try:
675
+ return (await self._info(path))["type"] == "file"
676
+ except: # noqa: E722
677
+ return False
678
+
679
+ async def _isdir(self, path):
680
+ try:
681
+ return (await self._info(path))["type"] == "directory"
682
+ except OSError:
683
+ return False
684
+
685
+ async def _size(self, path):
686
+ return (await self._info(path)).get("size", None)
687
+
688
+ async def _sizes(self, paths, batch_size=None):
689
+ batch_size = batch_size or self.batch_size
690
+ return await _run_coros_in_chunks(
691
+ [self._size(p) for p in paths], batch_size=batch_size
692
+ )
693
+
694
+ async def _exists(self, path, **kwargs):
695
+ try:
696
+ await self._info(path, **kwargs)
697
+ return True
698
+ except FileNotFoundError:
699
+ return False
700
+
701
    async def _info(self, path, **kwargs):
        """Give details of entry at path; must be overridden by subclasses.

        Expected to return a dict including at least "name", "size" and
        "type" keys, as consumed by ``_isfile``/``_isdir``/``_size``/``_du``.
        """
        raise NotImplementedError
703
+
704
    async def _ls(self, path, detail=True, **kwargs):
        """List objects at path; must be overridden by subclasses.

        With ``detail=True``, expected to return a list of info dicts
        (see ``_info``), as consumed by ``_walk``.
        """
        raise NotImplementedError
706
+
707
    async def _walk(self, path, maxdepth=None, on_error="omit", **kwargs):
        """Descend the directory tree, yielding one level at a time.

        Async generator yielding, for each directory visited, a tuple of
        (current path, dirs, files).  With ``detail=True`` in ``kwargs`` the
        last two elements are dicts mapping name -> info; otherwise they are
        plain lists of names.

        Parameters
        ----------
        path: str
            Root to descend from.
        maxdepth: int or None
            Maximum recursion depth; must be at least 1 if given.
        on_error: "omit" | "raise" | callable
            What to do when listing a directory fails: skip it silently,
            re-raise, or call the handler with the exception (then skip).
        kwargs: passed on to ``_ls``.
        """
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        path = self._strip_protocol(path)
        full_dirs = {}  # name -> full path; used for the recursion below
        dirs = {}  # name -> info of subdirectories
        files = {}  # name -> info of files

        detail = kwargs.pop("detail", False)
        try:
            listing = await self._ls(path, detail=True, **kwargs)
        except (FileNotFoundError, OSError) as e:
            if on_error == "raise":
                raise
            elif callable(on_error):
                on_error(e)
            # listing failed but was tolerated: yield an empty level and stop
            if detail:
                yield path, {}, {}
            else:
                yield path, [], []
            return

        for info in listing:
            # each info name must be at least [path]/part , but here
            # we check also for names like [path]/part/
            pathname = info["name"].rstrip("/")
            name = pathname.rsplit("/", 1)[-1]
            if info["type"] == "directory" and pathname != path:
                # do not include "self" path
                full_dirs[name] = pathname
                dirs[name] = info
            elif pathname == path:
                # file-like with same name as give path
                files[""] = info
            else:
                files[name] = info

        if detail:
            yield path, dirs, files
        else:
            yield path, list(dirs), list(files)

        if maxdepth is not None:
            maxdepth -= 1
            if maxdepth < 1:
                # depth limit reached: do not recurse further
                return

        for d in dirs:
            async for _ in self._walk(
                full_dirs[d], maxdepth=maxdepth, detail=detail, **kwargs
            ):
                yield _
760
+
761
    async def _glob(self, path, maxdepth=None, **kwargs):
        """Find files matching a glob-style pattern.

        Supports ``*``, ``?``, ``[...]`` and recursive ``**`` (the latter
        bounded only by ``maxdepth``).  Returns a sorted list of matching
        paths, or a dict of path -> info when ``detail=True`` is passed
        in ``kwargs``.
        """
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        import re

        seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
        ends_with_sep = path.endswith(seps)  # _strip_protocol strips trailing slash
        path = self._strip_protocol(path)
        # directories get a trailing slash appended for matching when the
        # pattern itself demanded one (trailing sep or ".../**")
        append_slash_to_dirname = ends_with_sep or path.endswith(
            tuple(sep + "**" for sep in seps)
        )
        # position of the first magic character (end of string if absent)
        idx_star = path.find("*") if path.find("*") >= 0 else len(path)
        idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
        idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

        min_idx = min(idx_star, idx_qmark, idx_brace)

        detail = kwargs.pop("detail", False)

        if not has_magic(path):
            # plain path: an existence check is all that is needed
            if await self._exists(path, **kwargs):
                if not detail:
                    return [path]
                else:
                    return {path: await self._info(path, **kwargs)}
            else:
                if not detail:
                    return []  # glob of non-existent returns empty
                else:
                    return {}
        elif "/" in path[:min_idx]:
            # split into a literal root to list from, plus pattern depth
            min_idx = path[:min_idx].rindex("/")
            root = path[: min_idx + 1]
            depth = path[min_idx + 1 :].count("/") + 1
        else:
            root = ""
            depth = path[min_idx + 1 :].count("/") + 1

        if "**" in path:
            if maxdepth is not None:
                idx_double_stars = path.find("**")
                depth_double_stars = path[idx_double_stars:].count("/") + 1
                depth = depth - depth_double_stars + maxdepth
            else:
                # unbounded recursive glob
                depth = None

        allpaths = await self._find(
            root, maxdepth=depth, withdirs=True, detail=True, **kwargs
        )

        pattern = glob_translate(path + ("/" if ends_with_sep else ""))
        pattern = re.compile(pattern)

        out = {
            p: info
            for p, info in sorted(allpaths.items())
            if pattern.match(
                p + "/"
                if append_slash_to_dirname and info["type"] == "directory"
                else p
            )
        }

        if detail:
            return out
        else:
            return list(out)
829
+
830
+ async def _du(self, path, total=True, maxdepth=None, **kwargs):
831
+ sizes = {}
832
+ # async for?
833
+ for f in await self._find(path, maxdepth=maxdepth, **kwargs):
834
+ info = await self._info(f)
835
+ sizes[info["name"]] = info["size"]
836
+ if total:
837
+ return sum(sizes.values())
838
+ else:
839
+ return sizes
840
+
841
+ async def _find(self, path, maxdepth=None, withdirs=False, **kwargs):
842
+ path = self._strip_protocol(path)
843
+ out = {}
844
+ detail = kwargs.pop("detail", False)
845
+
846
+ # Add the root directory if withdirs is requested
847
+ # This is needed for posix glob compliance
848
+ if withdirs and path != "" and await self._isdir(path):
849
+ out[path] = await self._info(path)
850
+
851
+ # async for?
852
+ async for _, dirs, files in self._walk(path, maxdepth, detail=True, **kwargs):
853
+ if withdirs:
854
+ files.update(dirs)
855
+ out.update({info["name"]: info for name, info in files.items()})
856
+ if not out and (await self._isfile(path)):
857
+ # walk works on directories, but find should also return [path]
858
+ # when path happens to be a file
859
+ out[path] = {}
860
+ names = sorted(out)
861
+ if not detail:
862
+ return names
863
+ else:
864
+ return {name: out[name] for name in names}
865
+
866
    async def _expand_path(self, path, recursive=False, maxdepth=None):
        """Turn one or more paths/glob-patterns into a sorted list of concrete paths.

        Strings containing glob characters are expanded via ``_glob``; with
        ``recursive=True`` directories are additionally expanded via
        ``_find``.  Raises FileNotFoundError if nothing matched at all.
        """
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        if isinstance(path, str):
            # normalise to the list case
            out = await self._expand_path([path], recursive, maxdepth)
        else:
            out = set()
            path = [self._strip_protocol(p) for p in path]
            for p in path:  # can gather here
                if has_magic(p):
                    bit = set(await self._glob(p, maxdepth=maxdepth))
                    out |= bit
                    if recursive:
                        # glob call above expanded one depth so if maxdepth is defined
                        # then decrement it in expand_path call below. If it is zero
                        # after decrementing then avoid expand_path call.
                        if maxdepth is not None and maxdepth <= 1:
                            continue
                        out |= set(
                            await self._expand_path(
                                list(bit),
                                recursive=recursive,
                                maxdepth=maxdepth - 1 if maxdepth is not None else None,
                            )
                        )
                    continue
                elif recursive:
                    rec = set(await self._find(p, maxdepth=maxdepth, withdirs=True))
                    out |= rec
                if p not in out and (recursive is False or (await self._exists(p))):
                    # should only check once, for the root
                    out.add(p)
        if not out:
            raise FileNotFoundError(path)
        return sorted(out)
902
+
903
    async def _mkdir(self, path, create_parents=True, **kwargs):
        """Create a directory entry at path; default implementation is a no-op."""
        pass  # not necessary to implement, may not have directories
905
+
906
    async def _makedirs(self, path, exist_ok=False):
        """Recursively create directories; default implementation is a no-op."""
        pass  # not necessary to implement, may not have directories
908
+
909
+ async def open_async(self, path, mode="rb", **kwargs):
910
+ if "b" not in mode or kwargs.get("compression"):
911
+ raise ValueError
912
+ raise NotImplementedError
913
+
914
+
915
def mirror_sync_methods(obj):
    """Populate sync and async methods for obj

    For each method will create a sync version if the name refers to an async method
    (coroutine) and there is no override in the child class; will create an async
    method for the corresponding sync method if there is no implementation.

    Uses the methods specified in
    - async_methods: the set that an implementation is expected to provide
    - default_async_methods: that can be derived from their sync version in
      AbstractFileSystem
    - AsyncFileSystem: async-specific default coroutines
    """
    from fsspec import AbstractFileSystem

    for method in async_methods + dir(AsyncFileSystem):
        if not method.startswith("_"):
            # only underscore-prefixed (coroutine) names participate
            continue
        smethod = method[1:]
        if private.match(method):
            # is the async name actually a coroutine on this object?
            isco = inspect.iscoroutinefunction(getattr(obj, method, None))
            # underlying function of the sync name, if defined on the class
            unsync = getattr(getattr(obj, smethod, False), "__func__", None)
            # True when the sync name is still the plain AbstractFileSystem default
            is_default = unsync is getattr(AbstractFileSystem, smethod, "")
            if isco and is_default:
                # wrap the coroutine so it can be called synchronously,
                # and carry over the docstring if the wrapper lacks one
                mth = sync_wrapper(getattr(obj, method), obj=obj)
                setattr(obj, smethod, mth)
                if not mth.__doc__:
                    mth.__doc__ = getattr(
                        getattr(AbstractFileSystem, smethod, None), "__doc__", ""
                    )
945
+
946
+
947
class FSSpecCoroutineCancel(Exception):
    """Raised into running tasks by ``_dump_running_tasks`` to force-cancel them."""

    pass
949
+
950
+
951
def _dump_running_tasks(
    printout=True, cancel=True, exc=FSSpecCoroutineCancel, with_task=False
):
    """Debugging helper: inspect (and optionally cancel) tasks on the fsspec loop.

    Returns a list of dicts describing each still-running task: its frame
    locals, source file, first line, current line, formatted stack and
    (optionally) the task object itself.  With ``cancel=True`` each task is
    forcibly terminated by raising ``exc`` into its coroutine.

    NOTE(review): relies on private asyncio internals (``task._coro``,
    ``task._callbacks``, calling ``Future.set_exception``/``Future.cancel``
    unbound on Task objects) and the module-level ``loop`` singleton —
    intended for interactive debugging only.
    """
    import traceback

    tasks = [t for t in asyncio.tasks.all_tasks(loop[0]) if not t.done()]
    if printout:
        [task.print_stack() for task in tasks]
    out = [
        {
            "locals": task._coro.cr_frame.f_locals,
            "file": task._coro.cr_frame.f_code.co_filename,
            "firstline": task._coro.cr_frame.f_code.co_firstlineno,
            "linelo": task._coro.cr_frame.f_lineno,
            "stack": traceback.format_stack(task._coro.cr_frame),
            "task": task if with_task else None,
        }
        for task in tasks
    ]
    if cancel:
        for t in tasks:
            cbs = t._callbacks
            t.cancel()
            asyncio.futures.Future.set_exception(t, exc)
            asyncio.futures.Future.cancel(t)
            [cb[0](t) for cb in cbs]  # cancels any dependent concurrent.futures
            try:
                t._coro.throw(exc)  # exits coro, unless explicitly handled
            except exc:
                pass
    return out
982
+
983
+
984
class AbstractAsyncStreamedFile(AbstractBufferedFile):
    """Async variant of ``AbstractBufferedFile``: coroutine read/write/flush/close.

    Subclasses implement ``_fetch_range`` (reads) and ``_upload_chunk``
    (writes), optionally ``_initiate_upload``.
    """

    # no read buffering, and always auto-commit
    # TODO: readahead might still be useful here, but needs async version

    async def read(self, length=-1):
        """
        Return data from cache, or fetch pieces as necessary

        Parameters
        ----------
        length: int (-1)
            Number of bytes to read; if <0, all remaining bytes.
        """
        length = -1 if length is None else int(length)
        if self.mode != "rb":
            raise ValueError("File not in read mode")
        if length < 0:
            # read everything from the current position to EOF
            length = self.size - self.loc
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if length == 0:
            # don't even bother calling fetch
            return b""
        out = await self._fetch_range(self.loc, self.loc + length)
        # advance by what was actually returned, which may be shorter
        self.loc += len(out)
        return out

    async def write(self, data):
        """
        Write data to buffer.

        Buffer only sent on flush() or if buffer is greater than
        or equal to blocksize.

        Parameters
        ----------
        data: bytes
            Set of bytes to be written.
        """
        if self.mode not in {"wb", "ab"}:
            raise ValueError("File not in write mode")
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if self.forced:
            raise ValueError("This file has been force-flushed, can only close")
        out = self.buffer.write(data)
        self.loc += out
        if self.buffer.tell() >= self.blocksize:
            # buffer reached a full block: push it to the backend
            await self.flush()
        return out

    async def close(self):
        """Close file

        Finalizes writes, discards cache
        """
        if getattr(self, "_unclosable", False):
            return
        if self.closed:
            return
        if self.mode == "rb":
            self.cache = None
        else:
            if not self.forced:
                # push any remaining buffered data as the final chunk
                await self.flush(force=True)

            if self.fs is not None:
                # listings for this file and its parent are now stale
                self.fs.invalidate_cache(self.path)
                self.fs.invalidate_cache(self.fs._parent(self.path))

        self.closed = True

    async def flush(self, force=False):
        """Write buffered data to the backend store.

        Uploads the current buffer if it has reached ``blocksize``, or
        unconditionally when ``force=True`` (i.e., on close).

        Parameters
        ----------
        force: bool
            Write the last (possibly undersized) block and mark the file
            as force-flushed; may only happen once.
        """
        if self.closed:
            raise ValueError("Flush on closed file")
        if force and self.forced:
            raise ValueError("Force flush cannot be called more than once")
        if force:
            self.forced = True

        if self.mode not in {"wb", "ab"}:
            # no-op to flush on read-mode
            return

        if not force and self.buffer.tell() < self.blocksize:
            # Defer write on small block
            return

        if self.offset is None:
            # Initialize a multipart upload
            self.offset = 0
            try:
                await self._initiate_upload()
            except:  # noqa: E722 -- any failure here poisons the file
                self.closed = True
                raise

        # a False return from _upload_chunk means "not written; keep buffer"
        if await self._upload_chunk(final=force) is not False:
            self.offset += self.buffer.seek(0, 2)
            self.buffer = io.BytesIO()

    async def __aenter__(self):
        """Support ``async with``; returns self."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the file on leaving an ``async with`` block."""
        await self.close()

    async def _fetch_range(self, start, end):
        """Get bytes in [start, end) from the backend; subclasses must implement."""
        raise NotImplementedError

    async def _initiate_upload(self):
        """Hook called once before the first chunk is uploaded; default no-op."""
        pass

    async def _upload_chunk(self, final=False):
        """Upload the current buffer; subclasses must implement.

        ``flush`` only advances the offset and clears the buffer when this
        does not return False.
        """
        raise NotImplementedError
.venv/lib/python3.11/site-packages/fsspec/caching.py ADDED
@@ -0,0 +1,966 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import collections
4
+ import functools
5
+ import logging
6
+ import math
7
+ import os
8
+ import threading
9
+ import warnings
10
+ from concurrent.futures import Future, ThreadPoolExecutor
11
+ from itertools import groupby
12
+ from operator import itemgetter
13
+ from typing import (
14
+ TYPE_CHECKING,
15
+ Any,
16
+ Callable,
17
+ ClassVar,
18
+ Generic,
19
+ NamedTuple,
20
+ Optional,
21
+ OrderedDict,
22
+ TypeVar,
23
+ )
24
+
25
+ if TYPE_CHECKING:
26
+ import mmap
27
+
28
+ from typing_extensions import ParamSpec
29
+
30
+ P = ParamSpec("P")
31
+ else:
32
+ P = TypeVar("P")
33
+
34
+ T = TypeVar("T")
35
+
36
+
37
# Module-level logger shared by the cache implementations below.
logger = logging.getLogger("fsspec")

# Signature of the callable each cache uses to pull raw bytes from a backend.
Fetcher = Callable[[int, int], bytes]  # Maps (start, end) to bytes
40
+
41
+
42
class BaseCache:
    """Pass-though cache: doesn't keep anything, calls every time

    Acts as base class for other cachers

    Parameters
    ----------
    blocksize: int
        How far to read ahead in numbers of bytes
    fetcher: func
        Function of the form f(start, end) which gets bytes from remote as
        specified
    size: int
        How big this file is
    """

    name: ClassVar[str] = "none"

    def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
        self.blocksize = blocksize
        self.nblocks = 0
        self.fetcher = fetcher
        self.size = size
        # running statistics; see _reset_stats / _log_stats
        self.hit_count = 0
        self.miss_count = 0
        # the bytes that we actually requested
        self.total_requested_bytes = 0

    def _fetch(self, start: int | None, stop: int | None) -> bytes:
        """Forward the byte-range request straight to the fetcher."""
        start = 0 if start is None else start
        stop = self.size if stop is None else stop
        # out-of-range or empty request: nothing to fetch
        if start >= self.size or start >= stop:
            return b""
        return self.fetcher(start, stop)

    def _reset_stats(self) -> None:
        """Reset hit and miss counts for a more granular report e.g. by file."""
        self.hit_count = 0
        self.miss_count = 0
        self.total_requested_bytes = 0

    def _log_stats(self) -> str:
        """Return a formatted string of the cache statistics."""
        if self.hit_count == 0 and self.miss_count == 0:
            # a cache that does nothing, this is for logs only
            return ""
        return f" , {self.name}: {self.hit_count} hits, {self.miss_count} misses, {self.total_requested_bytes} total requested bytes"

    def __repr__(self) -> str:
        # TODO: use rich for better formatting
        return f"""
        <{self.__class__.__name__}:
            block size  :   {self.blocksize}
            block count :   {self.nblocks}
            file size   :   {self.size}
            cache hits  :   {self.hit_count}
            cache misses:   {self.miss_count}
            total requested bytes: {self.total_requested_bytes}>
        """
103
+
104
+
105
class MMapCache(BaseCache):
    """memory-mapped sparse file cache

    Opens temporary file, which is filled blocks-wise when data is requested.
    Ensure there is enough disc space in the temporary location.

    This cache method might only work on posix
    """

    name = "mmap"

    def __init__(
        self,
        blocksize: int,
        fetcher: Fetcher,
        size: int,
        location: str | None = None,
        blocks: set[int] | None = None,
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        # indices of blocks already present in the backing file
        self.blocks = set() if blocks is None else blocks
        # optional persistent path; None means an anonymous temp file
        self.location = location
        self.cache = self._makefile()

    def _makefile(self) -> mmap.mmap | bytearray:
        """Create or reopen the backing file and return a memory-map of it."""
        import mmap
        import tempfile

        if self.size == 0:
            # mmap cannot map zero bytes; an empty buffer behaves the same
            return bytearray()

        # posix version
        if self.location is None or not os.path.exists(self.location):
            if self.location is None:
                fd = tempfile.TemporaryFile()
                self.blocks = set()
            else:
                fd = open(self.location, "wb+")
            # write one byte at the last offset to size the sparse file
            fd.seek(self.size - 1)
            fd.write(b"1")
            fd.flush()
        else:
            # pre-existing cache file: reopen; ``self.blocks`` says what is valid
            fd = open(self.location, "r+b")

        return mmap.mmap(fd.fileno(), self.size)

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Serve [start, end), fetching any missing blocks into the mmap first."""
        logger.debug(f"MMap cache fetching {start}-{end}")
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""
        start_block = start // self.blocksize
        end_block = end // self.blocksize
        block_range = range(start_block, end_block + 1)
        # Determine which blocks need to be fetched. This sequence is sorted by construction.
        need = (i for i in block_range if i not in self.blocks)
        # Count the number of blocks already cached
        self.hit_count += sum(1 for i in block_range if i in self.blocks)

        # Consolidate needed blocks.
        # Algorithm adapted from Python 2.x itertools documentation.
        # We are grouping an enumerated sequence of blocks. By comparing when the difference
        # between an ascending range (provided by enumerate) and the needed block numbers
        # we can detect when the block number skips values. The key computes this difference.
        # Whenever the difference changes, we know that we have previously cached block(s),
        # and a new group is started. In other words, this algorithm neatly groups
        # runs of consecutive block numbers so they can be fetched together.
        for _, _blocks in groupby(enumerate(need), key=lambda x: x[0] - x[1]):
            # Extract the blocks from the enumerated sequence
            _blocks = tuple(map(itemgetter(1), _blocks))
            # Compute start of first block
            sstart = _blocks[0] * self.blocksize
            # Compute the end of the last block. Last block may not be full size.
            send = min(_blocks[-1] * self.blocksize + self.blocksize, self.size)

            # Fetch bytes (could be multiple consecutive blocks)
            self.total_requested_bytes += send - sstart
            logger.debug(
                f"MMap get blocks {_blocks[0]}-{_blocks[-1]} ({sstart}-{send})"
            )
            self.cache[sstart:send] = self.fetcher(sstart, send)

            # Update set of cached blocks
            self.blocks.update(_blocks)
            # Update cache statistics with number of blocks we had to cache
            self.miss_count += len(_blocks)

        return self.cache[start:end]

    def __getstate__(self) -> dict[str, Any]:
        state = self.__dict__.copy()
        # Remove the unpicklable entries.
        del state["cache"]
        return state

    def __setstate__(self, state: dict[str, Any]) -> None:
        # Restore instance attributes; the mmap is re-created from location/blocks
        self.__dict__.update(state)
        self.cache = self._makefile()
207
+
208
+
209
class ReadAheadCache(BaseCache):
    """Cache which reads only when we get beyond a block of data

    This is a much simpler version of BytesCache, and does not attempt to
    fill holes in the cache or keep fragments alive. It is best suited to
    many small reads in a sequential order (e.g., reading lines from a file).
    """

    name = "readahead"

    def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
        super().__init__(blocksize, fetcher, size)
        # single contiguous buffered window: cache holds bytes [start, end)
        self.cache = b""
        self.start = 0
        self.end = 0

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        start = 0 if start is None else start
        if end is None or end > self.size:
            end = self.size
        if start >= self.size or start >= end:
            return b""

        remaining = end - start
        if start >= self.start and end <= self.end:
            # full cache hit: serve straight from the buffered window
            self.hit_count += 1
            return self.cache[start - self.start : end - self.start]

        if self.start <= start < self.end:
            # partial hit: reuse the tail of the buffer, fetch the rest
            self.miss_count += 1
            head = self.cache[start - self.start :]
            remaining -= len(head)
            start = self.end
        else:
            # complete miss: discard the old window
            self.miss_count += 1
            head = b""

        # read ahead by one blocksize beyond what was asked for
        end = min(self.size, end + self.blocksize)
        self.total_requested_bytes += end - start
        self.cache = self.fetcher(start, end)  # new block replaces old
        self.start = start
        self.end = self.start + len(self.cache)
        return head + self.cache[:remaining]
253
+
254
+
255
class FirstChunkCache(BaseCache):
    """Caches the first block of a file only

    This may be useful for file types where the metadata is stored in the header,
    but is randomly accessed.
    """

    name = "first"

    def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
        if blocksize > size:
            # this will buffer the whole thing
            blocksize = size
        super().__init__(blocksize, fetcher, size)
        # lazily-filled copy of bytes [0, blocksize)
        self.cache: bytes | None = None

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Serve [start, end); requests touching the first chunk use the cache.

        ``start``/``end`` of None mean start/end of file, matching the other
        cache implementations.
        """
        start = start or 0
        if start > self.size:
            logger.debug("FirstChunkCache: requested start > file size")
            return b""

        # BUG FIX: an open-ended read passes end=None; ``min(None, size)``
        # raises TypeError, so resolve None to the file size before clamping.
        end = self.size if end is None else min(end, self.size)

        if start < self.blocksize:
            # request overlaps the cached first chunk
            if self.cache is None:
                self.miss_count += 1
                if end > self.blocksize:
                    # one fetch covers both the chunk and the tail
                    self.total_requested_bytes += end
                    data = self.fetcher(0, end)
                    self.cache = data[: self.blocksize]
                    return data[start:]
                self.cache = self.fetcher(0, self.blocksize)
                self.total_requested_bytes += self.blocksize
            part = self.cache[start:end]
            if end > self.blocksize:
                # anything beyond the first chunk is always re-fetched
                self.total_requested_bytes += end - self.blocksize
                part += self.fetcher(self.blocksize, end)
            self.hit_count += 1
            return part
        else:
            # entirely outside the cached chunk: plain passthrough
            self.miss_count += 1
            self.total_requested_bytes += end - start
            return self.fetcher(start, end)
299
+
300
+
301
class BlockCache(BaseCache):
    """
    Cache holding memory as a set of blocks.

    Requests are only ever made ``blocksize`` at a time, and are
    stored in an LRU cache. The least recently accessed block is
    discarded when more than ``maxblocks`` are stored.

    Parameters
    ----------
    blocksize : int
        The number of bytes to store in each block.
        Requests are only ever made for ``blocksize``, so this
        should balance the overhead of making a request against
        the granularity of the blocks.
    fetcher : Callable
    size : int
        The total size of the file being cached.
    maxblocks : int
        The maximum number of blocks to cache for. The maximum memory
        use for this cache is then ``blocksize * maxblocks``.
    """

    name = "blockcache"

    def __init__(
        self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        self.nblocks = math.ceil(size / blocksize)
        self.maxblocks = maxblocks
        self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block)

    def cache_info(self):
        """
        The statistics on the block cache.

        Returns
        -------
        NamedTuple
            Returned directly from the LRU Cache used internally.
        """
        return self._fetch_block_cached.cache_info()

    def __getstate__(self) -> dict[str, Any]:
        # BUG FIX: operate on a copy. Deleting from ``self.__dict__`` directly
        # (as before) destroyed the live instance's LRU cache as a side effect
        # of pickling. MMapCache.__getstate__ already does it this way.
        state = self.__dict__.copy()
        del state["_fetch_block_cached"]
        return state

    def __setstate__(self, state: dict[str, Any]) -> None:
        self.__dict__.update(state)
        # the lru_cache wrapper is not picklable; rebuild it on restore
        self._fetch_block_cached = functools.lru_cache(state["maxblocks"])(
            self._fetch_block
        )

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Serve [start, end) by fetching each covering block (LRU-cached)."""
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""

        # byte position -> block numbers
        start_block_number = start // self.blocksize
        end_block_number = end // self.blocksize

        # these are cached, so safe to do multiple calls for the same start and end.
        for block_number in range(start_block_number, end_block_number + 1):
            self._fetch_block_cached(block_number)

        return self._read_cache(
            start,
            end,
            start_block_number=start_block_number,
            end_block_number=end_block_number,
        )

    def _fetch_block(self, block_number: int) -> bytes:
        """
        Fetch the block of data for `block_number`.
        """
        # NOTE(review): ``>`` (not ``>=``) is deliberate — a read ending
        # exactly at a block boundary maps to block ``nblocks``, for which
        # the base _fetch returns b"".
        if block_number > self.nblocks:
            raise ValueError(
                f"'block_number={block_number}' is greater than "
                f"the number of blocks ({self.nblocks})"
            )

        start = block_number * self.blocksize
        end = start + self.blocksize
        self.total_requested_bytes += end - start
        self.miss_count += 1
        logger.info("BlockCache fetching block %d", block_number)
        block_contents = super()._fetch(start, end)
        return block_contents

    def _read_cache(
        self, start: int, end: int, start_block_number: int, end_block_number: int
    ) -> bytes:
        """
        Read from our block cache.

        Parameters
        ----------
        start, end : int
            The start and end byte positions.
        start_block_number, end_block_number : int
            The start and end block numbers.
        """
        start_pos = start % self.blocksize
        end_pos = end % self.blocksize

        self.hit_count += 1
        if start_block_number == end_block_number:
            block: bytes = self._fetch_block_cached(start_block_number)
            return block[start_pos:end_pos]

        else:
            # read from the initial
            out = [self._fetch_block_cached(start_block_number)[start_pos:]]

            # intermediate blocks
            # Note: it'd be nice to combine these into one big request. However
            # that doesn't play nicely with our LRU cache.
            out.extend(
                map(
                    self._fetch_block_cached,
                    range(start_block_number + 1, end_block_number),
                )
            )

            # final block
            out.append(self._fetch_block_cached(end_block_number)[:end_pos])

            return b"".join(out)
436
+
437
+
438
+ class BytesCache(BaseCache):
439
+ """Cache which holds data in a in-memory bytes object
440
+
441
+ Implements read-ahead by the block size, for semi-random reads progressing
442
+ through the file.
443
+
444
+ Parameters
445
+ ----------
446
+ trim: bool
447
+ As we read more data, whether to discard the start of the buffer when
448
+ we are more than a blocksize ahead of it.
449
+ """
450
+
451
+ name: ClassVar[str] = "bytes"
452
+
453
+ def __init__(
454
+ self, blocksize: int, fetcher: Fetcher, size: int, trim: bool = True
455
+ ) -> None:
456
+ super().__init__(blocksize, fetcher, size)
457
+ self.cache = b""
458
+ self.start: int | None = None
459
+ self.end: int | None = None
460
+ self.trim = trim
461
+
462
    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Return bytes ``[start, end)``, extending the internal buffer as needed.

        Reads ahead by up to ``blocksize`` past ``end`` and stitches newly
        fetched data onto the existing buffer when the request adjoins it;
        otherwise the buffer is replaced wholesale.
        """
        # TODO: only set start/end after fetch, in case it fails?
        # is this where retry logic might go?
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""
        if (
            self.start is not None
            and start >= self.start
            and self.end is not None
            and end < self.end
        ):
            # cache hit: we have all the required data
            offset = start - self.start
            self.hit_count += 1
            return self.cache[offset : offset + end - start]

        # bend: the actual end of the range we will request (with read-ahead)
        if self.blocksize:
            bend = min(self.size, end + self.blocksize)
        else:
            bend = end

        if bend == start or start > self.size:
            return b""

        if (self.start is None or start < self.start) and (
            self.end is None or end > self.end
        ):
            # First read, or extending both before and after
            self.total_requested_bytes += bend - start
            self.miss_count += 1
            self.cache = self.fetcher(start, bend)
            self.start = start
        else:
            assert self.start is not None
            assert self.end is not None
            self.miss_count += 1

            if start < self.start:
                # Request begins before the buffer.
                if self.end is None or self.end - end > self.blocksize:
                    # Gap too large to be worth keeping: refetch whole range.
                    self.total_requested_bytes += bend - start
                    self.cache = self.fetcher(start, bend)
                    self.start = start
                else:
                    # Prepend only the missing head.
                    self.total_requested_bytes += self.start - start
                    new = self.fetcher(start, self.start)
                    self.start = start
                    self.cache = new + self.cache
            elif self.end is not None and bend > self.end:
                # Request extends past the buffer.
                if self.end > self.size:
                    # Buffer already reaches EOF; nothing more to fetch.
                    pass
                elif end - self.end > self.blocksize:
                    # Gap too large: refetch whole range.
                    self.total_requested_bytes += bend - start
                    self.cache = self.fetcher(start, bend)
                    self.start = start
                else:
                    # Append only the missing tail.
                    self.total_requested_bytes += bend - self.end
                    new = self.fetcher(self.end, bend)
                    self.cache = self.cache + new

        self.end = self.start + len(self.cache)
        offset = start - self.start
        out = self.cache[offset : offset + end - start]
        if self.trim:
            # Discard whole blocks from the head once more than one block behind.
            num = (self.end - self.start) // (self.blocksize + 1)
            if num > 1:
                self.start += self.blocksize * num
                self.cache = self.cache[self.blocksize * num :]
        return out
534
+
535
    def __len__(self) -> int:
        # Number of bytes currently held in the in-memory buffer.
        return len(self.cache)
537
+
538
+
539
class AllBytes(BaseCache):
    """Cache that holds the complete file contents in memory.

    If ``data`` is not supplied, the whole file is downloaded once at
    construction time; every subsequent read is served from memory and
    counted as a hit.
    """

    name: ClassVar[str] = "all"

    def __init__(
        self,
        blocksize: int | None = None,
        fetcher: Fetcher | None = None,
        size: int | None = None,
        data: bytes | None = None,
    ) -> None:
        super().__init__(blocksize, fetcher, size)  # type: ignore[arg-type]
        if data is None:
            # One-shot download of the entire file: the only miss ever recorded.
            self.miss_count += 1
            self.total_requested_bytes += self.size
            data = self.fetcher(0, self.size)
        self.data = data

    def _fetch(self, start: int | None, stop: int | None) -> bytes:
        # Everything is already resident, so every read is a hit; ``None``
        # bounds behave like an open-ended slice.
        self.hit_count += 1
        return self.data[start:stop]
561
+
562
+
563
class KnownPartsOfAFile(BaseCache):
    """
    Cache holding known file parts.

    Parameters
    ----------
    blocksize: int
        How far to read ahead in numbers of bytes
    fetcher: func
        Function of the form f(start, end) which gets bytes from remote as
        specified
    size: int
        How big this file is
    data: dict
        A dictionary mapping explicit `(start, stop)` file-offset tuples
        with known bytes.
    strict: bool, default True
        Whether to fetch reads that go beyond a known byte-range boundary.
        If `False`, any read that ends outside a known part will be zero
        padded. Note that zero padding will not be used for reads that
        begin outside a known byte-range.
    """

    name: ClassVar[str] = "parts"

    def __init__(
        self,
        blocksize: int,
        fetcher: Fetcher,
        size: int,
        data: Optional[dict[tuple[int, int], bytes]] = None,
        strict: bool = True,
        **_: Any,
    ):
        super().__init__(blocksize, fetcher, size)
        self.strict = strict

        # simple consolidation of contiguous blocks
        # NOTE: the caller's ``data`` dict is emptied here (entries are
        # ``pop``-ed while merging adjacent ranges into single buffers).
        if data:
            old_offsets = sorted(data.keys())
            offsets = [old_offsets[0]]
            blocks = [data.pop(old_offsets[0])]
            for start, stop in old_offsets[1:]:
                start0, stop0 = offsets[-1]
                if start == stop0:
                    # This range starts exactly where the previous one ended:
                    # merge into a single (start0, stop) entry.
                    offsets[-1] = (start0, stop)
                    blocks[-1] += data.pop((start, stop))
                else:
                    offsets.append((start, stop))
                    blocks.append(data.pop((start, stop)))

            self.data = dict(zip(offsets, blocks))
        else:
            self.data = {}

    def _fetch(self, start: int | None, stop: int | None) -> bytes:
        """Serve ``[start, stop)`` from known parts, fetching/padding the rest."""
        if start is None:
            start = 0
        if stop is None:
            stop = self.size

        out = b""
        for (loc0, loc1), data in self.data.items():
            # If self.strict=False, use zero-padded data
            # for reads beyond the end of a "known" buffer
            if loc0 <= start < loc1:
                off = start - loc0
                out = data[off : off + stop - start]
                if not self.strict or loc0 <= stop <= loc1:
                    # The request is within a known range, or
                    # it begins within a known range, and we
                    # are allowed to pad reads beyond the
                    # buffer with zero
                    out += b"\x00" * (stop - start - len(out))
                    self.hit_count += 1
                    return out
                else:
                    # The request ends outside a known range,
                    # and we are being "strict" about reads
                    # beyond the buffer
                    start = loc1
                    break

        # We only get here if there is a request outside the
        # known parts of the file. In an ideal world, this
        # should never happen
        if self.fetcher is None:
            # We cannot fetch the data, so raise an error
            raise ValueError(f"Read is outside the known file parts: {(start, stop)}. ")
        # We can fetch the data, but should warn the user
        # that this may be slow
        warnings.warn(
            f"Read is outside the known file parts: {(start, stop)}. "
            f"IO/caching performance may be poor!"
        )
        logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}")
        self.total_requested_bytes += stop - start
        self.miss_count += 1
        return out + super()._fetch(start, stop)
662
+
663
+
664
class UpdatableLRU(Generic[P, T]):
    """
    Thread-safe LRU cache around a callable, with explicit key insertion.

    Unlike ``functools.lru_cache``, entries can also be injected directly
    via ``add_key``; used by BackgroundBlockCache to store the results of
    background fetches.
    """

    class CacheInfo(NamedTuple):
        hits: int
        misses: int
        maxsize: int
        currsize: int

    def __init__(self, func: Callable[P, T], max_size: int = 128) -> None:
        self._cache: OrderedDict[Any, T] = collections.OrderedDict()
        self._func = func
        self._max_size = max_size
        self._hits = 0
        self._misses = 0
        self._lock = threading.Lock()

    def __call__(self, *args: P.args, **kwargs: P.kwargs) -> T:
        if kwargs:
            raise TypeError(f"Got unexpected keyword argument {kwargs.keys()}")
        with self._lock:
            try:
                hit = self._cache[args]
            except KeyError:
                pass
            else:
                # Refresh recency and record the hit.
                self._cache.move_to_end(args)
                self._hits += 1
                return hit

        # Compute outside the lock so concurrent callers are not serialised
        # on a potentially slow fetch.
        result = self._func(*args, **kwargs)

        with self._lock:
            self._cache[args] = result
            self._misses += 1
            if len(self._cache) > self._max_size:
                # Evict the least recently used entry.
                self._cache.popitem(last=False)

        return result

    def is_key_cached(self, *args: Any) -> bool:
        with self._lock:
            return args in self._cache

    def add_key(self, result: T, *args: Any) -> None:
        # Directly store a precomputed result under ``args``.
        with self._lock:
            self._cache[args] = result
            if len(self._cache) > self._max_size:
                self._cache.popitem(last=False)

    def cache_info(self) -> UpdatableLRU.CacheInfo:
        with self._lock:
            return self.CacheInfo(
                maxsize=self._max_size,
                currsize=len(self._cache),
                hits=self._hits,
                misses=self._misses,
            )
722
+
723
+
724
class BackgroundBlockCache(BaseCache):
    """
    Cache holding memory as a set of blocks with pre-loading of
    the next block in the background.

    Requests are only ever made ``blocksize`` at a time, and are
    stored in an LRU cache. The least recently accessed block is
    discarded when more than ``maxblocks`` are stored. If the
    next block is not in cache, it is loaded in a separate thread
    in non-blocking way.

    Parameters
    ----------
    blocksize : int
        The number of bytes to store in each block.
        Requests are only ever made for ``blocksize``, so this
        should balance the overhead of making a request against
        the granularity of the blocks.
    fetcher : Callable
    size : int
        The total size of the file being cached.
    maxblocks : int
        The maximum number of blocks to cache for. The maximum memory
        use for this cache is then ``blocksize * maxblocks``.
    """

    name: ClassVar[str] = "background"

    def __init__(
        self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        self.nblocks = math.ceil(size / blocksize)
        self.maxblocks = maxblocks
        self._fetch_block_cached = UpdatableLRU(self._fetch_block, maxblocks)

        # A single worker so at most one background prefetch runs at a time.
        self._thread_executor = ThreadPoolExecutor(max_workers=1)
        self._fetch_future_block_number: int | None = None
        self._fetch_future: Future[bytes] | None = None
        self._fetch_future_lock = threading.Lock()

    def cache_info(self) -> UpdatableLRU.CacheInfo:
        """
        The statistics on the block cache.

        Returns
        -------
        NamedTuple
            Returned directly from the LRU Cache used internally.
        """
        return self._fetch_block_cached.cache_info()

    def __getstate__(self) -> dict[str, Any]:
        # BUGFIX: copy the instance dict before deleting the unpicklable
        # entries. Previously ``state = self.__dict__`` aliased the live
        # dict, so pickling stripped the executor/cache/lock attributes off
        # the instance itself as a side effect.
        state = self.__dict__.copy()
        del state["_fetch_block_cached"]
        del state["_thread_executor"]
        del state["_fetch_future_block_number"]
        del state["_fetch_future"]
        del state["_fetch_future_lock"]
        return state

    def __setstate__(self, state) -> None:
        # Rebuild the transient members removed in __getstate__.
        self.__dict__.update(state)
        self._fetch_block_cached = UpdatableLRU(self._fetch_block, state["maxblocks"])
        self._thread_executor = ThreadPoolExecutor(max_workers=1)
        self._fetch_future_block_number = None
        self._fetch_future = None
        self._fetch_future_lock = threading.Lock()

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Return bytes ``[start, end)``, joining/launching background fetches."""
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""

        # byte position -> block numbers
        start_block_number = start // self.blocksize
        end_block_number = end // self.blocksize

        fetch_future_block_number = None
        fetch_future = None
        with self._fetch_future_lock:
            # Background thread is running. Check whether we can or must join it.
            if self._fetch_future is not None:
                assert self._fetch_future_block_number is not None
                if self._fetch_future.done():
                    logger.info("BlockCache joined background fetch without waiting.")
                    self._fetch_block_cached.add_key(
                        self._fetch_future.result(), self._fetch_future_block_number
                    )
                    # Cleanup the fetch variables. Done with fetching the block.
                    self._fetch_future_block_number = None
                    self._fetch_future = None
                else:
                    # Must join if we need the block for the current fetch
                    must_join = bool(
                        start_block_number
                        <= self._fetch_future_block_number
                        <= end_block_number
                    )
                    if must_join:
                        # Copy to the local variables to release lock
                        # before waiting for result
                        fetch_future_block_number = self._fetch_future_block_number
                        fetch_future = self._fetch_future

                        # Cleanup the fetch variables. Have a local copy.
                        self._fetch_future_block_number = None
                        self._fetch_future = None

        # Need to wait for the future for the current read
        if fetch_future is not None:
            logger.info("BlockCache waiting for background fetch.")
            # Wait until result and put it in cache
            self._fetch_block_cached.add_key(
                fetch_future.result(), fetch_future_block_number
            )

        # these are cached, so safe to do multiple calls for the same start and end.
        for block_number in range(start_block_number, end_block_number + 1):
            self._fetch_block_cached(block_number)

        # fetch next block in the background if nothing is running in the background,
        # the block is within file and it is not already cached
        end_block_plus_1 = end_block_number + 1
        with self._fetch_future_lock:
            if (
                self._fetch_future is None
                and end_block_plus_1 <= self.nblocks
                and not self._fetch_block_cached.is_key_cached(end_block_plus_1)
            ):
                self._fetch_future_block_number = end_block_plus_1
                self._fetch_future = self._thread_executor.submit(
                    self._fetch_block, end_block_plus_1, "async"
                )

        return self._read_cache(
            start,
            end,
            start_block_number=start_block_number,
            end_block_number=end_block_number,
        )

    def _fetch_block(self, block_number: int, log_info: str = "sync") -> bytes:
        """
        Fetch the block of data for `block_number`.
        """
        if block_number > self.nblocks:
            raise ValueError(
                f"'block_number={block_number}' is greater than "
                f"the number of blocks ({self.nblocks})"
            )

        start = block_number * self.blocksize
        end = start + self.blocksize
        logger.info("BlockCache fetching block (%s) %d", log_info, block_number)
        self.total_requested_bytes += end - start
        self.miss_count += 1
        block_contents = super()._fetch(start, end)
        return block_contents

    def _read_cache(
        self, start: int, end: int, start_block_number: int, end_block_number: int
    ) -> bytes:
        """
        Read from our block cache.

        Parameters
        ----------
        start, end : int
            The start and end byte positions.
        start_block_number, end_block_number : int
            The start and end block numbers.
        """
        start_pos = start % self.blocksize
        end_pos = end % self.blocksize

        # kind of pointless to count this as a hit, but it is
        self.hit_count += 1

        if start_block_number == end_block_number:
            block = self._fetch_block_cached(start_block_number)
            return block[start_pos:end_pos]

        else:
            # read from the initial
            out = [self._fetch_block_cached(start_block_number)[start_pos:]]

            # intermediate blocks
            # Note: it'd be nice to combine these into one big request. However
            # that doesn't play nicely with our LRU cache.
            out.extend(
                map(
                    self._fetch_block_cached,
                    range(start_block_number + 1, end_block_number),
                )
            )

            # final block
            out.append(self._fetch_block_cached(end_block_number)[:end_pos])

            return b"".join(out)
928
+
929
+
930
# Registry of cache implementations keyed by their ``name`` class attribute;
# populated below via ``register_cache``.
caches: dict[str | None, type[BaseCache]] = {
    # one custom case
    None: BaseCache,
}
934
+
935
+
936
def register_cache(cls: type[BaseCache], clobber: bool = False) -> None:
    """Add a cache implementation to the global ``caches`` registry.

    The class is stored under its ``name`` class attribute.

    Parameters
    ----------
    cls: type
        The BaseCache subclass to register.
    clobber: bool, optional
        If set to True (default is False) - allow to overwrite existing
        entry.

    Raises
    ------
    ValueError
        If an entry with the same name already exists and ``clobber`` is
        False.
    """
    key = cls.name
    if key in caches and not clobber:
        raise ValueError(f"Cache with name {key!r} is already known: {caches[key]}")
    caches[key] = cls
953
+
954
+
955
# Register every built-in cache implementation under its ``name`` key.
for c in (
    BaseCache,
    MMapCache,
    BytesCache,
    ReadAheadCache,
    BlockCache,
    FirstChunkCache,
    AllBytes,
    KnownPartsOfAFile,
    BackgroundBlockCache,
):
    register_cache(c)
.venv/lib/python3.11/site-packages/fsspec/callbacks.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import wraps
2
+
3
+
4
class Callback:
    """
    Base class and interface for callback mechanism

    This class can be used directly for monitoring file transfers by
    providing ``callback=Callback(hooks=...)`` (see the ``hooks`` argument,
    below), or subclassed for more specialised behaviour.

    Parameters
    ----------
    size: int (optional)
        Nominal quantity for the value that corresponds to a complete
        transfer, e.g., total number of tiles or total number of
        bytes
    value: int (0)
        Starting internal counter value
    hooks: dict or None
        A dict of named functions to be called on each update. The signature
        of these must be ``f(size, value, **kwargs)``
    """

    def __init__(self, size=None, value=0, hooks=None, **kwargs):
        self.size = size
        self.value = value
        self.hooks = hooks or {}
        # Extra keyword arguments forwarded to every hook invocation.
        self.kw = kwargs

    def __enter__(self):
        return self

    def __exit__(self, *exc_args):
        self.close()

    def close(self):
        """Close callback."""

    def branched(self, path_1, path_2, **kwargs):
        """
        Return callback for child transfers

        If this callback is operating at a higher level, e.g., put, which may
        trigger transfers that can also be monitored. The function returns a callback
        that has to be passed to the child method, e.g., put_file,
        as `callback=` argument.

        The implementation uses `callback.branch` for compatibility.
        When implementing callbacks, it is recommended to override this function instead
        of `branch` and avoid calling `super().branched(...)`.

        Prefer using this function over `branch`.

        Parameters
        ----------
        path_1: str
            Child's source path
        path_2: str
            Child's destination path
        **kwargs:
            Arbitrary keyword arguments

        Returns
        -------
        callback: Callback
            A callback instance to be passed to the child method
        """
        self.branch(path_1, path_2, kwargs)
        # mutate kwargs so that we can force the caller to pass "callback=" explicitly
        return kwargs.pop("callback", DEFAULT_CALLBACK)

    def branch_coro(self, fn):
        """
        Wraps a coroutine, and pass a new child callback to it.
        """

        @wraps(fn)
        async def func(path1, path2: str, **kwargs):
            # The child callback is used as a context manager so it is
            # closed when the wrapped coroutine finishes.
            with self.branched(path1, path2, **kwargs) as child:
                return await fn(path1, path2, callback=child, **kwargs)

        return func

    def set_size(self, size):
        """
        Set the internal maximum size attribute

        Usually called if not initially set at instantiation. Note that this
        triggers a ``call()``.

        Parameters
        ----------
        size: int
        """
        self.size = size
        self.call()

    def absolute_update(self, value):
        """
        Set the internal value state

        Triggers ``call()``

        Parameters
        ----------
        value: int
        """
        self.value = value
        self.call()

    def relative_update(self, inc=1):
        """
        Delta increment the internal counter

        Triggers ``call()``

        Parameters
        ----------
        inc: int
        """
        self.value += inc
        self.call()

    def call(self, hook_name=None, **kwargs):
        """
        Execute hook(s) with current state

        Each function is passed the internal size and current value

        Parameters
        ----------
        hook_name: str or None
            If given, execute on this hook
        kwargs: passed on to (all) hook(s)
        """
        if not self.hooks:
            return
        kw = self.kw.copy()
        kw.update(kwargs)
        if hook_name:
            if hook_name not in self.hooks:
                return
            return self.hooks[hook_name](self.size, self.value, **kw)
        for hook in self.hooks.values() or []:
            hook(self.size, self.value, **kw)

    def wrap(self, iterable):
        """
        Wrap an iterable to call ``relative_update`` on each iterations

        Parameters
        ----------
        iterable: Iterable
            The iterable that is being wrapped
        """
        for item in iterable:
            self.relative_update()
            yield item

    def branch(self, path_1, path_2, kwargs):
        """
        Set callbacks for child transfers

        If this callback is operating at a higher level, e.g., put, which may
        trigger transfers that can also be monitored. The passed kwargs are
        to be *mutated* to add ``callback=``, if this class supports branching
        to children.

        Parameters
        ----------
        path_1: str
            Child's source path
        path_2: str
            Child's destination path
        kwargs: dict
            arguments passed to child method, e.g., put_file.

        Returns
        -------

        """
        return None

    def no_op(self, *_, **__):
        pass

    def __getattr__(self, item):
        """
        If undefined methods are called on this class, nothing happens
        """
        # Any attribute not found normally resolves to the do-nothing
        # ``no_op`` method, so subclasses may implement only what they need.
        return self.no_op

    @classmethod
    def as_callback(cls, maybe_callback=None):
        """Transform callback=... into Callback instance

        For the special value of ``None``, return the global instance of
        ``NoOpCallback``. This is an alternative to including
        ``callback=DEFAULT_CALLBACK`` directly in a method signature.
        """
        if maybe_callback is None:
            return DEFAULT_CALLBACK
        return maybe_callback
205
+
206
+
207
class NoOpCallback(Callback):
    """
    Callback implementation that ignores every update.

    Used (via ``DEFAULT_CALLBACK``) whenever no callback is supplied.
    """

    def call(self, *args, **kwargs):
        # Deliberately discard all progress information.
        return None
214
+
215
+
216
class DotPrinterCallback(Callback):
    """
    Minimal demonstration Callback that prints one character per event.

    The outer (parent) instance prints "#" while branched child transfers
    print "." — showing how ``branch`` can install a distinct callback for
    nested operations.
    """

    def __init__(self, chr_to_print="#", **kwargs):
        super().__init__(**kwargs)
        self.chr = chr_to_print

    def branch(self, path_1, path_2, kwargs):
        """Mutate kwargs so child transfers report with a different char."""
        kwargs["callback"] = DotPrinterCallback(".")

    def call(self, **kwargs):
        """Emit a single character, without a trailing newline."""
        print(self.chr, end="")
235
+
236
+
237
class TqdmCallback(Callback):
    """
    A callback to display a progress bar using tqdm

    Parameters
    ----------
    tqdm_kwargs : dict, (optional)
        Any argument accepted by the tqdm constructor.
        See the `tqdm doc <https://tqdm.github.io/docs/tqdm/#__init__>`_.
        Will be forwarded to `tqdm_cls`.
    tqdm_cls: (optional)
        subclass of `tqdm.tqdm`. If not passed, it will default to `tqdm.tqdm`.

    Examples
    --------
    >>> import fsspec
    >>> from fsspec.callbacks import TqdmCallback
    >>> fs = fsspec.filesystem("memory")
    >>> path2distant_data = "/your-path"
    >>> fs.upload(
            ".",
            path2distant_data,
            recursive=True,
            callback=TqdmCallback(),
        )

    You can forward args to tqdm using the ``tqdm_kwargs`` parameter.

    >>> fs.upload(
            ".",
            path2distant_data,
            recursive=True,
            callback=TqdmCallback(tqdm_kwargs={"desc": "Your tqdm description"}),
        )

    You can also customize the progress bar by passing a subclass of `tqdm`.

    .. code-block:: python

        class TqdmFormat(tqdm):
            '''Provides a `total_time` format parameter'''
            @property
            def format_dict(self):
                d = super().format_dict
                total_time = d["elapsed"] * (d["total"] or 0) / max(d["n"], 1)
                d.update(total_time=self.format_interval(total_time) + " in total")
                return d

    >>> with TqdmCallback(
            tqdm_kwargs={
                "desc": "desc",
                "bar_format": "{total_time}: {percentage:.0f}%|{bar}{r_bar}",
            },
            tqdm_cls=TqdmFormat,
        ) as callback:
            fs.upload(".", path2distant_data, recursive=True, callback=callback)
    """

    def __init__(self, tqdm_kwargs=None, *args, **kwargs):
        try:
            from tqdm import tqdm

        except ImportError as exce:
            raise ImportError(
                "Using TqdmCallback requires tqdm to be installed"
            ) from exce

        self._tqdm_cls = kwargs.pop("tqdm_cls", tqdm)
        self._tqdm_kwargs = tqdm_kwargs or {}
        # The progress bar is created lazily on the first ``call``.
        self.tqdm = None
        super().__init__(*args, **kwargs)

    def call(self, *args, **kwargs):
        if self.tqdm is None:
            self.tqdm = self._tqdm_cls(total=self.size, **self._tqdm_kwargs)
        # Keep the bar's total in sync (size may be set after creation)
        # and advance it by however much ``value`` moved since last call.
        self.tqdm.total = self.size
        self.tqdm.update(self.value - self.tqdm.n)

    def close(self):
        # BUGFIX: use getattr so that __del__ on a partially-initialised
        # instance (e.g. when the tqdm import or super().__init__ raised
        # before ``self.tqdm`` was bound) does not raise AttributeError.
        if getattr(self, "tqdm", None) is not None:
            self.tqdm.close()
            self.tqdm = None

    def __del__(self):
        return self.close()
322
+
323
+
324
# Shared do-nothing singleton used when callback=None is passed; the second
# name is an alias bound to the same object (presumably kept for older code
# that imports ``_DEFAULT_CALLBACK`` — verify before removing).
DEFAULT_CALLBACK = _DEFAULT_CALLBACK = NoOpCallback()
.venv/lib/python3.11/site-packages/fsspec/compression.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Helper functions for a standard streaming compression API"""
2
+
3
+ from zipfile import ZipFile
4
+
5
+ import fsspec.utils
6
+ from fsspec.spec import AbstractBufferedFile
7
+
8
+
9
def noop_file(file, mode, **kwargs):
    """Identity "compression": hand back the input file object unchanged."""
    return file
11
+
12
+
13
# TODO: files should also be available as contexts
# should be functions of the form func(infile, mode=, **kwargs) -> file-like
# Registry: compression name -> wrapper callable; ``None`` means no
# compression (identity via ``noop_file``).
compr = {None: noop_file}
16
+
17
+
18
def register_compression(name, callback, extensions, force=False):
    """Register an "inferable" file compression type.

    Registers transparent file compression type for use with fsspec.open.
    Compression can be specified by name in open, or "infer"-ed for any files
    ending with the given extensions.

    Args:
        name: (str) The compression type name. Eg. "gzip".
        callback: A callable of form (infile, mode, **kwargs) -> file-like.
            Accepts an input file-like object, the target mode and kwargs.
            Returns a wrapped file-like object.
        extensions: (str, Iterable[str]) A file extension, or list of file
            extensions for which to infer this compression scheme. Eg. "gz".
        force: (bool) Force re-registration of compression type or extensions.

    Raises:
        ValueError: If name or extensions already registered, and not force.

    """
    # Accept a single extension as shorthand for a one-element list.
    if isinstance(extensions, str):
        extensions = [extensions]

    # Validate everything up front so nothing is mutated on failure.
    if not force:
        if name in compr:
            raise ValueError(f"Duplicate compression registration: {name}")
        for ext in extensions:
            if ext in fsspec.utils.compressions:
                raise ValueError(
                    f"Duplicate compression file extension: {ext} ({name})"
                )

    compr[name] = callback
    for ext in extensions:
        fsspec.utils.compressions[ext] = name
53
+
54
+
55
def unzip(infile, mode="rb", filename=None, **kwargs):
    """Treat ``infile`` as a ZIP archive and open one member of it.

    For reading, the member named ``filename`` (or the first member when
    ``filename`` is None) is opened; ``kwargs`` go to ``ZipFile.open``.
    For writing, a fresh archive is created around ``infile`` (``kwargs``
    go to the ``ZipFile`` constructor) and a single member is opened for
    writing; closing that member also finalises the archive.
    """
    if "r" in mode:
        archive = ZipFile(infile)
        member = filename if filename is not None else archive.namelist()[0]
        return archive.open(member, mode="r", **kwargs)
    # Write path: one member per archive.
    member = filename or "file"
    archive = ZipFile(infile, mode="w", **kwargs)
    handle = archive.open(member, mode="w")
    # Closing the member must also close the archive so the central
    # directory gets written.
    handle.close = lambda closer=handle.close: closer() or archive.close()
    return handle
66
+
67
+
68
register_compression("zip", unzip, "zip")

# Optional codecs: each is registered only if its module imports cleanly.
try:
    from bz2 import BZ2File
except ImportError:
    pass
else:
    register_compression("bz2", BZ2File, "bz2")

try:  # pragma: no cover
    # Prefer the ISA-L accelerated gzip implementation when available.
    from isal import igzip

    def isal(infile, mode="rb", **kwargs):
        return igzip.IGzipFile(fileobj=infile, mode=mode, **kwargs)

    register_compression("gzip", isal, "gz")
except ImportError:
    # Fall back to the stdlib gzip implementation.
    from gzip import GzipFile

    register_compression(
        "gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz"
    )

try:
    from lzma import LZMAFile

    register_compression("lzma", LZMAFile, "lzma")
    register_compression("xz", LZMAFile, "xz")
except ImportError:
    pass

try:
    # lzmaffi, when installed, replaces the stdlib lzma/xz handlers
    # registered just above (hence force=True).
    import lzmaffi

    register_compression("lzma", lzmaffi.LZMAFile, "lzma", force=True)
    register_compression("xz", lzmaffi.LZMAFile, "xz", force=True)
except ImportError:
    pass
106
+
107
+
108
class SnappyFile(AbstractBufferedFile):
    """Streaming snappy (de)compression wrapper around a file-like object.

    Read mode decompresses data pulled from ``infile``; write mode
    compresses buffered data and writes it to ``infile``. The stream is
    not seekable.
    """

    def __init__(self, infile, mode, **kwargs):
        import snappy

        super().__init__(
            fs=None, path="snappy", mode=mode.strip("b") + "b", size=999999999, **kwargs
        )
        self.infile = infile
        # Pick the codec direction based on the requested mode.
        if "r" in mode:
            self.codec = snappy.StreamDecompressor()
        else:
            self.codec = snappy.StreamCompressor()

    def _upload_chunk(self, final=False):
        # Compress everything currently buffered and push it downstream.
        self.buffer.seek(0)
        out = self.codec.add_chunk(self.buffer.read())
        self.infile.write(out)
        return True

    def seek(self, loc, whence=0):
        raise NotImplementedError("SnappyFile is not seekable")

    def seekable(self):
        return False

    def _fetch_range(self, start, end):
        """Get the specified set of bytes from remote"""
        # Sequential read only: pull (end - start) compressed bytes and
        # decompress them in order.
        data = self.infile.read(end - start)
        return self.codec.decompress(data)
137
+
138
+
139
try:
    import snappy

    # Probe that the bindings actually work, not merely import.
    snappy.compress(b"")
    # Snappy may use the .sz file extension, but this is not part of the
    # standard implementation.
    register_compression("snappy", SnappyFile, [])

except (ImportError, NameError, AttributeError):
    pass

try:
    import lz4.frame

    register_compression("lz4", lz4.frame.open, "lz4")
except ImportError:
    pass

try:
    import zstandard as zstd

    def zstandard_file(infile, mode="rb"):
        # Direction-specific wrapper: stream reader for reads, writer
        # (fixed level 10) for writes.
        if "r" in mode:
            cctx = zstd.ZstdDecompressor()
            return cctx.stream_reader(infile)
        else:
            cctx = zstd.ZstdCompressor(level=10)
            return cctx.stream_writer(infile)

    register_compression("zstd", zstandard_file, "zst")
except ImportError:
    pass
171
+
172
+
173
def available_compressions():
    """Return the registered compression names (``None`` = no compression)."""
    return [*compr]
.venv/lib/python3.11/site-packages/fsspec/config.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import configparser
4
+ import json
5
+ import os
6
+ import warnings
7
+ from typing import Any
8
+
9
# Global configuration: protocol name -> default kwargs for that filesystem.
conf: dict[str, dict[str, Any]] = {}
# Directory scanned for .ini/.json config files; overridable via the
# FSSPEC_CONFIG_DIR environment variable.
default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/fsspec")
conf_dir = os.environ.get("FSSPEC_CONFIG_DIR", default_conf_dir)
12
+
13
+
14
def set_conf_env(conf_dict, envdict=os.environ):
    """Set config values from environment variables

    Looks for variables of the form ``FSSPEC_<protocol>`` and
    ``FSSPEC_<protocol>_<kwarg>``. For ``FSSPEC_<protocol>`` the value is parsed
    as a json dictionary and used to ``update`` the config of the
    corresponding protocol. For ``FSSPEC_<protocol>_<kwarg>`` there is no
    attempt to convert the string value, but the kwarg keys will be lower-cased.

    The ``FSSPEC_<protocol>_<kwarg>`` variables are applied after the
    ``FSSPEC_<protocol>`` ones.

    Parameters
    ----------
    conf_dict : dict(str, dict)
        This dict will be mutated
    envdict : dict-like(str, str)
        Source for the values - usually the real environment
    """
    deferred_kwarg_vars = []
    for name in envdict:
        if not name.startswith("FSSPEC"):
            continue
        if not (name.startswith("FSSPEC_") and len(name) > 7 and name[7] != "_"):
            # Starts with FSSPEC but is not FSSPEC_<something-sane>.
            warnings.warn(
                f"Ignoring environment variable {name} due to having an unexpected name"
            )
            continue
        if name.count("_") > 1:
            # FSSPEC_<proto>_<kwarg>: applied last, after the dict-style vars.
            deferred_kwarg_vars.append(name)
            continue
        try:
            parsed = json.loads(envdict[name])
        except json.decoder.JSONDecodeError as ex:
            warnings.warn(
                f"Ignoring environment variable {name} due to a parse failure: {ex}"
            )
            continue
        if not isinstance(parsed, dict):
            warnings.warn(
                f"Ignoring environment variable {name} due to not being a dict:"
                f" {type(parsed)}"
            )
            continue
        _, proto = name.split("_", 1)
        conf_dict.setdefault(proto.lower(), {}).update(parsed)

    # Individual kwargs override anything from the dict-style variables.
    for name in deferred_kwarg_vars:
        _, proto, kwarg = name.split("_", 2)
        conf_dict.setdefault(proto.lower(), {})[kwarg.lower()] = envdict[name]
62
+
63
+
64
def set_conf_files(cdir, conf_dict):
    """Set config values from files

    Scans the given directory for INI and JSON files and folds their
    contents into the config, later (alphabetically sorted) files winning
    on repeated values. INI values are kept as strings, unconverted.

    Parameters
    ----------
    cdir : str
        Directory to search
    conf_dict : dict(str, dict)
        This dict will be mutated
    """
    if not os.path.isdir(cdir):
        return
    for fname in sorted(os.listdir(cdir)):
        full = os.path.join(cdir, fname)
        if fname.endswith(".ini"):
            parser = configparser.ConfigParser()
            parser.read(full)
            for section in parser:
                # configparser always exposes a DEFAULT section; skip it.
                if section == "DEFAULT":
                    continue
                conf_dict.setdefault(section, {}).update(dict(parser[section]))
        if fname.endswith(".json"):
            with open(full) as f:
                loaded = json.load(f)
            for section in loaded:
                conf_dict.setdefault(section, {}).update(dict(loaded[section]))
97
+
98
+
99
def apply_config(cls, kwargs, conf_dict=None):
    """Supply default values for kwargs when instantiating class

    Augments the passed kwargs, by finding entries in the config dict
    which match the classes ``.protocol`` attribute (one or more str)

    Parameters
    ----------
    cls : file system implementation
    kwargs : dict
    conf_dict : dict of dict
        Typically this is the global configuration

    Returns
    -------
    dict : the modified set of kwargs
    """
    if conf_dict is None:
        conf_dict = conf
    # ``protocol`` may be a single string or a tuple/list of aliases.
    if isinstance(cls.protocol, (tuple, list)):
        protos = cls.protocol
    else:
        protos = [cls.protocol]
    merged = {}
    for proto in protos:
        # Defaults from the current state of the config.
        merged.update(conf_dict.get(proto, {}))
    # Explicit kwargs always win over configured defaults.
    merged.update(**kwargs)
    return merged
128
+
129
+
130
+ set_conf_files(conf_dir, conf)
131
+ set_conf_env(conf)
.venv/lib/python3.11/site-packages/fsspec/conftest.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+ import sys
5
+ import time
6
+
7
+ import pytest
8
+
9
+ import fsspec
10
+ from fsspec.implementations.cached import CachingFileSystem
11
+
12
+
13
@pytest.fixture()
def m():
    """
    Fixture providing a memory filesystem.

    The shared in-memory store is wiped before the test and again on
    teardown, leaving only the root pseudo-directory.
    """
    fs = fsspec.filesystem("memory")

    def _reset(filesystem):
        # wipe all stored files and restore the single root pseudo-directory
        filesystem.store.clear()
        filesystem.pseudo_dirs.clear()
        filesystem.pseudo_dirs.append("")

    _reset(fs)
    try:
        yield fs
    finally:
        _reset(fs)
28
+
29
+
30
@pytest.fixture
def ftp_writable(tmpdir):
    """
    Fixture providing a writable FTP filesystem.

    Starts a pyftpdlib server as a subprocess serving ``tmpdir`` (seeded
    with one 50000-byte file named "out") and yields the connection tuple
    ``(host, port, username, password)``. The server process is terminated
    and the directory removed on teardown.
    """
    pytest.importorskip("pyftpdlib")
    from fsspec.implementations.ftp import FTPFileSystem

    FTPFileSystem.clear_instance_cache()  # remove lingering connections
    CachingFileSystem.clear_instance_cache()
    d = str(tmpdir)
    # seed the served directory with a known file for read tests
    with open(os.path.join(d, "out"), "wb") as f:
        f.write(b"hello" * 10000)
    # -d: directory to serve; -u/-P: credentials; -w: allow writes
    P = subprocess.Popen(
        [sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"]
    )
    try:
        # give the server a moment to start listening (pyftpdlib default port 2121)
        time.sleep(1)
        yield "localhost", 2121, "user", "pass"
    finally:
        P.terminate()
        P.wait()
        # best-effort cleanup; the server may still hold files briefly
        try:
            shutil.rmtree(tmpdir)
        except Exception:
            pass
.venv/lib/python3.11/site-packages/fsspec/core.py ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import logging
5
+ import os
6
+ import re
7
+ from glob import has_magic
8
+ from pathlib import Path
9
+
10
+ # for backwards compat, we export cache things from here too
11
+ from fsspec.caching import ( # noqa: F401
12
+ BaseCache,
13
+ BlockCache,
14
+ BytesCache,
15
+ MMapCache,
16
+ ReadAheadCache,
17
+ caches,
18
+ )
19
+ from fsspec.compression import compr
20
+ from fsspec.config import conf
21
+ from fsspec.registry import filesystem, get_filesystem_class
22
+ from fsspec.utils import (
23
+ _unstrip_protocol,
24
+ build_name_function,
25
+ infer_compression,
26
+ stringify_path,
27
+ )
28
+
29
+ logger = logging.getLogger("fsspec")
30
+
31
+
32
class OpenFile:
    """
    File-like object to be used in a context

    Can layer (buffered) text-mode and compression over any file-system, which
    are typically binary-only.

    These instances are safe to serialize, as the low-level file object
    is not created until invoked using ``with``.

    Parameters
    ----------
    fs: FileSystem
        The file system to use for opening the file. Should be a subclass or duck-type
        with ``fsspec.spec.AbstractFileSystem``
    path: str
        Location to open
    mode: str like 'rb', optional
        Mode of the opened file
    compression: str or None, optional
        Compression to apply
    encoding: str or None, optional
        The encoding to use if opened in text mode.
    errors: str or None, optional
        How to handle encoding errors if opened in text mode.
    newline: None or str
        Passed to TextIOWrapper in text mode, how to handle line endings.
    """

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        compression=None,
        encoding=None,
        errors=None,
        newline=None,
    ):
        self.fs = fs
        self.path = path
        self.mode = mode
        # "infer" is resolved from the filename suffix here, at construction
        self.compression = get_compression(path, compression)
        self.encoding = encoding
        self.errors = errors
        self.newline = newline
        # stack of layered file objects: raw, then optional decompressor,
        # then optional text wrapper; filled by __enter__
        self.fobjects = []

    def __reduce__(self):
        # pickle only the recipe, never the live handles in ``fobjects``
        return (
            OpenFile,
            (
                self.fs,
                self.path,
                self.mode,
                self.compression,
                self.encoding,
                self.errors,
                self.newline,
            ),
        )

    def __repr__(self):
        return f"<OpenFile '{self.path}'>"

    def __enter__(self):
        # always open the raw file in binary mode; text is layered on top below
        mode = self.mode.replace("t", "").replace("b", "") + "b"

        try:
            f = self.fs.open(self.path, mode=mode)
        except FileNotFoundError as e:
            # a glob-looking path that does not exist usually means the caller
            # forgot to expand it; give a pointed hint
            if has_magic(self.path):
                raise FileNotFoundError(
                    "%s not found. The URL contains glob characters: you maybe needed\n"
                    "to pass expand=True in fsspec.open() or the storage_options of \n"
                    "your library. You can also set the config value 'open_expand'\n"
                    "before import, or fsspec.core.DEFAULT_EXPAND at runtime, to True.",
                    self.path,
                ) from e
            raise

        self.fobjects = [f]

        if self.compression is not None:
            compress = compr[self.compression]
            f = compress(f, mode=mode[0])
            self.fobjects.append(f)

        if "b" not in self.mode:
            # assume, for example, that 'r' is equivalent to 'rt' as in builtin
            f = PickleableTextIOWrapper(
                f, encoding=self.encoding, errors=self.errors, newline=self.newline
            )
            self.fobjects.append(f)

        # the outermost layer is what the caller interacts with
        return self.fobjects[-1]

    def __exit__(self, *args):
        self.close()

    @property
    def full_name(self):
        # path with the filesystem's protocol prefix re-attached
        return _unstrip_protocol(self.path, self.fs)

    def open(self):
        """Materialise this as a real open file without context

        The OpenFile object should be explicitly closed to avoid enclosed file
        instances persisting. You must, therefore, keep a reference to the OpenFile
        during the life of the file-like it generates.
        """
        return self.__enter__()

    def close(self):
        """Close all encapsulated file objects"""
        # close outermost-first so each wrapper can flush into the layer below
        for f in reversed(self.fobjects):
            if "r" not in self.mode and not f.closed:
                f.flush()
            f.close()
        self.fobjects.clear()
156
+
157
+
158
class OpenFiles(list):
    """List of OpenFile instances

    Can be used in a single context, which opens and closes all of the
    contained files. Normal list access to get the elements works as
    normal.

    A special case is made for caching filesystems - the files will
    be down/uploaded together at the start or end of the context, and
    this may happen concurrently, if the target filesystem supports it.
    """

    def __init__(self, *args, mode="rb", fs=None):
        self.mode = mode
        self.fs = fs
        # populated by ``open_many`` in __enter__ for batch-capable filesystems
        self.files = []
        super().__init__(*args)

    def __enter__(self):
        if self.fs is None:
            raise ValueError("Context has already been used")

        fs = self.fs
        # walk down any wrapper filesystems (fs.fs) looking for batch support
        while True:
            if hasattr(fs, "open_many"):
                # check for concurrent cache download; or set up for upload
                self.files = fs.open_many(self)
                return self.files
            if hasattr(fs, "fs") and fs.fs is not None:
                fs = fs.fs
            else:
                break
        # no batch support anywhere in the chain: open each file individually
        return [s.__enter__() for s in self]

    def __exit__(self, *args):
        fs = self.fs
        [s.__exit__(*args) for s in self]
        if "r" not in self.mode:
            # write mode: let a batch-capable filesystem commit the uploads
            while True:
                if hasattr(fs, "open_many"):
                    # check for concurrent cache upload
                    fs.commit_many(self.files)
                    return
                if hasattr(fs, "fs") and fs.fs is not None:
                    fs = fs.fs
                else:
                    break

    def __getitem__(self, item):
        out = super().__getitem__(item)
        if isinstance(item, slice):
            # slicing returns another context-capable OpenFiles
            return OpenFiles(out, mode=self.mode, fs=self.fs)
        return out

    def __repr__(self):
        return f"<List of {len(self)} OpenFile instances>"
214
+
215
+
216
def open_files(
    urlpath,
    mode="rb",
    compression=None,
    encoding="utf8",
    errors=None,
    name_function=None,
    num=1,
    protocol=None,
    newline=None,
    auto_mkdir=True,
    expand=True,
    **kwargs,
):
    """Given a path or paths, return a list of ``OpenFile`` objects.

    For writing, a str path must contain the "*" character, which will be filled
    in by increasing numbers, e.g., "part*" -> "part1", "part2" if num=2.

    For either reading or writing, can instead provide explicit list of paths.

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
        to read from alternative filesystems. To read from multiple files you
        can pass a globstring or a list of paths, with the caveat that they
        must all have the same protocol.
    mode: 'rb', 'wt', etc.
    compression: string or None
        If given, open file using compression codec. Can either be a compression
        name (a key in ``fsspec.compression.compr``) or "infer" to guess the
        compression from the filename suffix.
    encoding: str
        For text mode only
    errors: None or str
        Passed to TextIOWrapper in text mode
    name_function: function or None
        if opening a set of files for writing, those files do not yet exist,
        so we need to generate their names by formatting the urlpath for
        each sequence number
    num: int [1]
        if writing mode, number of files we expect to create (passed to
        name_function)
    protocol: str or None
        If given, overrides the protocol found in the URL.
    newline: bytes or None
        Used for line terminator in text mode. If None, uses system default;
        if blank, uses no translation.
    auto_mkdir: bool (True)
        If in write mode, this will ensure the target directory exists before
        writing, by calling ``fs.mkdirs(exist_ok=True)``.
    expand: bool
    **kwargs: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Examples
    --------
    >>> files = open_files('2015-*-*.csv')  # doctest: +SKIP
    >>> files = open_files(
    ...     's3://bucket/2015-*-*.csv.gz', compression='gzip'
    ... )  # doctest: +SKIP

    Returns
    -------
    An ``OpenFiles`` instance, which is a list of ``OpenFile`` objects that can
    be used as a single context

    Notes
    -----
    For a full list of the available protocols and the implementations that
    they map across to see the latest online documentation:

    - For implementations built into ``fsspec`` see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
    - For implementations in separate packages see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
    """
    fs, fs_token, paths = get_fs_token_paths(
        urlpath,
        mode,
        num=num,
        name_function=name_function,
        storage_options=kwargs,
        protocol=protocol,
        expand=expand,
    )
    if fs.protocol == "file":
        # the local filesystem creates directories itself when asked to
        fs.auto_mkdir = auto_mkdir
    elif "r" not in mode and auto_mkdir:
        # pre-create every distinct parent directory before writing
        for parent in {fs._parent(p) for p in paths}:
            try:
                fs.makedirs(parent, exist_ok=True)
            except PermissionError:
                pass
    openfiles = [
        OpenFile(
            fs,
            p,
            mode=mode,
            compression=compression,
            encoding=encoding,
            errors=errors,
            newline=newline,
        )
        for p in paths
    ]
    return OpenFiles(openfiles, mode=mode, fs=fs)
329
+
330
+
331
def _un_chain(path, kwargs):
    """Split a chained URL ("outer::inner://...") into its component links.

    Returns a list of ``(url, protocol, kwargs)`` tuples in the same
    left-to-right (outermost-first) order as the input string.
    """
    # Avoid a circular import
    from fsspec.implementations.cached import CachingFileSystem

    if "::" in path:
        x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
        bits = []
        for p in path.split("::"):
            if "://" in p or x.match(p):
                bits.append(p)
            else:
                # bare protocol word (e.g. "simplecache"): normalise to a URL
                bits.append(p + "://")
    else:
        bits = [path]
    # [[url, protocol, kwargs], ...]
    out = []
    previous_bit = None
    # copy: the loop destructively pops protocol-specific entries
    kwargs = kwargs.copy()
    for bit in reversed(bits):
        protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file"
        cls = get_filesystem_class(protocol)
        extra_kwargs = cls._get_kwargs_from_urls(bit)
        kws = kwargs.pop(protocol, {})
        if bit is bits[0]:
            # identity check: only the outermost link absorbs any remaining
            # un-namespaced kwargs
            kws.update(kwargs)
        # combine URL-derived kwargs with the explicit per-protocol kwargs
        kw = dict(
            **{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
            **kws,
        )
        bit = cls._strip_protocol(bit)
        if "target_protocol" not in kw and issubclass(cls, CachingFileSystem):
            # caching layers address the path of the link they wrap, not
            # their own; substitute the previously processed (inner) path
            bit = previous_bit
        out.append((bit, protocol, kw))
        previous_bit = bit
    # restore outermost-first order (the loop walked innermost-first)
    out.reverse()
    return out
367
+
368
+
369
def url_to_fs(url, **kwargs):
    """
    Turn fully-qualified and potentially chained URL into filesystem instance

    Parameters
    ----------
    url : str
        The fsspec-compatible URL
    **kwargs: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Returns
    -------
    filesystem : FileSystem
        The new filesystem discovered from ``url`` and created with
        ``**kwargs``.
    urlpath : str
        The file-systems-specific URL for ``url``.
    """
    url = stringify_path(url)
    # discard arguments that belong to fsspec.open() rather than a filesystem
    # (inspect could keep this in sync with open()'s signature)
    known_kwargs = {
        "compression",
        "encoding",
        "errors",
        "expand",
        "mode",
        "name_function",
        "newline",
        "num",
    }
    fs_kwargs = {k: v for k, v in kwargs.items() if k not in known_kwargs}
    chain = _un_chain(url, fs_kwargs)
    # Walk the chain innermost-first, nesting each link's options inside the
    # next via target_protocol/target_options/fo
    nested = {}
    last = len(chain) - 1
    for position, (link_url, link_protocol, link_kw) in enumerate(reversed(chain)):
        if position == last:
            # outermost link: merge its kwargs with the accumulated nesting
            nested = dict(**link_kw, **nested)
        else:
            nested = {
                "target_options": dict(**link_kw, **nested),
                "target_protocol": link_protocol,
                "fo": link_url,
            }
    urlpath, protocol, _ = chain[0]
    fs = filesystem(protocol, **nested)
    return fs, urlpath
417
+
418
+
419
# Module-wide default for the ``expand`` argument of fsspec.open(); seeded
# once at import time from the "open_expand" config key (False when unset).
DEFAULT_EXPAND = conf.get("open_expand", False)
420
+
421
+
422
def open(
    urlpath,
    mode="rb",
    compression=None,
    encoding="utf8",
    errors=None,
    protocol=None,
    newline=None,
    expand=None,
    **kwargs,
):
    """Given a path or paths, return one ``OpenFile`` object.

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath. Prefix with a protocol like ``s3://``
        to read from alternative filesystems. Should not include glob
        character(s).
    mode: 'rb', 'wt', etc.
    compression: string or None
        If given, open file using compression codec. Can either be a compression
        name (a key in ``fsspec.compression.compr``) or "infer" to guess the
        compression from the filename suffix.
    encoding: str
        For text mode only
    errors: None or str
        Passed to TextIOWrapper in text mode
    protocol: str or None
        If given, overrides the protocol found in the URL.
    newline: bytes or None
        Used for line terminator in text mode. If None, uses system default;
        if blank, uses no translation.
    expand: bool or None
        Whether to regard file paths containing special glob characters as needing
        expansion (finding the first match) or absolute. Setting False allows using
        paths which do embed such characters. If None (default), this argument
        takes its value from the DEFAULT_EXPAND module variable, which takes
        its initial value from the "open_expand" config value at startup, which will
        be False if not set.
    **kwargs: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Examples
    --------
    >>> openfile = open('2015-01-01.csv')  # doctest: +SKIP
    >>> openfile = open(
    ...     's3://bucket/2015-01-01.csv.gz', compression='gzip'
    ... )  # doctest: +SKIP
    >>> with openfile as f:
    ...     df = pd.read_csv(f)  # doctest: +SKIP
    ...

    Returns
    -------
    ``OpenFile`` object.

    Notes
    -----
    For a full list of the available protocols and the implementations that
    they map across to see the latest online documentation:

    - For implementations built into ``fsspec`` see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
    - For implementations in separate packages see
      https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
    """
    expand = DEFAULT_EXPAND if expand is None else expand
    # delegate to open_files with a single-element list and take the one result
    out = open_files(
        urlpath=[urlpath],
        mode=mode,
        compression=compression,
        encoding=encoding,
        errors=errors,
        protocol=protocol,
        newline=newline,
        expand=expand,
        **kwargs,
    )
    if not out:
        raise FileNotFoundError(urlpath)
    return out[0]
505
+
506
+
507
def open_local(
    url: str | list[str] | Path | list[Path],
    mode: str = "rb",
    **storage_options: dict,
) -> str | list[str]:
    """Open file(s) which can be resolved to local

    For files which either are local, or get downloaded upon open
    (e.g., by file caching)

    Parameters
    ----------
    url: str or list(str)
    mode: str
        Must be read mode
    storage_options:
        passed on to FS for or used by open_files (e.g., compression)
    """
    if "r" not in mode:
        raise ValueError("Can only ensure local files when reading")
    openfiles = open_files(url, mode=mode, **storage_options)
    fs = openfiles[0].fs
    if not getattr(fs, "local_file", False):
        raise ValueError(
            "open_local can only be used on a filesystem which"
            " has attribute local_file=True"
        )
    # entering the context materialises the files locally
    with openfiles as handles:
        local_paths = [handle.name for handle in handles]
    # scalar input (non-glob string or Path) yields a scalar result
    single = isinstance(url, Path) or (isinstance(url, str) and not has_magic(url))
    return local_paths[0] if single else local_paths
538
+
539
+
540
def get_compression(urlpath, compression):
    """Resolve the compression argument, inferring from the path if requested."""
    resolved = infer_compression(urlpath) if compression == "infer" else compression
    if resolved is not None and resolved not in compr:
        raise ValueError(f"Compression type {resolved} not supported")
    return resolved
546
+
547
+
548
def split_protocol(urlpath):
    """Return (protocol, path) pair for a URL-like path."""
    urlpath = stringify_path(urlpath)
    scheme, sep, remainder = urlpath.partition("://")
    # a single-letter "protocol" is really a Windows drive letter, not a scheme
    if sep and len(scheme) > 1:
        return scheme, remainder
    if urlpath.startswith("data:"):
        return urlpath.split(":", 1)
    return None, urlpath
559
+
560
+
561
def strip_protocol(urlpath):
    """Return only path part of full URL, according to appropriate backend"""
    protocol = split_protocol(urlpath)[0]
    filesystem_class = get_filesystem_class(protocol)
    return filesystem_class._strip_protocol(urlpath)
566
+
567
+
568
def expand_paths_if_needed(paths, mode, num, fs, name_function):
    """Expand paths if they have a ``*`` in them (write mode) or any of ``*?[]``
    in them (read mode).

    :param paths: list of paths
    mode: str
        Mode in which to open files.
    num: int
        If opening in writing mode, number of files we expect to create.
    fs: filesystem object
    name_function: callable
        If opening in writing mode, this callable is used to generate path
        names. Names are generated for each partition by
        ``urlpath.replace('*', name_function(partition_index))``.
    :return: list of paths
    """
    expanded_paths = []
    paths = list(paths)

    # "x" (exclusive create) is a write mode too, matching the handling in
    # get_fs_token_paths; output names must be generated, never globbed
    if "w" in mode or "x" in mode:  # write mode
        # only a single "*" mask makes sense when generating output names
        if sum(1 for p in paths if "*" in p) > 1:
            raise ValueError(
                "When writing data, only one filename mask can be specified."
            )
        num = max(num, len(paths))

        for curr_path in paths:
            if "*" in curr_path:
                # expand using name_function
                expanded_paths.extend(_expand_paths(curr_path, name_function, num))
            else:
                expanded_paths.append(curr_path)
        # if we generated more paths than asked for, trim the list
        if len(expanded_paths) > num:
            expanded_paths = expanded_paths[:num]

    else:  # read mode
        for curr_path in paths:
            if has_magic(curr_path):
                # expand using glob
                expanded_paths.extend(fs.glob(curr_path))
            else:
                expanded_paths.append(curr_path)

    return expanded_paths
613
+
614
+
615
def get_fs_token_paths(
    urlpath,
    mode="rb",
    num=1,
    name_function=None,
    storage_options=None,
    protocol=None,
    expand=True,
):
    """Filesystem, deterministic token, and paths from a urlpath and options.

    Parameters
    ----------
    urlpath: string or iterable
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    mode: str, optional
        Mode in which to open files.
    num: int, optional
        If opening in writing mode, number of files we expect to create.
    name_function: callable, optional
        If opening in writing mode, this callable is used to generate path
        names. Names are generated for each partition by
        ``urlpath.replace('*', name_function(partition_index))``.
    storage_options: dict, optional
        Additional keywords to pass to the filesystem class.
    protocol: str or None
        To override the protocol specifier in the URL
    expand: bool
        Expand string paths for writing, assuming the path is a directory
    """
    if isinstance(urlpath, (list, tuple, set)):
        if not urlpath:
            raise ValueError("empty urlpath sequence")
        # the first element determines protocol and filesystem options
        urlpath0 = stringify_path(next(iter(urlpath)))
    else:
        urlpath0 = stringify_path(urlpath)
    storage_options = storage_options or {}
    if protocol:
        # explicit protocol overrides whatever the URL specifies
        storage_options["protocol"] = protocol
    chain = _un_chain(urlpath0, storage_options or {})
    inkwargs = {}
    # Reverse iterate the chain, creating a nested target_* structure
    for i, ch in enumerate(reversed(chain)):
        urls, nested_protocol, kw = ch
        if i == len(chain) - 1:
            # outermost link: merge its kwargs with the accumulated nesting
            inkwargs = dict(**kw, **inkwargs)
            continue
        inkwargs["target_options"] = dict(**kw, **inkwargs)
        inkwargs["target_protocol"] = nested_protocol
        inkwargs["fo"] = urls
    paths, protocol, _ = chain[0]
    fs = filesystem(protocol, **inkwargs)
    if isinstance(urlpath, (list, tuple, set)):
        # un-chain each element, requiring that all share one protocol
        pchains = [
            _un_chain(stringify_path(u), storage_options or {})[0] for u in urlpath
        ]
        if len({pc[1] for pc in pchains}) > 1:
            raise ValueError("Protocol mismatch getting fs from %s", urlpath)
        paths = [pc[0] for pc in pchains]
    else:
        paths = fs._strip_protocol(paths)
    if isinstance(paths, (list, tuple, set)):
        if expand:
            paths = expand_paths_if_needed(paths, mode, num, fs, name_function)
        elif not isinstance(paths, list):
            paths = list(paths)
    else:
        if ("w" in mode or "x" in mode) and expand:
            # writing: generate output names from the "*" template
            paths = _expand_paths(paths, name_function, num)
        elif "*" in paths:
            # reading: glob, excluding directories
            paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
        else:
            paths = [paths]

    return fs, fs._fs_token, paths
691
+
692
+
693
+ def _expand_paths(path, name_function, num):
694
+ if isinstance(path, str):
695
+ if path.count("*") > 1:
696
+ raise ValueError("Output path spec must contain exactly one '*'.")
697
+ elif "*" not in path:
698
+ path = os.path.join(path, "*.part")
699
+
700
+ if name_function is None:
701
+ name_function = build_name_function(num - 1)
702
+
703
+ paths = [path.replace("*", name_function(i)) for i in range(num)]
704
+ if paths != sorted(paths):
705
+ logger.warning(
706
+ "In order to preserve order between partitions"
707
+ " paths created with ``name_function`` should "
708
+ "sort to partition order"
709
+ )
710
+ elif isinstance(path, (tuple, list)):
711
+ assert len(path) == num
712
+ paths = list(path)
713
+ else:
714
+ raise ValueError(
715
+ "Path should be either\n"
716
+ "1. A list of paths: ['foo.json', 'bar.json', ...]\n"
717
+ "2. A directory: 'foo/\n"
718
+ "3. A path with a '*' in it: 'foo.*.json'"
719
+ )
720
+ return paths
721
+
722
+
723
class PickleableTextIOWrapper(io.TextIOWrapper):
    """A picklable variant of :class:`io.TextIOWrapper`.

    Plain TextIOWrapper instances cannot be pickled; this subclass records
    its constructor arguments and rebuilds itself from them on unpickling.
    Requires that ``buffer`` itself be pickleable, which all instances of
    AbstractBufferedFile are.
    """

    def __init__(
        self,
        buffer,
        encoding=None,
        errors=None,
        newline=None,
        line_buffering=False,
        write_through=False,
    ):
        # remember the exact constructor arguments so __reduce__ can
        # recreate an equivalent wrapper
        self.args = (
            buffer,
            encoding,
            errors,
            newline,
            line_buffering,
            write_through,
        )
        super().__init__(*self.args)

    def __reduce__(self):
        # rebuild by calling the class again with the saved arguments
        return PickleableTextIOWrapper, self.args
.venv/lib/python3.11/site-packages/fsspec/dircache.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from collections.abc import MutableMapping
3
+ from functools import lru_cache
4
+
5
+
6
class DirCache(MutableMapping):
    """
    Caching of directory listings, in a structure like::

        {"path0": [
            {"name": "path0/file0",
             "size": 123,
             "type": "file",
             ...
            },
            {"name": "path0/file1",
            },
            ...
            ],
         "path1": [...]
        }

    Parameters to this class control listing expiry or indeed turn
    caching off
    """

    def __init__(
        self,
        use_listings_cache=True,
        listings_expiry_time=None,
        max_paths=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        use_listings_cache: bool
            If False, this cache never returns items, but always reports KeyError,
            and setting items has no effect
        listings_expiry_time: int or float (optional)
            Time in seconds that a listing is considered valid. If None,
            listings do not expire.
        max_paths: int (optional)
            The number of most recent listings that are considered valid; 'recent'
            refers to when the entry was set.
        """
        self._cache = {}
        # per-key timestamp of when the listing was stored (only maintained
        # when listings_expiry_time is set)
        self._times = {}
        if max_paths:
            # LRU bookkeeping: the first call of ``_q`` for a key pops any
            # stale entry from ``_cache``; further calls are memoised hits.
            # Keys that fall out of the lru memo are dropped from ``_cache``
            # lazily, on their next access.
            self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(key, None))
        self.use_listings_cache = use_listings_cache
        self.listings_expiry_time = listings_expiry_time
        self.max_paths = max_paths

    def __getitem__(self, item):
        if self.listings_expiry_time is not None:
            if self._times.get(item, 0) - time.time() < -self.listings_expiry_time:
                # expired: drop both the listing and its timestamp
                # (the timestamp was previously leaked, growing forever)
                self._times.pop(item, None)
                del self._cache[item]
        if self.max_paths:
            self._q(item)
        return self._cache[item]  # maybe raises KeyError

    def clear(self):
        self._cache.clear()
        self._times.clear()

    def __len__(self):
        return len(self._cache)

    def __contains__(self, item):
        try:
            self[item]
            return True
        except KeyError:
            return False

    def __setitem__(self, key, value):
        if not self.use_listings_cache:
            return
        if self.max_paths:
            self._q(key)
        self._cache[key] = value
        if self.listings_expiry_time is not None:
            self._times[key] = time.time()

    def __delitem__(self, key):
        del self._cache[key]
        # keep the timestamp dict in step with the cache
        self._times.pop(key, None)

    def __iter__(self):
        entries = list(self._cache)

        # re-check each key so expired entries are not yielded
        return (k for k in entries if k in self)

    def __reduce__(self):
        return (
            DirCache,
            (self.use_listings_cache, self.listings_expiry_time, self.max_paths),
        )
.venv/lib/python3.11/site-packages/fsspec/exceptions.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ fsspec user-defined exception classes
3
+ """
4
+
5
+ import asyncio
6
+
7
+
8
class BlocksizeMismatchError(ValueError):
    """
    Raised when a cached file is opened with a different blocksize than it was
    written with.

    Subclasses ``ValueError`` so existing generic handlers still catch it.
    """
13
+
14
+
15
class FSTimeoutError(asyncio.TimeoutError):
    """
    Raised when an fsspec operation times out.

    Subclasses ``asyncio.TimeoutError`` so callers awaiting async fsspec
    operations can catch either exception type.
    """
.venv/lib/python3.11/site-packages/fsspec/fuse.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import stat
5
+ import threading
6
+ import time
7
+ from errno import EIO, ENOENT
8
+
9
+ from fuse import FUSE, FuseOSError, LoggingMixIn, Operations
10
+
11
+ from fsspec import __version__
12
+ from fsspec.core import url_to_fs
13
+
14
+ logger = logging.getLogger("fsspec.fuse")
15
+
16
+
17
class FUSEr(Operations):
    """fusepy ``Operations`` implementation backed by an fsspec filesystem.

    Translates kernel FUSE callbacks into fsspec calls, mapping paths under
    ``root`` on the given filesystem into the mounted directory. Open file
    objects are kept in ``self.cache``, keyed by an ever-increasing integer
    file handle (``self.counter``).
    """

    def __init__(self, fs, path, ready_file=False):
        """
        Parameters
        ----------
        fs: fsspec filesystem instance
            Backend providing the mounted files.
        path: str
            Root location on ``fs`` that is exposed at the mount point.
        ready_file: bool
            If True, a synthetic ``/.fuse_ready`` file is reported, which
            external processes can poll to detect that the mount is live.
        """
        self.fs = fs
        self.cache = {}  # file handle (int) -> open fsspec file object
        self.root = path.rstrip("/") + "/"
        self.counter = 0  # next file handle to hand out
        logger.info("Starting FUSE at %s", path)
        self._ready_file = ready_file

    def getattr(self, path, fh=None):
        """Return a stat-like dict for ``path``; ENOENT if it does not exist."""
        logger.debug("getattr %s", path)
        if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
            # synthetic readiness marker, served without touching the backend
            return {"type": "file", "st_size": 5}

        path = "".join([self.root, path.lstrip("/")]).rstrip("/")
        try:
            info = self.fs.info(path)
        except FileNotFoundError as exc:
            raise FuseOSError(ENOENT) from exc

        # fall back to uid/gid 1000 and permissive mode when the backend
        # does not report ownership/permissions
        data = {"st_uid": info.get("uid", 1000), "st_gid": info.get("gid", 1000)}
        perm = info.get("mode", 0o777)

        if info["type"] != "file":
            data["st_mode"] = stat.S_IFDIR | perm
            data["st_size"] = 0
            data["st_blksize"] = 0
        else:
            data["st_mode"] = stat.S_IFREG | perm
            data["st_size"] = info["size"]
            data["st_blksize"] = 5 * 2**20
            data["st_nlink"] = 1
        # backends that don't track timestamps get "now"
        data["st_atime"] = info["atime"] if "atime" in info else time.time()
        data["st_ctime"] = info["ctime"] if "ctime" in info else time.time()
        data["st_mtime"] = info["mtime"] if "mtime" in info else time.time()
        return data

    def readdir(self, path, fh):
        """List basenames of entries under ``path`` plus "." and ".."."""
        logger.debug("readdir %s", path)
        path = "".join([self.root, path.lstrip("/")])
        files = self.fs.ls(path, False)
        files = [os.path.basename(f.rstrip("/")) for f in files]
        return [".", ".."] + files

    def mkdir(self, path, mode):
        path = "".join([self.root, path.lstrip("/")])
        self.fs.mkdir(path)
        return 0

    def rmdir(self, path):
        path = "".join([self.root, path.lstrip("/")])
        self.fs.rmdir(path)
        return 0

    def read(self, path, size, offset, fh):
        """Read ``size`` bytes at ``offset`` from the open handle ``fh``."""
        logger.debug("read %s", (path, size, offset))
        if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
            # status indicator
            return b"ready"

        f = self.cache[fh]
        f.seek(offset)
        out = f.read(size)
        return out

    def write(self, path, data, offset, fh):
        """Write ``data`` at ``offset`` on handle ``fh``; returns bytes written."""
        logger.debug("write %s", (path, offset))
        f = self.cache[fh]
        f.seek(offset)
        f.write(data)
        return len(data)

    def create(self, path, flags, fi=None):
        """Create a new file and return a fresh file handle."""
        logger.debug("create %s", (path, flags))
        fn = "".join([self.root, path.lstrip("/")])
        self.fs.touch(fn)  # OS will want to get attributes immediately
        f = self.fs.open(fn, "wb")
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def open(self, path, flags):
        """Open a file for reading or writing and return a file handle."""
        logger.debug("open %s", (path, flags))
        fn = "".join([self.root, path.lstrip("/")])
        if flags % 2 == 0:
            # read (O_RDONLY has the low bit clear)
            mode = "rb"
        else:
            # write/create
            mode = "wb"
        self.cache[self.counter] = self.fs.open(fn, mode)
        self.counter += 1
        return self.counter - 1

    def truncate(self, path, length, fh=None):
        """Only truncation to zero length is supported (via touch)."""
        fn = "".join([self.root, path.lstrip("/")])
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.fs.touch(fn)

    def unlink(self, path):
        fn = "".join([self.root, path.lstrip("/")])
        try:
            self.fs.rm(fn, False)
        except (OSError, FileNotFoundError) as exc:
            raise FuseOSError(EIO) from exc

    def release(self, path, fh):
        """Close and forget the file object behind handle ``fh``.

        Must never propagate an exception back into the kernel, so failures
        are logged (previously they were print()ed to stdout).
        """
        try:
            if fh in self.cache:
                f = self.cache[fh]
                f.close()
                self.cache.pop(fh)
        except Exception:
            # log through the module logger instead of printing to stdout
            logger.exception("Failed to release %s (fh=%s)", path, fh)
        return 0

    def chmod(self, path, mode):
        """Delegate to the backend's chmod if it has one; else unsupported."""
        if hasattr(self.fs, "chmod"):
            path = "".join([self.root, path.lstrip("/")])
            return self.fs.chmod(path, mode)
        raise NotImplementedError
140
+
141
+
142
def run(
    fs,
    path,
    mount_point,
    foreground=True,
    threads=False,
    ready_file=False,
    ops_class=FUSEr,
):
    """Mount ``path`` of filesystem ``fs`` at the local directory ``mount_point``.

    Uses fusepy to present the given fsspec location as if it were resident
    in the local file-system. Requires fusepy to be installed and FUSE to be
    available on the system (typically via a package from apt, yum, brew...).

    Parameters
    ----------
    fs: file-system instance
        From one of the compatible implementations
    path: str
        Location on that file-system to regard as the root directory to
        mount. Note that you typically should include the terminating "/"
        character.
    mount_point: str
        An empty directory on the local file-system where the contents of
        the remote path will appear.
    foreground: bool
        Whether or not calling this function will block. Operation will
        typically be more stable if True.
    threads: bool
        Whether or not to create threads when responding to file operations
        within the mounted directory. Operation will typically be more
        stable if False.
    ready_file: bool
        Whether the FUSE process is ready. The ``.fuse_ready`` file will
        exist in the ``mount_point`` directory if True. Debugging purpose.
    ops_class: FUSEr or Subclass of FUSEr
        To override the default behavior of FUSEr. For example, logging
        to file.
    """

    def start_fuse():
        FUSE(
            ops_class(fs, path, ready_file=ready_file),
            mount_point,
            nothreads=not threads,
            foreground=foreground,
        )

    if not foreground:
        # run the mount on a daemon thread and hand it back to the caller
        worker = threading.Thread(target=start_fuse, daemon=True)
        worker.start()
        return worker
    else:  # pragma: no cover
        try:
            start_fuse()
        except KeyboardInterrupt:
            pass
202
+
203
+
204
def main(args):
    """Mount filesystem from chained URL to MOUNT_POINT.

    Examples:

    python3 -m fsspec.fuse memory /usr/share /tmp/mem

    python3 -m fsspec.fuse local /tmp/source /tmp/local \\
            -l /tmp/fsspecfuse.log

    You can also mount chained-URLs and use special settings:

    python3 -m fsspec.fuse 'filecache::zip::file://data.zip' \\
            / /tmp/zip \\
            -o 'filecache-cache_storage=/tmp/simplecache'

    You can specify the type of the setting by using `[int]` or `[bool]`,
    (`true`, `yes`, `1` represents the Boolean value `True`):

    python3 -m fsspec.fuse 'simplecache::ftp://ftp1.at.proftpd.org' \\
            /historic/packages/RPMS /tmp/ftp \\
            -o 'simplecache-cache_storage=/tmp/simplecache' \\
            -o 'simplecache-check_files=false[bool]' \\
            -o 'ftp-listings_expiry_time=60[int]' \\
            -o 'ftp-username=anonymous' \\
            -o 'ftp-password=xieyanbo'
    """

    class RawDescriptionArgumentParser(argparse.ArgumentParser):
        # Replace argparse's re-wrapped description with the raw docstring,
        # so the example commands above keep their formatting in --help.
        def format_help(self):
            usage = super().format_help()
            parts = usage.split("\n\n")
            parts[1] = self.description.rstrip()
            return "\n\n".join(parts)

    parser = RawDescriptionArgumentParser(prog="fsspec.fuse", description=main.__doc__)
    parser.add_argument("--version", action="version", version=__version__)
    parser.add_argument("url", type=str, help="fs url")
    parser.add_argument("source_path", type=str, help="source directory in fs")
    parser.add_argument("mount_point", type=str, help="local directory")
    parser.add_argument(
        "-o",
        "--option",
        action="append",
        help="Any options of protocol included in the chained URL",
    )
    parser.add_argument(
        "-l", "--log-file", type=str, help="Logging FUSE debug info (Default: '')"
    )
    # NOTE(review): the three flags below use action="store_false", so the
    # parsed values default to True and passing the flag sets them to False --
    # the opposite of the "(Default: False)" wording in the help text. Confirm
    # the intended behavior before changing; switching the action would change
    # the CLI's runtime defaults.
    parser.add_argument(
        "-f",
        "--foreground",
        action="store_false",
        help="Running in foreground or not (Default: False)",
    )
    parser.add_argument(
        "-t",
        "--threads",
        action="store_false",
        help="Running with threads support (Default: False)",
    )
    parser.add_argument(
        "-r",
        "--ready-file",
        action="store_false",
        help="The `.fuse_ready` file will exist after FUSE is ready. "
        "(Debugging purpose, Default: False)",
    )
    args = parser.parse_args(args)

    kwargs = {}
    for item in args.option or []:
        key, sep, value = item.partition("=")
        if not sep:
            parser.error(message=f"Wrong option: {item!r}")
        val = value.lower()
        # Optional type suffixes: "...[int]" / "...[bool]" coerce the value;
        # anything else stays a string.
        if val.endswith("[int]"):
            value = int(value[: -len("[int]")])
        elif val.endswith("[bool]"):
            value = val[: -len("[bool]")] in ["1", "yes", "true"]

        # "fsname-setting=value" targets a specific protocol of a chained
        # URL; a bare key applies at the top level.
        if "-" in key:
            fs_name, setting_name = key.split("-", 1)
            if fs_name in kwargs:
                kwargs[fs_name][setting_name] = value
            else:
                kwargs[fs_name] = {setting_name: value}
        else:
            kwargs[key] = value

    if args.log_file:
        logging.basicConfig(
            level=logging.DEBUG,
            filename=args.log_file,
            format="%(asctime)s %(message)s",
        )

        # mix in fusepy's LoggingMixIn so every FUSE operation is logged
        class LoggingFUSEr(FUSEr, LoggingMixIn):
            pass

        fuser = LoggingFUSEr
    else:
        fuser = FUSEr

    fs, url_path = url_to_fs(args.url, **kwargs)
    logger.debug("Mounting %s to %s", url_path, str(args.mount_point))
    run(
        fs,
        args.source_path,
        args.mount_point,
        foreground=args.foreground,
        threads=args.threads,
        ready_file=args.ready_file,
        ops_class=fuser,
    )
319
+
320
+
321
if __name__ == "__main__":
    # CLI entry point: ``python -m fsspec.fuse URL SOURCE_PATH MOUNT_POINT``
    import sys

    main(sys.argv[1:])
.venv/lib/python3.11/site-packages/fsspec/generic.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import inspect
4
+ import logging
5
+ import os
6
+ import shutil
7
+ import uuid
8
+ from typing import Optional
9
+
10
+ from .asyn import AsyncFileSystem, _run_coros_in_chunks, sync_wrapper
11
+ from .callbacks import DEFAULT_CALLBACK
12
+ from .core import filesystem, get_filesystem_class, split_protocol, url_to_fs
13
+
14
+ _generic_fs = {}
15
+ logger = logging.getLogger("fsspec.generic")
16
+
17
+
18
def set_generic_fs(protocol, **storage_options):
    """Pre-instantiate and register the filesystem that the "generic"
    resolution method will use for this protocol (see ``_resolve_fs``).
    """
    _generic_fs[protocol] = filesystem(protocol, **storage_options)
20
+
21
+
22
# Module-wide fallback for how backend filesystems are instantiated; see
# ``_resolve_fs`` for the recognized values.
default_method = "default"
23
+
24
+
25
def _resolve_fs(url, method=None, protocol=None, storage_options=None):
    """Pick the backend filesystem instance appropriate for ``url``.

    ``method`` selects the instantiation strategy: "default" builds a plain
    instance, "generic" looks one up in the pre-registered ``_generic_fs``
    dict, "current" reuses the most recently created instance, and
    "options" applies per-protocol ``storage_options``.
    """
    chosen = method or default_method
    proto = protocol or split_protocol(url)[0]
    opts = storage_options or {}
    if chosen == "default":
        return filesystem(proto)
    if chosen == "generic":
        return _generic_fs[proto]
    if chosen == "current":
        return get_filesystem_class(proto).current()
    if chosen == "options":
        return url_to_fs(url, **opts.get(proto, {}))[0]
    raise ValueError(f"Unknown FS resolution method: {chosen}")
41
+
42
+
43
def rsync(
    source,
    destination,
    delete_missing=False,
    source_field="size",
    dest_field="size",
    update_cond="different",
    inst_kwargs=None,
    fs=None,
    **kwargs,
):
    """Sync files between two directory trees

    (experimental)

    Parameters
    ----------
    source: str
        Root of the directory tree to take files from. This must be a directory, but
        do not include any terminating "/" character
    destination: str
        Root path to copy into. The contents of this location should be
        identical to the contents of ``source`` when done. This will be made a
        directory, and the terminal "/" should not be included.
    delete_missing: bool
        If there are paths in the destination that don't exist in the
        source and this is True, delete them. Otherwise, leave them alone.
    source_field: str | callable
        If ``update_field`` is "different", this is the key in the info
        of source files to consider for difference. Maybe a function of the
        info dict.
    dest_field: str | callable
        If ``update_field`` is "different", this is the key in the info
        of destination files to consider for difference. May be a function of
        the info dict.
    update_cond: "different"|"always"|"never"
        If "always", every file is copied, regardless of whether it exists in
        the destination. If "never", files that exist in the destination are
        not copied again. If "different" (default), only copy if the info
        fields given by ``source_field`` and ``dest_field`` (usually "size")
        are different. Other comparisons may be added in the future.
    inst_kwargs: dict|None
        If ``fs`` is None, use this set of keyword arguments to make a
        GenericFileSystem instance
    fs: GenericFileSystem|None
        Instance to use if explicitly given. The instance defines how to
        to make downstream file system instances from paths.

    Returns
    -------
    dict of the copy operations that were performed, {source: destination}
    """
    fs = fs or GenericFileSystem(**(inst_kwargs or {}))
    source = fs._strip_protocol(source)
    destination = fs._strip_protocol(destination)
    allfiles = fs.find(source, withdirs=True, detail=True)
    if not fs.isdir(source):
        raise ValueError("Can only rsync on a directory")
    otherfiles = fs.find(destination, withdirs=True, detail=True)
    # directories present in source but absent in destination must be created
    dirs = [
        a
        for a, v in allfiles.items()
        if v["type"] == "directory" and a.replace(source, destination) not in otherfiles
    ]
    logger.debug(f"{len(dirs)} directories to create")
    if dirs:
        fs.make_many_dirs(
            [dirn.replace(source, destination) for dirn in dirs], exist_ok=True
        )
    allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"}
    logger.debug(f"{len(allfiles)} files to consider for copy")
    to_delete = [
        o
        for o, v in otherfiles.items()
        if o.replace(destination, source) not in allfiles and v["type"] == "file"
    ]
    # Rewrite allfiles in-place into a {source_path: destination_path} map,
    # dropping entries that should not be copied. Iterate over a copy since
    # we mutate the dict.
    for k, v in allfiles.copy().items():
        otherfile = k.replace(source, destination)
        if otherfile in otherfiles:
            if update_cond == "always":
                allfiles[k] = otherfile
            elif update_cond == "different":
                inf1 = source_field(v) if callable(source_field) else v[source_field]
                v2 = otherfiles[otherfile]
                inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field]
                if inf1 != inf2:
                    # details mismatch, make copy
                    allfiles[k] = otherfile
                else:
                    # details match, don't copy
                    allfiles.pop(k)
            else:
                # "never" (or unrecognized): destination already has the file,
                # so skip it. Previously the entry was left holding its info
                # dict, which would then be passed to fs.cp() as a destination
                # path.
                allfiles.pop(k)
        else:
            # file not in target yet
            allfiles[k] = otherfile
    logger.debug(f"{len(allfiles)} files to copy")
    if allfiles:
        source_files, target_files = zip(*allfiles.items())
        fs.cp(source_files, target_files, **kwargs)
    logger.debug(f"{len(to_delete)} files to delete")
    if delete_missing and to_delete:
        fs.rm(to_delete)
    return allfiles
145
+
146
+
147
class GenericFileSystem(AsyncFileSystem):
    """Wrapper over all other FS types

    <experimental!>

    This implementation is a single unified interface to be able to run FS operations
    over generic URLs, and dispatch to the specific implementations using the URL
    protocol prefix.

    Note: instances of this FS are always async, even if you never use it with any async
    backend.
    """

    protocol = "generic"  # there is no real reason to ever use a protocol with this FS

    def __init__(self, default_method="default", **kwargs):
        """

        Parameters
        ----------
        default_method: str (optional)
            Defines how to configure backend FS instances. Options are:
            - "default": instantiate like FSClass(), with no
              extra arguments; this is the default instance of that FS, and can be
              configured via the config system
            - "generic": takes instances from the `_generic_fs` dict in this module,
              which you must populate before use. Keys are by protocol
            - "current": takes the most recently instantiated version of each FS
        """
        self.method = default_method
        super().__init__(**kwargs)

    def _parent(self, path):
        # resolve against the concrete backend, then restore the protocol
        # prefix so generic paths stay fully qualified
        fs = _resolve_fs(path, self.method)
        return fs.unstrip_protocol(fs._parent(path))

    def _strip_protocol(self, path):
        # normalization only
        fs = _resolve_fs(path, self.method)
        return fs.unstrip_protocol(fs._strip_protocol(path))

    async def _find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
        fs = _resolve_fs(path, self.method)
        if fs.async_impl:
            out = await fs._find(
                path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
            )
        else:
            out = fs.find(
                path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
            )
        result = {}
        for k, v in out.items():
            v = v.copy()  # don't corrupt target FS dircache
            name = fs.unstrip_protocol(k)
            v["name"] = name
            result[name] = v
        if detail:
            return result
        return list(result)

    async def _info(self, url, **kwargs):
        fs = _resolve_fs(url, self.method)
        if fs.async_impl:
            out = await fs._info(url, **kwargs)
        else:
            out = fs.info(url, **kwargs)
        out = out.copy()  # don't edit originals
        out["name"] = fs.unstrip_protocol(out["name"])
        return out

    async def _ls(
        self,
        url,
        detail=True,
        **kwargs,
    ):
        fs = _resolve_fs(url, self.method)
        if fs.async_impl:
            out = await fs._ls(url, detail=True, **kwargs)
        else:
            out = fs.ls(url, detail=True, **kwargs)
        out = [o.copy() for o in out]  # don't edit originals
        for o in out:
            o["name"] = fs.unstrip_protocol(o["name"])
        if detail:
            return out
        else:
            return [o["name"] for o in out]

    async def _cat_file(
        self,
        url,
        **kwargs,
    ):
        fs = _resolve_fs(url, self.method)
        if fs.async_impl:
            return await fs._cat_file(url, **kwargs)
        else:
            return fs.cat_file(url, **kwargs)

    async def _pipe_file(
        self,
        path,
        value,
        **kwargs,
    ):
        fs = _resolve_fs(path, self.method)
        if fs.async_impl:
            return await fs._pipe_file(path, value, **kwargs)
        else:
            return fs.pipe_file(path, value, **kwargs)

    async def _rm(self, url, **kwargs):
        # accept a single URL or a list; all are assumed to share a protocol
        urls = url
        if isinstance(urls, str):
            urls = [urls]
        fs = _resolve_fs(urls[0], self.method)
        if fs.async_impl:
            await fs._rm(urls, **kwargs)
        else:
            fs.rm(url, **kwargs)

    async def _makedirs(self, path, exist_ok=False):
        logger.debug("Make dir %s", path)
        fs = _resolve_fs(path, self.method)
        if fs.async_impl:
            await fs._makedirs(path, exist_ok=exist_ok)
        else:
            fs.makedirs(path, exist_ok=exist_ok)

    def rsync(self, source, destination, **kwargs):
        """Sync files between two directory trees

        Returns the {source: destination} mapping of copies performed
        (previously this wrapper discarded it). See `func:rsync` for more
        details.
        """
        return rsync(source, destination, fs=self, **kwargs)

    async def _cp_file(
        self,
        url,
        url2,
        blocksize=2**20,
        callback=DEFAULT_CALLBACK,
        **kwargs,
    ):
        fs = _resolve_fs(url, self.method)
        fs2 = _resolve_fs(url2, self.method)
        if fs is fs2:
            # pure remote
            if fs.async_impl:
                return await fs._cp_file(url, url2, **kwargs)
            else:
                return fs.cp_file(url, url2, **kwargs)
        # cross-backend copy: stream in blocks from one FS to the other
        kw = {"blocksize": 0, "cache_type": "none"}
        try:
            f1 = (
                await fs.open_async(url, "rb")
                if hasattr(fs, "open_async")
                else fs.open(url, "rb", **kw)
            )
            callback.set_size(await maybe_await(f1.size))
            f2 = (
                await fs2.open_async(url2, "wb")
                if hasattr(fs2, "open_async")
                else fs2.open(url2, "wb", **kw)
            )
            while f1.size is None or f2.tell() < f1.size:
                data = await maybe_await(f1.read(blocksize))
                if f1.size is None and not data:
                    break
                await maybe_await(f2.write(data))
                callback.absolute_update(f2.tell())
        finally:
            try:
                await maybe_await(f2.close())
                await maybe_await(f1.close())
            except NameError:
                # fail while opening f1 or f2
                pass

    async def _make_many_dirs(self, urls, exist_ok=True):
        fs = _resolve_fs(urls[0], self.method)
        if fs.async_impl:
            coros = [fs._makedirs(u, exist_ok=exist_ok) for u in urls]
            await _run_coros_in_chunks(coros)
        else:
            for u in urls:
                fs.makedirs(u, exist_ok=exist_ok)

    make_many_dirs = sync_wrapper(_make_many_dirs)

    async def _copy(
        self,
        path1: list[str],
        path2: list[str],
        recursive: bool = False,
        on_error: str = "ignore",
        maxdepth: Optional[int] = None,
        batch_size: Optional[int] = None,
        tempdir: Optional[str] = None,
        **kwargs,
    ):
        if recursive:
            raise NotImplementedError
        fs = _resolve_fs(path1[0], self.method)
        fs2 = _resolve_fs(path2[0], self.method)
        # not expanding paths atm., assume call is from rsync()
        if fs is fs2:
            # pure remote
            if fs.async_impl:
                return await fs._copy(path1, path2, **kwargs)
            else:
                return fs.copy(path1, path2, **kwargs)
        # cross-backend: stage each file through a local temp directory
        await copy_file_op(
            fs, path1, fs2, path2, tempdir, batch_size, on_error=on_error
        )
364
+
365
+
366
async def copy_file_op(
    fs1, url1, fs2, url2, tempdir=None, batch_size=20, on_error="ignore"
):
    """Copy many files between two filesystems via a local staging directory.

    ``url1`` and ``url2`` are parallel sequences of source and destination
    URLs; each file is downloaded to a unique temp path on ``fs1`` and then
    uploaded to ``fs2``. The staging directory is removed afterwards, even
    on failure.
    """
    import tempfile

    staging = tempdir or tempfile.mkdtemp()
    try:
        jobs = []
        for src, dst in zip(url1, url2):
            local = os.path.join(staging, uuid.uuid4().hex)
            jobs.append(_copy_file_op(fs1, src, fs2, dst, local, on_error=on_error))
        await _run_coros_in_chunks(jobs, batch_size=batch_size)
    finally:
        shutil.rmtree(staging)
387
+
388
+
389
async def _copy_file_op(fs1, url1, fs2, url2, local, on_error="ignore"):
    """Copy one file fs1:url1 -> fs2:url2 through the local path ``local``."""
    # With on_error="raise", ``except ()`` matches nothing and errors
    # propagate; otherwise any Exception is logged and swallowed.
    ex = () if on_error == "raise" else Exception
    logger.debug("Copy %s -> %s", url1, url2)
    try:
        if fs1.async_impl:
            await fs1._get_file(url1, local)
        else:
            fs1.get_file(url1, local)
        if fs2.async_impl:
            await fs2._put_file(local, url2)
        else:
            fs2.put_file(local, url2)
        # remove the staged copy as soon as the upload succeeds
        os.unlink(local)
        logger.debug("Copy %s -> %s; done", url1, url2)
    except ex as e:
        logger.debug("ignoring cp exception for %s: %s", url1, e)
405
+
406
+
407
async def maybe_await(cor):
    """Await ``cor`` if it is a coroutine, otherwise return it unchanged.

    Lets calling code treat results from sync and async file objects
    uniformly.
    """
    if not inspect.iscoroutine(cor):
        return cor
    return await cor
.venv/lib/python3.11/site-packages/fsspec/gui.py ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import contextlib
3
+ import logging
4
+ import os
5
+ import re
6
+ from typing import ClassVar, Sequence
7
+
8
+ import panel as pn
9
+
10
+ from .core import OpenFile, get_filesystem_class, split_protocol
11
+ from .registry import known_implementations
12
+
13
+ pn.extension()
14
+ logger = logging.getLogger("fsspec.gui")
15
+
16
+
17
class SigSlot:
    """Signal-slot mixin, for Panel event passing

    Include this class in a widget manager's superclasses to be able to
    register events and callbacks on Panel widgets managed by that class.

    The method ``_register`` should be called as widgets are added, and external
    code should call ``connect`` to associate callbacks.

    By default, all signals emit a DEBUG logging statement.
    """

    # names of signals that this class may emit each of which must be
    # set by _register for any new instance
    signals: ClassVar[Sequence[str]] = []
    # names of actions that this class may respond to
    slots: ClassVar[Sequence[str]] = []

    # each of which must be a method name

    def __init__(self):
        # when True, _signal() drops widget events (see ignore_events())
        self._ignoring_events = False
        # signal name -> {"widget", "callbacks", "thing", "log"}
        self._sigs = {}
        # "<widget name>-<attribute>" -> signal name, used by _signal()
        self._map = {}
        self._setup()

    def _setup(self):
        """Create GUI elements and register signals"""
        self.panel = pn.pane.PaneBase()
        # no signals to set up in the base class

    def _register(
        self, widget, name, thing="value", log_level=logging.DEBUG, auto=False
    ):
        """Watch the given attribute of a widget and assign it a named event

        This is normally called at the time a widget is instantiated, in the
        class which owns it.

        Parameters
        ----------
        widget : pn.layout.Panel or None
            Widget to watch. If None, an anonymous signal not associated with
            any widget.
        name : str
            Name of this event
        thing : str
            Attribute of the given widget to watch
        log_level : int
            When the signal is triggered, a logging event of the given level
            will be fired in the dfviz logger.
        auto : bool
            If True, automatically connects with a method in this class of the
            same name.
        """
        if name not in self.signals:
            raise ValueError(f"Attempt to assign an undeclared signal: {name}")
        self._sigs[name] = {
            "widget": widget,
            "callbacks": [],
            "thing": thing,
            "log": log_level,
        }
        # key that _signal() will reconstruct from the incoming event
        wn = "-".join(
            [
                getattr(widget, "name", str(widget)) if widget is not None else "none",
                thing,
            ]
        )
        self._map[wn] = name
        if widget is not None:
            widget.param.watch(self._signal, thing, onlychanged=True)
        if auto and hasattr(self, name):
            self.connect(name, getattr(self, name))

    def _repr_mimebundle_(self, *args, **kwargs):
        """Display in a notebook or a server"""
        try:
            return self.panel._repr_mimebundle_(*args, **kwargs)
        except (ValueError, AttributeError) as exc:
            raise NotImplementedError(
                "Panel does not seem to be set up properly"
            ) from exc

    def connect(self, signal, slot):
        """Associate callback with given event

        The callback must be a function which takes the "new" value of the
        watched attribute as the only parameter. If the callback return False,
        this cancels any further processing of the given event.

        Alternatively, the callback can be a string, in which case it means
        emitting the correspondingly-named event (i.e., connect to self)
        """
        self._sigs[signal]["callbacks"].append(slot)

    def _signal(self, event):
        """This is called by an action on a widget

        Within an self.ignore_events context, nothing happens.

        Tests can execute this method by directly changing the values of
        widget components.
        """
        if not self._ignoring_events:
            wn = "-".join([event.obj.name, event.name])
            if wn in self._map and self._map[wn] in self._sigs:
                self._emit(self._map[wn], event.new)

    @contextlib.contextmanager
    def ignore_events(self):
        """Temporarily turn off events processing in this instance

        (does not propagate to children)
        """
        self._ignoring_events = True
        try:
            yield
        finally:
            self._ignoring_events = False

    def _emit(self, sig, value=None):
        """An event happened, call its callbacks

        This method can be used in tests to simulate message passing without
        directly changing visual elements.

        Calling of callbacks will halt whenever one returns False.
        """
        logger.log(self._sigs[sig]["log"], f"{sig}: {value}")
        for callback in self._sigs[sig]["callbacks"]:
            if isinstance(callback, str):
                # a string callback chains to the signal of that name
                self._emit(callback)
            else:
                try:
                    # running callbacks should not break the interface
                    ret = callback(value)
                    if ret is False:
                        break
                except Exception as e:
                    logger.exception(
                        "Exception (%s) while executing callback for signal: %s",
                        e,
                        sig,
                    )

    def show(self, threads=False):
        """Open a new browser tab and display this instance's interface"""
        self.panel.show(threads=threads, verbose=False)
        return self
+
168
+
169
class SingleSelect(SigSlot):
    """A multiselect which only allows you to select one item for an event"""

    signals = ["_selected", "selected"]  # the first is internal
    slots = ["set_options", "set_selection", "add", "clear", "select"]

    def __init__(self, **kwargs):
        # kwargs are forwarded verbatim to the underlying MultiSelect widget
        self.kwargs = kwargs
        super().__init__()

    def _setup(self):
        self.panel = pn.widgets.MultiSelect(**self.kwargs)
        self._register(self.panel, "_selected", "value")
        # anonymous signal, emitted by select_one() after pruning the list
        self._register(None, "selected")
        self.connect("_selected", self.select_one)

    def _signal(self, *args, **kwargs):
        # NOTE(review): pass-through override of the base implementation;
        # confirm whether anything relies on it before removing.
        super()._signal(*args, **kwargs)

    def select_one(self, *_):
        # collapse the multi-selection to its most recent item (without
        # re-triggering _selected), then announce the result
        with self.ignore_events():
            val = [self.panel.value[-1]] if self.panel.value else []
            self.panel.value = val
        self._emit("selected", self.panel.value)

    def set_options(self, options):
        self.panel.options = options

    def clear(self):
        self.panel.options = []

    @property
    def value(self):
        # the widget's current (at most single-element) selection list
        return self.panel.value

    def set_selection(self, selection):
        self.panel.value = [selection]
+
207
+
208
class FileSelector(SigSlot):
    """Panel-based graphical file selector widget

    Instances of this widget are interactive and can be displayed in jupyter by having
    them as the output of a cell, or in a separate browser tab using ``.show()``.
    """

    signals = [
        "protocol_changed",
        "selection_changed",
        "directory_entered",
        "home_clicked",
        "up_clicked",
        "go_clicked",
        "filters_changed",
    ]
    # NOTE(review): "go_home" is listed as a slot but no method of that name
    # is defined here (the handler is ``home_clicked``) — confirm intended.
    slots = ["set_filters", "go_home"]

    def __init__(self, url=None, filters=None, ignore=None, kwargs=None):
        """

        Parameters
        ----------
        url : str (optional)
            Initial value of the URL to populate the dialog; should include protocol
        filters : list(str) (optional)
            File endings to include in the listings. If not included, all files are
            allowed. Does not affect directories.
            If given, the endings will appear as checkboxes in the interface
        ignore : list(str) (optional)
            Regex(s) of file basename patterns to ignore, e.g., "\\." for typical
            hidden files on posix
        kwargs : dict (optional)
            To pass to file system instance
        """
        if url:
            # separate "protocol://" prefix from the path portion
            self.init_protocol, url = split_protocol(url)
        else:
            # no URL given: default to the local filesystem, starting at cwd
            self.init_protocol, url = "file", os.getcwd()
        self.init_url = url
        # kwargs are kept as their string repr for display in a text box;
        # NOTE(review): a None kwargs produces the string "None" (truthy), not
        # "{}" — storage_options later maps it back to {} via ``or {}``.
        self.init_kwargs = (kwargs if isinstance(kwargs, str) else str(kwargs)) or "{}"
        self.filters = filters
        self.ignore = [re.compile(i) for i in ignore or []]
        self._fs = None  # lazily constructed in the ``fs`` property
        super().__init__()

    def _setup(self):
        """Create all widgets, register signals and render the initial listing."""
        self.url = pn.widgets.TextInput(
            name="url",
            value=self.init_url,
            align="end",
            sizing_mode="stretch_width",
            width_policy="max",
        )
        self.protocol = pn.widgets.Select(
            options=sorted(known_implementations),
            value=self.init_protocol,
            name="protocol",
            align="center",
        )
        self.kwargs = pn.widgets.TextInput(
            name="kwargs", value=self.init_kwargs, align="center"
        )
        self.go = pn.widgets.Button(name="⇨", align="end", width=45)
        self.main = SingleSelect(size=10)
        self.home = pn.widgets.Button(name="🏠", width=40, height=30, align="end")
        self.up = pn.widgets.Button(name="‹", width=30, height=30, align="end")

        # auto=True wires each widget event to the same-named method below
        self._register(self.protocol, "protocol_changed", auto=True)
        self._register(self.go, "go_clicked", "clicks", auto=True)
        self._register(self.up, "up_clicked", "clicks", auto=True)
        self._register(self.home, "home_clicked", "clicks", auto=True)
        self._register(None, "selection_changed")
        self.main.connect("selected", self.selection_changed)
        self._register(None, "directory_entered")
        # remember the protocol/kwargs used to build the current fs instance,
        # so go_clicked can tell when the instance must be recreated
        self.prev_protocol = self.protocol.value
        self.prev_kwargs = self.storage_options

        self.filter_sel = pn.widgets.CheckBoxGroup(
            value=[], options=[], inline=False, align="end", width_policy="min"
        )
        self._register(self.filter_sel, "filters_changed", auto=True)

        self.panel = pn.Column(
            pn.Row(self.protocol, self.kwargs),
            pn.Row(self.home, self.up, self.url, self.go, self.filter_sel),
            self.main.panel,
        )
        self.set_filters(self.filters)
        self.go_clicked()  # populate the initial listing

    def set_filters(self, filters=None):
        """Set (or clear, with None) the file-ending filter checkboxes."""
        self.filters = filters
        if filters:
            self.filter_sel.options = filters
            self.filter_sel.value = filters
        else:
            self.filter_sel.options = []
            self.filter_sel.value = []

    @property
    def storage_options(self):
        """Value of the kwargs box as a dictionary"""
        # literal_eval keeps this safe against arbitrary code in the text box
        return ast.literal_eval(self.kwargs.value) or {}

    @property
    def fs(self):
        """Current filesystem instance"""
        # built lazily; invalidated by setting self._fs = None
        if self._fs is None:
            cls = get_filesystem_class(self.protocol.value)
            self._fs = cls(**self.storage_options)
        return self._fs

    @property
    def urlpath(self):
        """URL of currently selected item"""
        return (
            (f"{self.protocol.value}://{self.main.value[0]}")
            if self.main.value
            else None
        )

    def open_file(self, mode="rb", compression=None, encoding=None):
        """Create OpenFile instance for the currently selected item

        For example, in a notebook you might do something like

        .. code-block::

            [ ]: sel = FileSelector(); sel

            # user selects their file

            [ ]: with sel.open_file('rb') as f:
            ...      out = f.read()

        Parameters
        ----------
        mode: str (optional)
            Open mode for the file.
        compression: str (optional)
            The interact with the file as compressed. Set to 'infer' to guess
            compression from the file ending
        encoding: str (optional)
            If using text mode, use this encoding; defaults to UTF8.

        Raises
        ------
        ValueError
            If nothing is currently selected.
        """
        if self.urlpath is None:
            raise ValueError("No file selected")
        return OpenFile(self.fs, self.urlpath, mode, compression, encoding)

    def filters_changed(self, values):
        # checkbox state changed: re-filter and redraw the listing
        self.filters = values
        self.go_clicked()

    def selection_changed(self, *_):
        # if the user selected a directory, descend into it
        if self.urlpath is None:
            return
        if self.fs.isdir(self.urlpath):
            self.url.value = self.fs._strip_protocol(self.urlpath)
            self.go_clicked()

    def go_clicked(self, *_):
        """Refresh the listing for the current URL, rebuilding fs if needed."""
        if (
            self.prev_protocol != self.protocol.value
            or self.prev_kwargs != self.storage_options
        ):
            self._fs = None  # causes fs to be recreated
            self.prev_protocol = self.protocol.value
            self.prev_kwargs = self.storage_options
        listing = sorted(
            self.fs.ls(self.url.value, detail=True), key=lambda x: x["name"]
        )
        # drop entries whose basename matches any of the ignore regexes
        listing = [
            l
            for l in listing
            if not any(i.match(l["name"].rsplit("/", 1)[-1]) for i in self.ignore)
        ]
        # display label (with icon prefix) -> full path
        folders = {
            "📁 " + o["name"].rsplit("/", 1)[-1]: o["name"]
            for o in listing
            if o["type"] == "directory"
        }
        files = {
            "📄 " + o["name"].rsplit("/", 1)[-1]: o["name"]
            for o in listing
            if o["type"] == "file"
        }
        if self.filters:
            # file-ending filters apply to files only, never to directories
            files = {
                k: v
                for k, v in files.items()
                if any(v.endswith(ext) for ext in self.filters)
            }
        self.main.set_options(dict(**folders, **files))

    def protocol_changed(self, *_):
        # switching protocol invalidates the fs and clears the view
        self._fs = None
        self.main.options = []
        self.url.value = ""

    def home_clicked(self, *_):
        # restore the constructor-time protocol/kwargs/url and refresh
        self.protocol.value = self.init_protocol
        self.kwargs.value = self.init_kwargs
        self.url.value = self.init_url
        self.go_clicked()

    def up_clicked(self, *_):
        # navigate to the parent directory and refresh
        self.url.value = self.fs._parent(self.url.value)
        self.go_clicked()
.venv/lib/python3.11/site-packages/fsspec/implementations/arrow.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import errno
2
+ import io
3
+ import os
4
+ import secrets
5
+ import shutil
6
+ from contextlib import suppress
7
+ from functools import cached_property, wraps
8
+ from urllib.parse import parse_qs
9
+
10
+ from fsspec.spec import AbstractFileSystem
11
+ from fsspec.utils import (
12
+ get_package_version_without_import,
13
+ infer_storage_options,
14
+ mirror_from,
15
+ tokenize,
16
+ )
17
+
18
+
19
def wrap_exceptions(func):
    """Decorator translating pyarrow "does not exist" OSErrors.

    pyarrow reports missing paths as generic ``OSError`` with a message
    containing "does not exist"; fsspec callers expect ``FileNotFoundError``.
    Any other exception (or an OSError without a matching string message)
    propagates unchanged.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except OSError as exc:
            exc_args = exc.args
            if exc_args:
                first = exc_args[0]
                if isinstance(first, str) and "does not exist" in first:
                    raise FileNotFoundError(errno.ENOENT, first) from exc
            # empty args, non-string message, or unrelated OSError: re-raise
            raise

    return wrapper
35
+
36
+
37
+ PYARROW_VERSION = None
38
+
39
+
40
class ArrowFSWrapper(AbstractFileSystem):
    """FSSpec-compatible wrapper of pyarrow.fs.FileSystem.

    Parameters
    ----------
    fs : pyarrow.fs.FileSystem
        Instantiated pyarrow filesystem that all operations are delegated to.
    """

    root_marker = "/"

    def __init__(self, fs, **kwargs):
        # Record the installed pyarrow version (as a string, or None) without
        # importing the package; used to pick call signatures in _open.
        global PYARROW_VERSION
        PYARROW_VERSION = get_package_version_without_import("pyarrow")
        self.fs = fs
        super().__init__(**kwargs)

    @property
    def protocol(self):
        # e.g. "hdfs", "s3", "local", depending on the wrapped filesystem
        return self.fs.type_name

    @cached_property
    def fsid(self):
        return "hdfs_" + tokenize(self.fs.host, self.fs.port)

    @classmethod
    def _strip_protocol(cls, path):
        """Remove any "protocol://" prefix, returning the bare path."""
        ops = infer_storage_options(path)
        path = ops["path"]
        if path.startswith("//"):
            # special case for "hdfs://path" (without the triple slash)
            path = path[1:]
        return path

    def ls(self, path, detail=False, **kwargs):
        """List a directory; with ``detail``, return info dicts per entry."""
        path = self._strip_protocol(path)
        from pyarrow.fs import FileSelector

        entries = [
            self._make_entry(entry)
            for entry in self.fs.get_file_info(FileSelector(path))
        ]
        if detail:
            return entries
        return [entry["name"] for entry in entries]

    def info(self, path, **kwargs):
        """Return the info dict for a single path.

        Raises
        ------
        FileNotFoundError
            If the path does not exist (via ``_make_entry``).
        """
        path = self._strip_protocol(path)
        [info] = self.fs.get_file_info([path])
        return self._make_entry(info)

    def exists(self, path):
        """Whether the path exists, per ``info``."""
        path = self._strip_protocol(path)
        try:
            self.info(path)
        except FileNotFoundError:
            return False
        return True

    def _make_entry(self, info):
        """Convert a pyarrow ``FileInfo`` into an fsspec-style info dict."""
        from pyarrow.fs import FileType

        if info.type is FileType.Directory:
            kind = "directory"
        elif info.type is FileType.File:
            kind = "file"
        elif info.type is FileType.NotFound:
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
        else:
            kind = "other"

        return {
            "name": info.path,
            "size": info.size,
            "type": kind,
            "mtime": info.mtime,
        }

    @wrap_exceptions
    def cp_file(self, path1, path2, **kwargs):
        """Copy path1 to path2 atomically.

        Writes to a random temporary name first and renames into place, so
        a failed copy never leaves a partially written destination; the
        temporary file is removed on any failure.
        """
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")

        with self._open(path1, "rb") as lstream:
            tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
            try:
                with self.open(tmp_fname, "wb") as rstream:
                    shutil.copyfileobj(lstream, rstream)
                self.fs.move(tmp_fname, path2)
            except BaseException:
                # clean up the temporary file, then re-raise the original error
                with suppress(FileNotFoundError):
                    self.fs.delete_file(tmp_fname)
                raise

    @wrap_exceptions
    def mv(self, path1, path2, **kwargs):
        """Move/rename path1 to path2."""
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")
        self.fs.move(path1, path2)

    @wrap_exceptions
    def rm_file(self, path):
        """Delete a single file."""
        path = self._strip_protocol(path)
        self.fs.delete_file(path)

    @wrap_exceptions
    def rm(self, path, recursive=False, maxdepth=None):
        """Delete a file, or a directory tree when ``recursive=True``."""
        path = self._strip_protocol(path).rstrip("/")
        if self.isdir(path):
            if recursive:
                self.fs.delete_dir(path)
            else:
                # FIX: the message previously said "without recursive=False",
                # which contradicts itself; recursive=True is what is required.
                raise ValueError("Can't delete directories without recursive=True")
        else:
            self.fs.delete_file(path)

    @wrap_exceptions
    def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
        """Open a pyarrow stream and wrap it as an ``ArrowFile``.

        ``seekable=False`` requests a plain input stream, which can be
        cheaper than a random-access file for straight-through reads.
        """
        if mode == "rb":
            if seekable:
                method = self.fs.open_input_file
            else:
                method = self.fs.open_input_stream
        elif mode == "wb":
            method = self.fs.open_output_stream
        elif mode == "ab":
            method = self.fs.open_append_stream
        else:
            raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")

        _kwargs = {}
        if mode != "rb" or not seekable:
            # pyarrow >= 4 would otherwise auto-detect compression from the
            # file ending on stream-style opens; disable that.
            if int(PYARROW_VERSION.split(".")[0]) >= 4:
                _kwargs["compression"] = None
        stream = method(path, **_kwargs)

        return ArrowFile(self, stream, path, mode, block_size, **kwargs)

    @wrap_exceptions
    def mkdir(self, path, create_parents=True, **kwargs):
        """Create a directory, optionally with all missing parents."""
        path = self._strip_protocol(path)
        if create_parents:
            self.makedirs(path, exist_ok=True)
        else:
            self.fs.create_dir(path, recursive=False)

    @wrap_exceptions
    def makedirs(self, path, exist_ok=False):
        """Create a directory and any missing parents."""
        path = self._strip_protocol(path)
        self.fs.create_dir(path, recursive=True)

    @wrap_exceptions
    def rmdir(self, path):
        """Delete a directory (and its contents, per pyarrow semantics)."""
        path = self._strip_protocol(path)
        self.fs.delete_dir(path)

    @wrap_exceptions
    def modified(self, path):
        """Return the last-modified time of the path."""
        path = self._strip_protocol(path)
        return self.fs.get_file_info(path).mtime

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Read bytes from a file, optionally a [start, end) range."""
        # a read from the beginning does not need random access
        kwargs["seekable"] = start not in [None, 0]
        # FIX: forward the requested range; previously start/end were replaced
        # with None, so ranged reads silently returned the whole file.
        return super().cat_file(path, start=start, end=end, **kwargs)

    def get_file(self, rpath, lpath, **kwargs):
        """Download a remote file to a local path (streaming read)."""
        kwargs["seekable"] = False
        super().get_file(rpath, lpath, **kwargs)
211
+
212
+
213
@mirror_from(
    "stream",
    [
        "read",
        "seek",
        "tell",
        "write",
        "readable",
        "writable",
        "close",
        "size",
        "seekable",
    ],
)
class ArrowFile(io.IOBase):
    """Thin file-like wrapper around a pyarrow stream.

    All I/O methods listed in the decorator are mirrored straight from
    ``self.stream``; this class only records bookkeeping attributes and
    provides context-manager support.
    """

    def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
        # bookkeeping only; the decorator handles actual I/O dispatch
        self.fs = fs
        self.stream = stream
        self.path = path
        self.mode = mode
        self.kwargs = kwargs
        # both spellings kept for compatibility with fsspec conventions
        self.blocksize = self.block_size = block_size

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        return self.close()
243
+
244
+
245
class HadoopFileSystem(ArrowFSWrapper):
    """A wrapper on top of the pyarrow.fs.HadoopFileSystem
    to connect its interface with fsspec"""

    protocol = "hdfs"

    def __init__(
        self,
        host="default",
        port=0,
        user=None,
        kerb_ticket=None,
        replication=3,
        extra_conf=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        replication: int
            set replication factor of file for write operations. default value is 3.
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        # import here so pyarrow is only required when HDFS is actually used
        from pyarrow.fs import HadoopFileSystem

        fs = HadoopFileSystem(
            host=host,
            port=port,
            user=user,
            kerb_ticket=kerb_ticket,
            replication=replication,
            extra_conf=extra_conf,
        )
        super().__init__(fs=fs, **kwargs)

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Extract constructor kwargs (host/user/port/replication) from a URL.

        E.g. ``hdfs://user@host:8020/path?replication=2`` yields
        ``{"host": ..., "user": ..., "port": ..., "replication": 2}``;
        absent components are simply omitted from the result.
        """
        ops = infer_storage_options(path)
        out = {}
        if ops.get("host", None):
            out["host"] = ops["host"]
        if ops.get("username", None):
            out["user"] = ops["username"]
        if ops.get("port", None):
            out["port"] = ops["port"]
        if ops.get("url_query", None):
            # only the "replication" query parameter is recognised
            queries = parse_qs(ops["url_query"])
            if queries.get("replication", None):
                out["replication"] = int(queries["replication"][0])
        return out
.venv/lib/python3.11/site-packages/fsspec/implementations/dask.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dask
2
+ from distributed.client import Client, _get_global_client
3
+ from distributed.worker import Worker
4
+
5
+ from fsspec import filesystem
6
+ from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
7
+ from fsspec.utils import infer_storage_options
8
+
9
+
10
def _get_client(client):
    """Normalize *client* to a ``distributed.Client`` instance.

    ``None`` falls back to the currently registered global client (may also
    be ``None``); an existing ``Client`` is returned unchanged; anything
    else (e.g. a scheduler address string) is used to construct a new one.
    """
    if client is None:
        return _get_global_client()
    if isinstance(client, Client):
        return client
    # e.g., connection string
    return Client(client)
18
+
19
+
20
def _in_worker():
    """True when executing inside a dask distributed Worker process."""
    # any live Worker instance means we are worker-side
    return len(Worker._instances) > 0
22
+
23
+
24
class DaskWorkerFileSystem(AbstractFileSystem):
    """View files accessible to a worker as any other remote file-system

    When instances are run on the worker, uses the real filesystem. When
    run on the client, they call the worker to provide information or data.

    **Warning** this implementation is experimental, and read-only for now.
    """

    def __init__(
        self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
    ):
        # Exactly one of fs / target_protocol must be given (XOR check):
        # either a ready-made filesystem instance, or the spec to build one.
        super().__init__(**kwargs)
        if not (fs is None) ^ (target_protocol is None):
            raise ValueError(
                "Please provide one of filesystem instance (fs) or"
                " target_protocol, not both"
            )
        self.target_protocol = target_protocol
        self.target_options = target_options
        self.worker = None  # set by _determine_worker: True/False
        self.client = client
        self.fs = fs
        self._determine_worker()

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Derive a scheduler address ("host:port") from the URL, if present."""
        so = infer_storage_options(path)
        if "host" in so and "port" in so:
            return {"client": f"{so['host']}:{so['port']}"}
        else:
            return {}

    def _determine_worker(self):
        """Decide worker vs client mode and set up delegation accordingly.

        Worker side: use (or build) the real filesystem directly.
        Client side: wrap self in dask.delayed so each call executes remotely.
        """
        if _in_worker():
            self.worker = True
            if self.fs is None:
                self.fs = filesystem(
                    self.target_protocol, **(self.target_options or {})
                )
        else:
            self.worker = False
            self.client = _get_client(self.client)
            # rfs: a delayed proxy; calling rfs.<op>(...).compute() runs the
            # operation on a worker, where _in_worker() takes the other branch
            self.rfs = dask.delayed(self)

    def mkdir(self, *args, **kwargs):
        # local on worker, remote (delayed + compute) on client
        if self.worker:
            self.fs.mkdir(*args, **kwargs)
        else:
            self.rfs.mkdir(*args, **kwargs).compute()

    def rm(self, *args, **kwargs):
        if self.worker:
            self.fs.rm(*args, **kwargs)
        else:
            self.rfs.rm(*args, **kwargs).compute()

    def copy(self, *args, **kwargs):
        if self.worker:
            self.fs.copy(*args, **kwargs)
        else:
            self.rfs.copy(*args, **kwargs).compute()

    def mv(self, *args, **kwargs):
        if self.worker:
            self.fs.mv(*args, **kwargs)
        else:
            self.rfs.mv(*args, **kwargs).compute()

    def ls(self, *args, **kwargs):
        if self.worker:
            return self.fs.ls(*args, **kwargs)
        else:
            return self.rfs.ls(*args, **kwargs).compute()

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        # Worker: open the real file. Client: return a DaskFile proxy whose
        # reads are served via fetch_range round-trips to the worker.
        if self.worker:
            return self.fs._open(
                path,
                mode=mode,
                block_size=block_size,
                autocommit=autocommit,
                cache_options=cache_options,
                **kwargs,
            )
        else:
            return DaskFile(
                fs=self,
                path=path,
                mode=mode,
                block_size=block_size,
                autocommit=autocommit,
                cache_options=cache_options,
                **kwargs,
            )

    def fetch_range(self, path, mode, start, end):
        """Read bytes [start, end) of *path*, locally or via the cluster."""
        if self.worker:
            with self._open(path, mode) as f:
                f.seek(start)
                return f.read(end - start)
        else:
            return self.rfs.fetch_range(path, mode, start, end).compute()
135
+
136
+
137
class DaskFile(AbstractBufferedFile):
    """Client-side, read-only file proxy for ``DaskWorkerFileSystem``.

    Byte ranges are fetched on demand through ``fs.fetch_range``, which
    executes the read on a dask worker; the upload hooks are no-ops since
    writing is not supported.
    """

    def __init__(self, mode="rb", **kwargs):
        # guard clause: this proxy is strictly read-only
        if mode != "rb":
            raise ValueError('Remote dask files can only be opened in "rb" mode')
        super().__init__(**kwargs)

    def _upload_chunk(self, final=False):
        # read-only: nothing to upload
        pass

    def _initiate_upload(self):
        """Create remote file/upload"""
        # read-only: nothing to initiate
        pass

    def _fetch_range(self, start, end):
        """Get the specified set of bytes from remote"""
        return self.fs.fetch_range(self.path, self.mode, start, end)
.venv/lib/python3.11/site-packages/fsspec/implementations/dbfs.py ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import urllib
3
+
4
+ import requests
5
+ import requests.exceptions
6
+ from requests.adapters import HTTPAdapter, Retry
7
+
8
+ from fsspec import AbstractFileSystem
9
+ from fsspec.spec import AbstractBufferedFile
10
+
11
+
12
class DatabricksException(Exception):
    """
    Helper class for exceptions raised in this module.

    Carries the Databricks REST API error code alongside the human-readable
    message, so callers can dispatch on ``error_code``.
    """

    def __init__(self, error_code, message):
        """Create a new DatabricksException"""
        super().__init__(message)
        self.message = message
        self.error_code = error_code
23
+
24
+
25
class DatabricksFileSystem(AbstractFileSystem):
    """
    Get access to the Databricks filesystem implementation over HTTP.
    Can be used inside and outside of a databricks cluster.
    """

    def __init__(self, instance, token, **kwargs):
        """
        Create a new DatabricksFileSystem.

        Parameters
        ----------
        instance: str
            The instance URL of the databricks cluster.
            For example for an Azure databricks cluster, this
            has the form adb-<some-number>.<two digits>.azuredatabricks.net.
        token: str
            Your personal token. Find out more
            here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
        """
        self.instance = instance
        self.token = token
        # one session for all requests: connection pooling + shared auth header
        self.session = requests.Session()
        # retry transient server/throttling errors with a short backoff
        self.retries = Retry(
            total=10,
            backoff_factor=0.05,
            status_forcelist=[408, 429, 500, 502, 503, 504],
        )

        self.session.mount("https://", HTTPAdapter(max_retries=self.retries))
        self.session.headers.update({"Authorization": f"Bearer {self.token}"})

        super().__init__(**kwargs)

    def ls(self, path, detail=True, **kwargs):
        """
        List the contents of the given path.

        Parameters
        ----------
        path: str
            Absolute path
        detail: bool
            Return not only the list of filenames,
            but also additional information on file sizes
            and types.
        """
        # serve from the dircache when possible; populate it otherwise
        out = self._ls_from_cache(path)
        if not out:
            try:
                r = self._send_to_api(
                    method="get", endpoint="list", json={"path": path}
                )
            except DatabricksException as e:
                if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                    raise FileNotFoundError(e.message) from e

                raise
            files = r["files"]
            out = [
                {
                    "name": o["path"],
                    "type": "directory" if o["is_dir"] else "file",
                    "size": o["file_size"],
                }
                for o in files
            ]
            self.dircache[path] = out

        if detail:
            return out
        return [o["name"] for o in out]

    def makedirs(self, path, exist_ok=True):
        """
        Create a given absolute path and all of its parents.

        Parameters
        ----------
        path: str
            Absolute path to create
        exist_ok: bool
            If false, checks if the folder
            exists before creating it (and raises an
            Exception if this is the case)
        """
        if not exist_ok:
            try:
                # If the following succeeds, the path is already present
                self._send_to_api(
                    method="get", endpoint="get-status", json={"path": path}
                )
                raise FileExistsError(f"Path {path} already exists")
            except DatabricksException as e:
                # NOTE(review): error codes other than RESOURCE_DOES_NOT_EXIST
                # are silently swallowed here (no re-raise) — confirm intended.
                if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                    pass

        try:
            self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
        except DatabricksException as e:
            if e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
        # parent listing is now stale
        self.invalidate_cache(self._parent(path))

    def mkdir(self, path, create_parents=True, **kwargs):
        """
        Create a given absolute path and all of its parents.

        Parameters
        ----------
        path: str
            Absolute path to create
        create_parents: bool
            Whether to create all parents or not.
            "False" is not implemented so far.
        """
        if not create_parents:
            raise NotImplementedError

        # mkdirs is presumably the AbstractFileSystem alias for makedirs
        self.mkdirs(path, **kwargs)

    def rm(self, path, recursive=False, **kwargs):
        """
        Remove the file or folder at the given absolute path.

        Parameters
        ----------
        path: str
            Absolute path what to remove
        recursive: bool
            Recursively delete all files in a folder.
        """
        try:
            self._send_to_api(
                method="post",
                endpoint="delete",
                json={"path": path, "recursive": recursive},
            )
        except DatabricksException as e:
            # This is not really an exception, it just means
            # not everything was deleted so far
            if e.error_code == "PARTIAL_DELETE":
                self.rm(path=path, recursive=recursive)
            elif e.error_code == "IO_ERROR":
                # Using the same exception as the os module would use here
                raise OSError(e.message) from e

            # NOTE(review): this re-raise also fires after a successful
            # PARTIAL_DELETE retry above — verify that is the intent.
            raise
        self.invalidate_cache(self._parent(path))

    def mv(
        self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs
    ):
        """
        Move a source to a destination path.

        A note from the original [databricks API manual]
        (https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move).

        When moving a large number of files the API call will time out after
        approximately 60s, potentially resulting in partially moved data.
        Therefore, for operations that move more than 10k files, we strongly
        discourage using the DBFS REST API.

        Parameters
        ----------
        source_path: str
            From where to move (absolute path)
        destination_path: str
            To where to move (absolute path)
        recursive: bool
            Not implemented to far.
        maxdepth:
            Not implemented to far.
        """
        if recursive:
            raise NotImplementedError
        if maxdepth:
            raise NotImplementedError

        try:
            self._send_to_api(
                method="post",
                endpoint="move",
                json={"source_path": source_path, "destination_path": destination_path},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
        # listings of both parents are now stale
        self.invalidate_cache(self._parent(source_path))
        self.invalidate_cache(self._parent(destination_path))

    def _open(self, path, mode="rb", block_size="default", **kwargs):
        """
        Overwrite the base class method to make sure to create a DBFile.
        All arguments are copied from the base method.

        Only the default blocksize is allowed.
        """
        return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)

    def _send_to_api(self, method, endpoint, json):
        """
        Send the given json to the DBFS API
        using a get or post request (specified by the argument `method`).

        Parameters
        ----------
        method: str
            Which http method to use for communication; "get" or "post".
        endpoint: str
            Where to send the request to (last part of the API URL)
        json: dict
            Dictionary of information to send
        """
        if method == "post":
            session_call = self.session.post
        elif method == "get":
            session_call = self.session.get
        else:
            raise ValueError(f"Do not understand method {method}")

        url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)

        r = session_call(url, json=json)

        # The DBFS API will return a json, also in case of an exception.
        # We want to preserve this information as good as possible.
        try:
            r.raise_for_status()
        except requests.HTTPError as e:
            # try to extract json error message
            # if that fails, fall back to the original exception
            try:
                exception_json = e.response.json()
            except Exception:
                raise e from None

            raise DatabricksException(**exception_json) from e

        return r.json()

    def _create_handle(self, path, overwrite=True):
        """
        Internal function to create a handle, which can be used to
        write blocks of a file to DBFS.
        A handle has a unique identifier which needs to be passed
        whenever written during this transaction.
        The handle is active for 10 minutes - after that a new
        write transaction needs to be created.
        Make sure to close the handle after you are finished.

        Parameters
        ----------
        path: str
            Absolute path for this file.
        overwrite: bool
            If a file already exist at this location, either overwrite
            it or raise an exception.
        """
        try:
            r = self._send_to_api(
                method="post",
                endpoint="create",
                json={"path": path, "overwrite": overwrite},
            )
            return r["handle"]
        except DatabricksException as e:
            if e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise

    def _close_handle(self, handle):
        """
        Close a handle, which was opened by :func:`_create_handle`.

        Parameters
        ----------
        handle: str
            Which handle to close.
        """
        try:
            self._send_to_api(method="post", endpoint="close", json={"handle": handle})
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e

            raise

    def _add_data(self, handle, data):
        """
        Upload data to an already opened file handle
        (opened by :func:`_create_handle`).
        The maximal allowed data size is 1MB after
        conversion to base64.
        Remember to close the handle when you are finished.

        Parameters
        ----------
        handle: str
            Which handle to upload data to.
        data: bytes
            Block of data to add to the handle.
        """
        # the API transports blocks as base64 text
        data = base64.b64encode(data).decode()
        try:
            self._send_to_api(
                method="post",
                endpoint="add-block",
                json={"handle": handle, "data": data},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
                raise ValueError(e.message) from e

            raise

    def _get_data(self, path, start, end):
        """
        Download data in bytes from a given absolute path in a block
        from [start, start+length].
        The maximum number of allowed bytes to read is 1MB.

        Parameters
        ----------
        path: str
            Absolute path to download data from
        start: int
            Start position of the block
        end: int
            End position of the block
        """
        try:
            r = self._send_to_api(
                method="get",
                endpoint="read",
                json={"path": path, "offset": start, "length": end - start},
            )
            return base64.b64decode(r["data"])
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
                raise ValueError(e.message) from e

            raise

    def invalidate_cache(self, path=None):
        # drop one cached listing, or all of them when path is None
        if path is None:
            self.dircache.clear()
        else:
            self.dircache.pop(path, None)
        super().invalidate_cache(path)
387
+
388
+
389
class DatabricksFile(AbstractBufferedFile):
    """
    Helper class for files referenced in the DatabricksFileSystem.

    Uploads go through the DBFS handle API in 1MB blocks; downloads are
    fetched in 1MB-sized ranged reads.
    """

    DEFAULT_BLOCK_SIZE = 1 * 2**20  # only allowed block size

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        **kwargs,
    ):
        """
        Create a new instance of the DatabricksFile.

        The blocksize needs to be the default one (the DBFS API limits
        blocks to 1MB after base64 encoding).

        Raises
        ------
        ValueError
            If a non-default ``block_size`` is requested.
        """
        if block_size is None or block_size == "default":
            block_size = self.DEFAULT_BLOCK_SIZE

        # FIX: was an `assert`, which is stripped under `python -O`; validate
        # explicitly so the invariant always holds.
        if block_size != self.DEFAULT_BLOCK_SIZE:
            raise ValueError(
                f"Only the default block size is allowed, not {block_size}"
            )

        super().__init__(
            fs,
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_type=cache_type,
            cache_options=cache_options or {},
            **kwargs,
        )

    def _initiate_upload(self):
        """Internal function to start a file upload"""
        self.handle = self.fs._create_handle(self.path)

    def _upload_chunk(self, final=False):
        """Internal function to add a chunk of data to a started upload"""
        self.buffer.seek(0)
        data = self.buffer.getvalue()

        # the API only accepts blocks up to 1MB; split the buffer accordingly
        data_chunks = [
            data[start:end] for start, end in self._to_sized_blocks(len(data))
        ]

        for data_chunk in data_chunks:
            self.fs._add_data(handle=self.handle, data=data_chunk)

        if final:
            # closing the handle commits the upload
            self.fs._close_handle(handle=self.handle)
        return True

    def _fetch_range(self, start, end):
        """Internal function to download a block of data"""
        return_buffer = b""
        length = end - start
        # the API caps single reads at 1MB, so fetch in block-sized pieces
        for chunk_start, chunk_end in self._to_sized_blocks(length, start):
            return_buffer += self.fs._get_data(
                path=self.path, start=chunk_start, end=chunk_end
            )

        return return_buffer

    def _to_sized_blocks(self, length, start=0):
        """Yield (start, end) pairs covering [start, start+length) in blocksize steps."""
        end = start + length
        for data_chunk in range(start, end, self.blocksize):
            data_start = data_chunk
            data_end = min(end, data_chunk + self.blocksize)
            yield data_start, data_end
+ yield data_start, data_end
.venv/lib/python3.11/site-packages/fsspec/implementations/dirfs.py ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .. import filesystem
2
+ from ..asyn import AsyncFileSystem
3
+
4
+
5
class DirFileSystem(AsyncFileSystem):
    """Directory prefix filesystem

    The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
    is relative to the `path`. After performing the necessary paths operation it
    delegates everything to the wrapped filesystem.

    Every public method below follows the same pattern: prepend the prefix
    to incoming paths with ``_join``, call through to ``self.fs``, and (for
    methods that return paths) strip the prefix again with ``_relpath``.
    """

    protocol = "dir"

    def __init__(
        self,
        path=None,
        fs=None,
        fo=None,
        target_protocol=None,
        target_options=None,
        **storage_options,
    ):
        """
        Parameters
        ----------
        path: str
            Path to the directory.
        fs: AbstractFileSystem
            An instantiated filesystem to wrap.
        target_protocol, target_options:
            if fs is none, construct it from these
        fo: str
            Alternate for path; do not provide both
        """
        super().__init__(**storage_options)
        if fs is None:
            fs = filesystem(protocol=target_protocol, **(target_options or {}))
        # exactly one of `path` / `fo` must be supplied (XOR of the two)
        if (path is not None) ^ (fo is not None) is False:
            raise ValueError("Provide path or fo, not both")
        path = path or fo

        if self.asynchronous and not fs.async_impl:
            raise ValueError("can't use asynchronous with non-async fs")

        if fs.async_impl and self.asynchronous != fs.asynchronous:
            raise ValueError("both dirfs and fs should be in the same sync/async mode")

        self.path = fs._strip_protocol(path)
        self.fs = fs

    def _join(self, path):
        """Prepend the directory prefix to *path*.

        Accepts a str, a dict (keys are joined), or an iterable of str.
        """
        if isinstance(path, str):
            if not self.path:
                return path
            if not path:
                return self.path
            return self.fs.sep.join((self.path, self._strip_protocol(path)))
        if isinstance(path, dict):
            return {self._join(_path): value for _path, value in path.items()}
        return [self._join(_path) for _path in path]

    def _relpath(self, path):
        """Strip the directory prefix from *path* (str or iterable of str)."""
        if isinstance(path, str):
            if not self.path:
                return path
            # We need to account for S3FileSystem returning paths that do not
            # start with a '/'
            if path == self.path or (
                self.path.startswith(self.fs.sep) and path == self.path[1:]
            ):
                return ""
            prefix = self.path + self.fs.sep
            if self.path.startswith(self.fs.sep) and not path.startswith(self.fs.sep):
                prefix = prefix[1:]
            assert path.startswith(prefix)
            return path[len(prefix) :]
        return [self._relpath(_path) for _path in path]

    # Wrappers below

    @property
    def sep(self):
        return self.fs.sep

    async def set_session(self, *args, **kwargs):
        return await self.fs.set_session(*args, **kwargs)

    async def _rm_file(self, path, **kwargs):
        return await self.fs._rm_file(self._join(path), **kwargs)

    def rm_file(self, path, **kwargs):
        return self.fs.rm_file(self._join(path), **kwargs)

    async def _rm(self, path, *args, **kwargs):
        return await self.fs._rm(self._join(path), *args, **kwargs)

    def rm(self, path, *args, **kwargs):
        return self.fs.rm(self._join(path), *args, **kwargs)

    async def _cp_file(self, path1, path2, **kwargs):
        return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs)

    def cp_file(self, path1, path2, **kwargs):
        return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs)

    async def _copy(
        self,
        path1,
        path2,
        *args,
        **kwargs,
    ):
        return await self.fs._copy(
            self._join(path1),
            self._join(path2),
            *args,
            **kwargs,
        )

    def copy(self, path1, path2, *args, **kwargs):
        return self.fs.copy(
            self._join(path1),
            self._join(path2),
            *args,
            **kwargs,
        )

    async def _pipe(self, path, *args, **kwargs):
        return await self.fs._pipe(self._join(path), *args, **kwargs)

    def pipe(self, path, *args, **kwargs):
        return self.fs.pipe(self._join(path), *args, **kwargs)

    async def _pipe_file(self, path, *args, **kwargs):
        return await self.fs._pipe_file(self._join(path), *args, **kwargs)

    def pipe_file(self, path, *args, **kwargs):
        return self.fs.pipe_file(self._join(path), *args, **kwargs)

    async def _cat_file(self, path, *args, **kwargs):
        return await self.fs._cat_file(self._join(path), *args, **kwargs)

    def cat_file(self, path, *args, **kwargs):
        return self.fs.cat_file(self._join(path), *args, **kwargs)

    async def _cat(self, path, *args, **kwargs):
        ret = await self.fs._cat(
            self._join(path),
            *args,
            **kwargs,
        )

        # a dict result means multiple files were fetched; its keys are
        # full paths on the wrapped fs and must be made prefix-relative
        if isinstance(ret, dict):
            return {self._relpath(key): value for key, value in ret.items()}

        return ret

    def cat(self, path, *args, **kwargs):
        ret = self.fs.cat(
            self._join(path),
            *args,
            **kwargs,
        )

        if isinstance(ret, dict):
            return {self._relpath(key): value for key, value in ret.items()}

        return ret

    async def _put_file(self, lpath, rpath, **kwargs):
        # only the remote side (rpath) lives under the prefix
        return await self.fs._put_file(lpath, self._join(rpath), **kwargs)

    def put_file(self, lpath, rpath, **kwargs):
        return self.fs.put_file(lpath, self._join(rpath), **kwargs)

    async def _put(
        self,
        lpath,
        rpath,
        *args,
        **kwargs,
    ):
        return await self.fs._put(
            lpath,
            self._join(rpath),
            *args,
            **kwargs,
        )

    def put(self, lpath, rpath, *args, **kwargs):
        return self.fs.put(
            lpath,
            self._join(rpath),
            *args,
            **kwargs,
        )

    async def _get_file(self, rpath, lpath, **kwargs):
        return await self.fs._get_file(self._join(rpath), lpath, **kwargs)

    def get_file(self, rpath, lpath, **kwargs):
        return self.fs.get_file(self._join(rpath), lpath, **kwargs)

    async def _get(self, rpath, *args, **kwargs):
        return await self.fs._get(self._join(rpath), *args, **kwargs)

    def get(self, rpath, *args, **kwargs):
        return self.fs.get(self._join(rpath), *args, **kwargs)

    async def _isfile(self, path):
        return await self.fs._isfile(self._join(path))

    def isfile(self, path):
        return self.fs.isfile(self._join(path))

    async def _isdir(self, path):
        return await self.fs._isdir(self._join(path))

    def isdir(self, path):
        return self.fs.isdir(self._join(path))

    async def _size(self, path):
        return await self.fs._size(self._join(path))

    def size(self, path):
        return self.fs.size(self._join(path))

    async def _exists(self, path):
        return await self.fs._exists(self._join(path))

    def exists(self, path):
        return self.fs.exists(self._join(path))

    async def _info(self, path, **kwargs):
        return await self.fs._info(self._join(path), **kwargs)

    def info(self, path, **kwargs):
        return self.fs.info(self._join(path), **kwargs)

    async def _ls(self, path, detail=True, **kwargs):
        # copy before mutation: the wrapped fs may be serving this list
        # from its dircache
        ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
        if detail:
            out = []
            for entry in ret:
                entry = entry.copy()
                entry["name"] = self._relpath(entry["name"])
                out.append(entry)
            return out

        return self._relpath(ret)

    def ls(self, path, detail=True, **kwargs):
        ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
        if detail:
            out = []
            for entry in ret:
                entry = entry.copy()
                entry["name"] = self._relpath(entry["name"])
                out.append(entry)
            return out

        return self._relpath(ret)

    async def _walk(self, path, *args, **kwargs):
        async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs):
            yield self._relpath(root), dirs, files

    def walk(self, path, *args, **kwargs):
        for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs):
            yield self._relpath(root), dirs, files

    async def _glob(self, path, **kwargs):
        detail = kwargs.get("detail", False)
        ret = await self.fs._glob(self._join(path), **kwargs)
        if detail:
            return {self._relpath(path): info for path, info in ret.items()}
        return self._relpath(ret)

    def glob(self, path, **kwargs):
        detail = kwargs.get("detail", False)
        ret = self.fs.glob(self._join(path), **kwargs)
        if detail:
            return {self._relpath(path): info for path, info in ret.items()}
        return self._relpath(ret)

    async def _du(self, path, *args, **kwargs):
        total = kwargs.get("total", True)
        ret = await self.fs._du(self._join(path), *args, **kwargs)
        if total:
            # a single aggregate number; nothing to re-map
            return ret

        return {self._relpath(path): size for path, size in ret.items()}

    def du(self, path, *args, **kwargs):
        total = kwargs.get("total", True)
        ret = self.fs.du(self._join(path), *args, **kwargs)
        if total:
            return ret

        return {self._relpath(path): size for path, size in ret.items()}

    async def _find(self, path, *args, **kwargs):
        detail = kwargs.get("detail", False)
        ret = await self.fs._find(self._join(path), *args, **kwargs)
        if detail:
            return {self._relpath(path): info for path, info in ret.items()}
        return self._relpath(ret)

    def find(self, path, *args, **kwargs):
        detail = kwargs.get("detail", False)
        ret = self.fs.find(self._join(path), *args, **kwargs)
        if detail:
            return {self._relpath(path): info for path, info in ret.items()}
        return self._relpath(ret)

    async def _expand_path(self, path, *args, **kwargs):
        return self._relpath(
            await self.fs._expand_path(self._join(path), *args, **kwargs)
        )

    def expand_path(self, path, *args, **kwargs):
        return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs))

    async def _mkdir(self, path, *args, **kwargs):
        return await self.fs._mkdir(self._join(path), *args, **kwargs)

    def mkdir(self, path, *args, **kwargs):
        return self.fs.mkdir(self._join(path), *args, **kwargs)

    async def _makedirs(self, path, *args, **kwargs):
        return await self.fs._makedirs(self._join(path), *args, **kwargs)

    def makedirs(self, path, *args, **kwargs):
        return self.fs.makedirs(self._join(path), *args, **kwargs)

    def rmdir(self, path):
        return self.fs.rmdir(self._join(path))

    def mv(self, path1, path2, **kwargs):
        return self.fs.mv(
            self._join(path1),
            self._join(path2),
            **kwargs,
        )

    def touch(self, path, **kwargs):
        return self.fs.touch(self._join(path), **kwargs)

    def created(self, path):
        return self.fs.created(self._join(path))

    def modified(self, path):
        return self.fs.modified(self._join(path))

    def sign(self, path, *args, **kwargs):
        return self.fs.sign(self._join(path), *args, **kwargs)

    def __repr__(self):
        return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"

    def open(
        self,
        path,
        *args,
        **kwargs,
    ):
        return self.fs.open(
            self._join(path),
            *args,
            **kwargs,
        )

    async def open_async(
        self,
        path,
        *args,
        **kwargs,
    ):
        return await self.fs.open_async(
            self._join(path),
            *args,
            **kwargs,
        )
.venv/lib/python3.11/site-packages/fsspec/implementations/jupyter.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import re
4
+
5
+ import requests
6
+
7
+ import fsspec
8
+
9
+
10
class JupyterFileSystem(fsspec.AbstractFileSystem):
    """View of the files as seen by a Jupyter server (notebook or lab)

    All operations go through the Jupyter "contents" REST API, so the
    filesystem sees exactly what the server exposes. Notebooks are reported
    as plain files.
    """

    protocol = ("jupyter", "jlab")

    def __init__(self, url, tok=None, **kwargs):
        """
        Parameters
        ----------
        url : str
            Base URL of the server, like "http://127.0.0.1:8888". May include
            token in the string, which is given by the process when starting up
        tok : str
            If the token is obtained separately, can be given here
        kwargs
        """
        if "?" in url:
            if tok is None:
                try:
                    tok = re.findall("token=([a-z0-9]+)", url)[0]
                except IndexError as e:
                    raise ValueError("Could not determine token") from e
            url = url.split("?", 1)[0]
        self.url = url.rstrip("/") + "/api/contents"
        self.session = requests.Session()
        if tok:
            self.session.headers["Authorization"] = f"token {tok}"

        super().__init__(**kwargs)

    def ls(self, path, detail=True, **kwargs):
        """List objects at ``path``.

        Raises
        ------
        FileNotFoundError
            If the server reports the path as missing (HTTP 404).
        """
        path = self._strip_protocol(path)
        r = self.session.get(f"{self.url}/{path}")
        if r.status_code == 404:
            # BUG FIX: the exception was previously *returned* rather than
            # raised, so callers (e.g. ``exists``/``info``) never saw the
            # failure and received a truthy exception object instead.
            raise FileNotFoundError(path)
        r.raise_for_status()
        out = r.json()

        if out["type"] == "directory":
            out = out["content"]
        else:
            out = [out]
        for o in out:
            o["name"] = o.pop("path")
            o.pop("content")
            # present notebooks as ordinary files to generic callers
            if o["type"] == "notebook":
                o["type"] = "file"
        if detail:
            return out
        return [o["name"] for o in out]

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Return (a range of) the bytes of a file.

        Raises
        ------
        FileNotFoundError
            If the server reports the path as missing (HTTP 404).
        """
        path = self._strip_protocol(path)
        r = self.session.get(f"{self.url}/{path}")
        if r.status_code == 404:
            # BUG FIX: raise instead of returning the exception instance
            raise FileNotFoundError(path)
        r.raise_for_status()
        out = r.json()
        if out["format"] == "text":
            # data should be binary
            b = out["content"].encode()
        else:
            b = base64.b64decode(out["content"])
        # the API always returns the whole file; slice locally for ranges
        return b[start:end]

    def pipe_file(self, path, value, **_):
        """Upload ``value`` (bytes) as the complete contents of ``path``."""
        path = self._strip_protocol(path)
        json = {
            "name": path.rsplit("/", 1)[-1],
            "path": path,
            "size": len(value),
            "content": base64.b64encode(value).decode(),
            "format": "base64",
            "type": "file",
        }
        # NOTE(review): the response status is not checked here; a failed
        # upload is currently silent, matching previous behaviour.
        self.session.put(f"{self.url}/{path}", json=json)

    def mkdir(self, path, create_parents=True, **kwargs):
        """Create a directory, optionally creating parents recursively."""
        path = self._strip_protocol(path)
        if create_parents and "/" in path:
            self.mkdir(path.rsplit("/", 1)[0], True)
        json = {
            "name": path.rsplit("/", 1)[-1],
            "path": path,
            "size": None,
            "content": None,
            "type": "directory",
        }
        self.session.put(f"{self.url}/{path}", json=json)

    def _rm(self, path):
        """Delete a single path via the contents API."""
        path = self._strip_protocol(path)
        self.session.delete(f"{self.url}/{path}")

    def _open(self, path, mode="rb", **kwargs):
        """Open a file; reads are fully buffered, writes upload on close."""
        path = self._strip_protocol(path)
        if mode == "rb":
            # the whole file is downloaded up front; fine for the small
            # files this API is intended for
            data = self.cat_file(path)
            return io.BytesIO(data)
        else:
            return SimpleFileWriter(self, path, mode="wb")
112
+
113
+
114
class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
    """Buffered writer that uploads the whole file in a single request."""

    def _upload_chunk(self, final=False):
        """Never uploads a chunk until file is done

        Not suitable for large files
        """
        # Defer everything until the final flush; intermediate chunks are
        # simply kept in the buffer.
        if final is False:
            return False
        self.buffer.seek(0)
        payload = self.buffer.read()
        self.fs.pipe_file(self.path, payload)
.venv/lib/python3.11/site-packages/fsspec/implementations/local.py ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import io
3
+ import logging
4
+ import os
5
+ import os.path as osp
6
+ import shutil
7
+ import stat
8
+ import tempfile
9
+
10
+ from fsspec import AbstractFileSystem
11
+ from fsspec.compression import compr
12
+ from fsspec.core import get_compression
13
+ from fsspec.utils import isfilelike, stringify_path
14
+
15
+ logger = logging.getLogger("fsspec.local")
16
+
17
+
18
class LocalFileSystem(AbstractFileSystem):
    """Interface to files on local storage

    Parameters
    ----------
    auto_mkdir: bool
        Whether, when opening a file, the directory containing it should
        be created (if it doesn't already exist). This is assumed by pyarrow
        code.
    """

    root_marker = "/"
    protocol = "file", "local"
    local_file = True

    def __init__(self, auto_mkdir=False, **kwargs):
        super().__init__(**kwargs)
        self.auto_mkdir = auto_mkdir

    @property
    def fsid(self):
        # constant: every instance is a view of the same local storage
        return "local"

    def mkdir(self, path, create_parents=True, **kwargs):
        path = self._strip_protocol(path)
        if self.exists(path):
            raise FileExistsError(path)
        if create_parents:
            self.makedirs(path, exist_ok=True)
        else:
            os.mkdir(path, **kwargs)

    def makedirs(self, path, exist_ok=False):
        path = self._strip_protocol(path)
        os.makedirs(path, exist_ok=exist_ok)

    def rmdir(self, path):
        path = self._strip_protocol(path)
        os.rmdir(path)

    def ls(self, path, detail=False, **kwargs):
        """List contents of a directory, or the entry itself for a file.

        Entries that vanish between scandir and stat are silently skipped.
        """
        path = self._strip_protocol(path)
        info = self.info(path)
        if info["type"] == "directory":
            with os.scandir(path) as it:
                infos = []
                for f in it:
                    try:
                        infos.append(self.info(f))
                    except FileNotFoundError:
                        # raced with deletion; drop the entry
                        pass
        else:
            infos = [info]

        if not detail:
            return [i["name"] for i in infos]
        return infos

    def info(self, path, **kwargs):
        """Stat a path (or a ``os.DirEntry`` from ``scandir``).

        Symlinks are reported with ``islink=True``; their size is that of
        the target (0 if the target cannot be stat'ed).
        """
        if isinstance(path, os.DirEntry):
            # scandir DirEntry
            out = path.stat(follow_symlinks=False)
            link = path.is_symlink()
            if path.is_dir(follow_symlinks=False):
                t = "directory"
            elif path.is_file(follow_symlinks=False):
                t = "file"
            else:
                t = "other"

            size = out.st_size
            if link:
                try:
                    out2 = path.stat(follow_symlinks=True)
                    size = out2.st_size
                except OSError:
                    # broken symlink: report zero size rather than failing
                    size = 0
            path = self._strip_protocol(path.path)
        else:
            # str or path-like
            path = self._strip_protocol(path)
            out = os.stat(path, follow_symlinks=False)
            link = stat.S_ISLNK(out.st_mode)
            if link:
                # re-stat the target so type/size describe what the link
                # points at
                out = os.stat(path, follow_symlinks=True)
            size = out.st_size
            if stat.S_ISDIR(out.st_mode):
                t = "directory"
            elif stat.S_ISREG(out.st_mode):
                t = "file"
            else:
                t = "other"
        result = {
            "name": path,
            "size": size,
            "type": t,
            "created": out.st_ctime,
            "islink": link,
        }
        for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
            result[field] = getattr(out, f"st_{field}")
        if link:
            result["destination"] = os.readlink(path)
        return result

    def lexists(self, path, **kwargs):
        # like exists(), but True for broken symlinks too
        return osp.lexists(path)

    def cp_file(self, path1, path2, **kwargs):
        path1 = self._strip_protocol(path1)
        path2 = self._strip_protocol(path2)
        if self.auto_mkdir:
            self.makedirs(self._parent(path2), exist_ok=True)
        if self.isfile(path1):
            shutil.copyfile(path1, path2)
        elif self.isdir(path1):
            # copying a directory just ensures the destination dir exists
            self.mkdirs(path2, exist_ok=True)
        else:
            raise FileNotFoundError(path1)

    def isfile(self, path):
        path = self._strip_protocol(path)
        return os.path.isfile(path)

    def isdir(self, path):
        path = self._strip_protocol(path)
        return os.path.isdir(path)

    def get_file(self, path1, path2, callback=None, **kwargs):
        # ``callback`` accepted for API compatibility but unused: local
        # copies are a single operation
        if isfilelike(path2):
            with open(path1, "rb") as f:
                shutil.copyfileobj(f, path2)
        else:
            return self.cp_file(path1, path2, **kwargs)

    def put_file(self, path1, path2, callback=None, **kwargs):
        # local -> local "upload" is just a copy
        return self.cp_file(path1, path2, **kwargs)

    def mv(self, path1, path2, **kwargs):
        path1 = self._strip_protocol(path1)
        path2 = self._strip_protocol(path2)
        shutil.move(path1, path2)

    def link(self, src, dst, **kwargs):
        src = self._strip_protocol(src)
        dst = self._strip_protocol(dst)
        os.link(src, dst, **kwargs)

    def symlink(self, src, dst, **kwargs):
        src = self._strip_protocol(src)
        dst = self._strip_protocol(dst)
        os.symlink(src, dst, **kwargs)

    def islink(self, path) -> bool:
        return os.path.islink(self._strip_protocol(path))

    def rm_file(self, path):
        os.remove(self._strip_protocol(path))

    def rm(self, path, recursive=False, maxdepth=None):
        """Remove one or more paths; directories require ``recursive=True``.

        Refuses to delete the current working directory as a safety check.
        """
        if not isinstance(path, list):
            path = [path]

        for p in path:
            p = self._strip_protocol(p)
            if self.isdir(p):
                if not recursive:
                    raise ValueError("Cannot delete directory, set recursive=True")
                if osp.abspath(p) == os.getcwd():
                    raise ValueError("Cannot delete current working directory")
                shutil.rmtree(p)
            else:
                os.remove(p)

    def unstrip_protocol(self, name):
        name = self._strip_protocol(name)  # normalise for local/win/...
        return f"file://{name}"

    def _open(self, path, mode="rb", block_size=None, **kwargs):
        path = self._strip_protocol(path)
        if self.auto_mkdir and "w" in mode:
            self.makedirs(self._parent(path), exist_ok=True)
        return LocalFileOpener(path, mode, fs=self, **kwargs)

    def touch(self, path, truncate=True, **kwargs):
        path = self._strip_protocol(path)
        if self.auto_mkdir:
            self.makedirs(self._parent(path), exist_ok=True)
        if self.exists(path):
            # existing file: just bump the timestamps
            os.utime(path, None)
        else:
            open(path, "a").close()
        if truncate:
            os.truncate(path, 0)

    def created(self, path):
        info = self.info(path=path)
        return datetime.datetime.fromtimestamp(
            info["created"], tz=datetime.timezone.utc
        )

    def modified(self, path):
        info = self.info(path=path)
        return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)

    @classmethod
    def _parent(cls, path):
        path = cls._strip_protocol(path)
        if os.sep == "/":
            # posix native
            return path.rsplit("/", 1)[0] or "/"
        else:
            # NT
            path_ = path.rsplit("/", 1)[0]
            if len(path_) <= 3:
                if path_[1:2] == ":":
                    # nt root (something like c:/)
                    return path_[0] + ":/"
            # More cases may be required here
            return path_

    @classmethod
    def _strip_protocol(cls, path):
        path = stringify_path(path)
        if path.startswith("file://"):
            path = path[7:]
        elif path.startswith("file:"):
            path = path[5:]
        elif path.startswith("local://"):
            path = path[8:]
        elif path.startswith("local:"):
            path = path[6:]

        path = make_path_posix(path)
        if os.sep != "/":
            # This code-path is a stripped down version of
            # > drive, path = ntpath.splitdrive(path)
            if path[1:2] == ":":
                # Absolute drive-letter path, e.g. X:\Windows
                # Relative path with drive, e.g. X:Windows
                drive, path = path[:2], path[2:]
            elif path[:2] == "//":
                # UNC drives, e.g. \\server\share or \\?\UNC\server\share
                # Device drives, e.g. \\.\device or \\?\device
                if (index1 := path.find("/", 2)) == -1 or (
                    index2 := path.find("/", index1 + 1)
                ) == -1:
                    drive, path = path, ""
                else:
                    drive, path = path[:index2], path[index2:]
            else:
                # Relative path, e.g. Windows
                drive = ""

            path = path.rstrip("/") or cls.root_marker
            return drive + path

        else:
            return path.rstrip("/") or cls.root_marker

    def _isfilestore(self):
        # Inheriting from DaskFileSystem makes this False (S3, etc. were)
        # the original motivation. But we are a posix-like file system.
        # See https://github.com/dask/dask/issues/5526
        return True

    def chmod(self, path, mode):
        path = stringify_path(path)
        return os.chmod(path, mode)
287
+
288
+
289
def make_path_posix(path):
    """Make path generic and absolute for current OS

    Accepts a str, a path-like object, or a list/set/tuple of them (each
    element is converted and a container of the same type returned).
    Relative paths are resolved against the current working directory;
    on Windows, backslashes are normalised to forward slashes and drive
    letters / UNC prefixes are preserved.
    """
    if not isinstance(path, str):
        if isinstance(path, (list, set, tuple)):
            return type(path)(make_path_posix(p) for p in path)
        else:
            path = stringify_path(path)
            if not isinstance(path, str):
                raise TypeError(f"could not convert {path!r} to string")
    if os.sep == "/":
        # Native posix
        if path.startswith("/"):
            # most common fast case for posix
            return path
        elif path.startswith("~"):
            return osp.expanduser(path)
        elif path.startswith("./"):
            path = path[2:]
        elif path == ".":
            path = ""
        return f"{os.getcwd()}/{path}"
    else:
        # NT handling
        if path[0:1] == "/" and path[2:3] == ":":
            # path is like "/c:/local/path"
            path = path[1:]
        if path[1:2] == ":":
            # windows full path like "C:\\local\\path"
            if len(path) <= 3:
                # nt root (something like c:/)
                return path[0] + ":/"
            path = path.replace("\\", "/")
            return path
        elif path[0:1] == "~":
            return make_path_posix(osp.expanduser(path))
        elif path.startswith(("\\\\", "//")):
            # windows UNC/DFS-style paths
            return "//" + path[2:].replace("\\", "/")
        elif path.startswith(("\\", "/")):
            # windows relative path with root
            path = path.replace("\\", "/")
            return f"{osp.splitdrive(os.getcwd())[0]}{path}"
        else:
            path = path.replace("\\", "/")
            if path.startswith("./"):
                path = path[2:]
            elif path == ".":
                path = ""
            return f"{make_path_posix(os.getcwd())}/{path}"
338
+
339
+
340
def trailing_sep(path):
    """Return True if the path ends with a path separator.

    A forward slash is always considered a path separator, even on Operating
    Systems that normally use a backslash.
    """
    # TODO: if all incoming paths were posix-compliant then separator would
    # always be a forward slash, simplifying this function.
    # See https://github.com/fsspec/filesystem_spec/pull/1250
    seps = (os.sep,) if os.altsep is None else (os.sep, os.altsep)
    return path.endswith(seps)
350
+
351
+
352
class LocalFileOpener(io.IOBase):
    """File-like wrapper around a local open file.

    Supports optional transparent compression, pickling of read-mode
    handles (position is preserved), and deferred commit: with
    ``autocommit=False``, writes go to a temporary file which is moved
    into place by ``commit`` or removed by ``discard``.
    """

    def __init__(
        self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
    ):
        logger.debug("open file: %s", path)
        self.path = path
        self.mode = mode
        self.fs = fs
        self.f = None
        self.autocommit = autocommit
        self.compression = get_compression(path, compression)
        self.blocksize = io.DEFAULT_BUFFER_SIZE
        self._open()

    def _open(self):
        # (Re)open the underlying file handle if absent or closed.
        if self.f is None or self.f.closed:
            if self.autocommit or "w" not in self.mode:
                self.f = open(self.path, mode=self.mode)
                if self.compression:
                    compress = compr[self.compression]
                    self.f = compress(self.f, mode=self.mode)
            else:
                # writes are buffered in a temp file until commit()
                # TODO: check if path is writable?
                i, name = tempfile.mkstemp()
                os.close(i)  # we want normal open and normal buffered file
                self.temp = name
                self.f = open(name, mode=self.mode)
            if "w" not in self.mode:
                self.size = self.f.seek(0, 2)
                self.f.seek(0)
                self.f.size = self.size

    def _fetch_range(self, start, end):
        # probably only used by cached FS
        if "r" not in self.mode:
            raise ValueError
        self._open()
        self.f.seek(start)
        return self.f.read(end - start)

    def __setstate__(self, state):
        # restore a pickled read-mode file, reopening and re-seeking;
        # write-mode files cannot be unpickled (see __getstate__)
        self.f = None
        loc = state.pop("loc", None)
        self.__dict__.update(state)
        if "r" in state["mode"]:
            self.f = None
            self._open()
            self.f.seek(loc)

    def __getstate__(self):
        d = self.__dict__.copy()
        d.pop("f")
        if "r" in self.mode:
            # remember position so __setstate__ can restore it
            d["loc"] = self.f.tell()
        else:
            if not self.f.closed:
                raise ValueError("Cannot serialise open write-mode local file")
        return d

    def commit(self):
        """Move the temp file into place (non-autocommit write mode only)."""
        if self.autocommit:
            raise RuntimeError("Can only commit if not already set to autocommit")
        shutil.move(self.temp, self.path)

    def discard(self):
        """Throw away the temp file (non-autocommit write mode only)."""
        if self.autocommit:
            raise RuntimeError("Cannot discard if set to autocommit")
        os.remove(self.temp)

    def readable(self) -> bool:
        return True

    def writable(self) -> bool:
        return "r" not in self.mode

    # thin delegation to the underlying file object

    def read(self, *args, **kwargs):
        return self.f.read(*args, **kwargs)

    def write(self, *args, **kwargs):
        return self.f.write(*args, **kwargs)

    def tell(self, *args, **kwargs):
        return self.f.tell(*args, **kwargs)

    def seek(self, *args, **kwargs):
        return self.f.seek(*args, **kwargs)

    def seekable(self, *args, **kwargs):
        return self.f.seekable(*args, **kwargs)

    def readline(self, *args, **kwargs):
        return self.f.readline(*args, **kwargs)

    def readlines(self, *args, **kwargs):
        return self.f.readlines(*args, **kwargs)

    def close(self):
        return self.f.close()

    def truncate(self, size=None) -> int:
        return self.f.truncate(size)

    @property
    def closed(self):
        return self.f.closed

    def fileno(self):
        # NOTE(review): ``self.raw`` resolves through __getattr__ to
        # ``self.f.raw`` (the unbuffered layer of a buffered file object)
        return self.raw.fileno()

    def flush(self) -> None:
        self.f.flush()

    def __iter__(self):
        return self.f.__iter__()

    def __getattr__(self, item):
        # anything not defined here falls through to the wrapped file
        return getattr(self.f, item)

    def __enter__(self):
        self._incontext = True
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._incontext = False
        self.f.__exit__(exc_type, exc_value, traceback)
.venv/lib/python3.11/site-packages/fsspec/implementations/reference.py ADDED
@@ -0,0 +1,1306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import collections
3
+ import io
4
+ import itertools
5
+ import logging
6
+ import math
7
+ import os
8
+ from functools import lru_cache
9
+ from itertools import chain
10
+ from typing import TYPE_CHECKING, Literal
11
+
12
+ import fsspec.core
13
+ from fsspec.spec import AbstractBufferedFile
14
+
15
+ try:
16
+ import ujson as json
17
+ except ImportError:
18
+ if not TYPE_CHECKING:
19
+ import json
20
+
21
+ from fsspec.asyn import AsyncFileSystem
22
+ from fsspec.callbacks import DEFAULT_CALLBACK
23
+ from fsspec.core import filesystem, open, split_protocol
24
+ from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
25
+ from fsspec.utils import isfilelike, merge_offset_ranges, other_paths
26
+
27
+ logger = logging.getLogger("fsspec.reference")
28
+
29
+
30
+ class ReferenceNotReachable(RuntimeError):
31
+ def __init__(self, reference, target, *args):
32
+ super().__init__(*args)
33
+ self.reference = reference
34
+ self.target = target
35
+
36
+ def __str__(self):
37
+ return f'Reference "{self.reference}" failed to fetch target {self.target}'
38
+
39
+
40
+ def _first(d):
41
+ return next(iter(d.values()))
42
+
43
+
44
+ def _prot_in_references(path, references):
45
+ ref = references.get(path)
46
+ if isinstance(ref, (list, tuple)) and isinstance(ref[0], str):
47
+ return split_protocol(ref[0])[0] if ref[0] else ref[0]
48
+
49
+
50
+ def _protocol_groups(paths, references):
51
+ if isinstance(paths, str):
52
+ return {_prot_in_references(paths, references): [paths]}
53
+ out = {}
54
+ for path in paths:
55
+ protocol = _prot_in_references(path, references)
56
+ out.setdefault(protocol, []).append(path)
57
+ return out
58
+
59
+
60
+ class RefsValuesView(collections.abc.ValuesView):
61
+ def __iter__(self):
62
+ for val in self._mapping.zmetadata.values():
63
+ yield json.dumps(val).encode()
64
+ yield from self._mapping._items.values()
65
+ for field in self._mapping.listdir():
66
+ chunk_sizes = self._mapping._get_chunk_sizes(field)
67
+ if len(chunk_sizes) == 0:
68
+ yield self._mapping[field + "/0"]
69
+ continue
70
+ yield from self._mapping._generate_all_records(field)
71
+
72
+
73
+ class RefsItemsView(collections.abc.ItemsView):
74
+ def __iter__(self):
75
+ return zip(self._mapping.keys(), self._mapping.values())
76
+
77
+
78
+ def ravel_multi_index(idx, sizes):
79
+ val = 0
80
+ mult = 1
81
+ for i, s in zip(idx[::-1], sizes[::-1]):
82
+ val += i * mult
83
+ mult *= s
84
+ return val
85
+
86
+
87
+ class LazyReferenceMapper(collections.abc.MutableMapping):
88
+ """This interface can be used to read/write references from Parquet stores.
89
+ It is not intended for other types of references.
90
+ It can be used with Kerchunk's MultiZarrToZarr method to combine
91
+ references into a parquet store.
92
+ Examples of this use-case can be found here:
93
+ https://fsspec.github.io/kerchunk/advanced.html?highlight=parquet#parquet-storage"""
94
+
95
+ # import is class level to prevent numpy dep requirement for fsspec
96
+ @property
97
+ def np(self):
98
+ import numpy as np
99
+
100
+ return np
101
+
102
+ @property
103
+ def pd(self):
104
+ import pandas as pd
105
+
106
+ return pd
107
+
108
+ def __init__(
109
+ self,
110
+ root,
111
+ fs=None,
112
+ out_root=None,
113
+ cache_size=128,
114
+ categorical_threshold=10,
115
+ engine: Literal["fastparquet", "pyarrow"] = "fastparquet",
116
+ ):
117
+ """
118
+
119
+ This instance will be writable, storing changes in memory until full partitions
120
+ are accumulated or .flush() is called.
121
+
122
+ To create an empty lazy store, use .create()
123
+
124
+ Parameters
125
+ ----------
126
+ root : str
127
+ Root of parquet store
128
+ fs : fsspec.AbstractFileSystem
129
+ fsspec filesystem object, default is local filesystem.
130
+ cache_size : int, default=128
131
+ Maximum size of LRU cache, where cache_size*record_size denotes
132
+ the total number of references that can be loaded in memory at once.
133
+ categorical_threshold : int
134
+ Encode urls as pandas.Categorical to reduce memory footprint if the ratio
135
+ of the number of unique urls to total number of refs for each variable
136
+ is greater than or equal to this number. (default 10)
137
+ engine: Literal["fastparquet","pyarrow"]
138
+ Engine choice for reading parquet files. (default is "fastparquet")
139
+ """
140
+
141
+ self.root = root
142
+ self.chunk_sizes = {}
143
+ self.out_root = out_root or self.root
144
+ self.cat_thresh = categorical_threshold
145
+ self.engine = engine
146
+ self.cache_size = cache_size
147
+ self.url = self.root + "/{field}/refs.{record}.parq"
148
+ # TODO: derive fs from `root`
149
+ self.fs = fsspec.filesystem("file") if fs is None else fs
150
+
151
+ from importlib.util import find_spec
152
+
153
+ if self.engine == "pyarrow" and find_spec("pyarrow") is None:
154
+ raise ImportError("engine choice `pyarrow` is not installed.")
155
+
156
+ def __getattr__(self, item):
157
+ if item in ("_items", "record_size", "zmetadata"):
158
+ self.setup()
159
+ # avoid possible recursion if setup fails somehow
160
+ return self.__dict__[item]
161
+ raise AttributeError(item)
162
+
163
+ def setup(self):
164
+ self._items = {}
165
+ self._items[".zmetadata"] = self.fs.cat_file(
166
+ "/".join([self.root, ".zmetadata"])
167
+ )
168
+ met = json.loads(self._items[".zmetadata"])
169
+ self.record_size = met["record_size"]
170
+ self.zmetadata = met["metadata"]
171
+
172
+ # Define function to open and decompress refs
173
+ @lru_cache(maxsize=self.cache_size)
174
+ def open_refs(field, record):
175
+ """cached parquet file loader"""
176
+ path = self.url.format(field=field, record=record)
177
+ data = io.BytesIO(self.fs.cat_file(path))
178
+ try:
179
+ df = self.pd.read_parquet(data, engine=self.engine)
180
+ refs = {c: df[c].to_numpy() for c in df.columns}
181
+ except OSError:
182
+ refs = None
183
+ return refs
184
+
185
+ self.open_refs = open_refs
186
+
187
+ @staticmethod
188
+ def create(root, storage_options=None, fs=None, record_size=10000, **kwargs):
189
+ """Make empty parquet reference set
190
+
191
+ First deletes the contents of the given directory, if it exists.
192
+
193
+ Parameters
194
+ ----------
195
+ root: str
196
+ Directory to contain the output; will be created
197
+ storage_options: dict | None
198
+ For making the filesystem to use for writing is fs is None
199
+ fs: FileSystem | None
200
+ Filesystem for writing
201
+ record_size: int
202
+ Number of references per parquet file
203
+ kwargs: passed to __init__
204
+
205
+ Returns
206
+ -------
207
+ LazyReferenceMapper instance
208
+ """
209
+ met = {"metadata": {}, "record_size": record_size}
210
+ if fs is None:
211
+ fs, root = fsspec.core.url_to_fs(root, **(storage_options or {}))
212
+ if fs.exists(root):
213
+ fs.rm(root, recursive=True)
214
+ fs.makedirs(root, exist_ok=True)
215
+ fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
216
+ return LazyReferenceMapper(root, fs, **kwargs)
217
+
218
+ @lru_cache()
219
+ def listdir(self):
220
+ """List top-level directories"""
221
+ dirs = (p.rsplit("/", 1)[0] for p in self.zmetadata if not p.startswith(".z"))
222
+ return set(dirs)
223
+
224
+ def ls(self, path="", detail=True):
225
+ """Shortcut file listings"""
226
+ path = path.rstrip("/")
227
+ pathdash = path + "/" if path else ""
228
+ dirnames = self.listdir()
229
+ dirs = [
230
+ d
231
+ for d in dirnames
232
+ if d.startswith(pathdash) and "/" not in d.lstrip(pathdash)
233
+ ]
234
+ if dirs:
235
+ others = {
236
+ f
237
+ for f in chain(
238
+ [".zmetadata"],
239
+ (name for name in self.zmetadata),
240
+ (name for name in self._items),
241
+ )
242
+ if f.startswith(pathdash) and "/" not in f.lstrip(pathdash)
243
+ }
244
+ if detail is False:
245
+ others.update(dirs)
246
+ return sorted(others)
247
+ dirinfo = [{"name": name, "type": "directory", "size": 0} for name in dirs]
248
+ fileinfo = [
249
+ {
250
+ "name": name,
251
+ "type": "file",
252
+ "size": len(
253
+ json.dumps(self.zmetadata[name])
254
+ if name in self.zmetadata
255
+ else self._items[name]
256
+ ),
257
+ }
258
+ for name in others
259
+ ]
260
+ return sorted(dirinfo + fileinfo, key=lambda s: s["name"])
261
+ field = path
262
+ others = set(
263
+ [name for name in self.zmetadata if name.startswith(f"{path}/")]
264
+ + [name for name in self._items if name.startswith(f"{path}/")]
265
+ )
266
+ fileinfo = [
267
+ {
268
+ "name": name,
269
+ "type": "file",
270
+ "size": len(
271
+ json.dumps(self.zmetadata[name])
272
+ if name in self.zmetadata
273
+ else self._items[name]
274
+ ),
275
+ }
276
+ for name in others
277
+ ]
278
+ keys = self._keys_in_field(field)
279
+
280
+ if detail is False:
281
+ return list(others) + list(keys)
282
+ recs = self._generate_all_records(field)
283
+ recinfo = [
284
+ {"name": name, "type": "file", "size": rec[-1]}
285
+ for name, rec in zip(keys, recs)
286
+ if rec[0] # filters out path==None, deleted/missing
287
+ ]
288
+ return fileinfo + recinfo
289
+
290
+ def _load_one_key(self, key):
291
+ """Get the reference for one key
292
+
293
+ Returns bytes, one-element list or three-element list.
294
+ """
295
+ if key in self._items:
296
+ return self._items[key]
297
+ elif key in self.zmetadata:
298
+ return json.dumps(self.zmetadata[key]).encode()
299
+ elif "/" not in key or self._is_meta(key):
300
+ raise KeyError(key)
301
+ field, _ = key.rsplit("/", 1)
302
+ record, ri, chunk_size = self._key_to_record(key)
303
+ maybe = self._items.get((field, record), {}).get(ri, False)
304
+ if maybe is None:
305
+ # explicitly deleted
306
+ raise KeyError
307
+ elif maybe:
308
+ return maybe
309
+ elif chunk_size == 0:
310
+ return b""
311
+
312
+ # Chunk keys can be loaded from row group and cached in LRU cache
313
+ try:
314
+ refs = self.open_refs(field, record)
315
+ except (ValueError, TypeError, FileNotFoundError) as exc:
316
+ raise KeyError(key) from exc
317
+ columns = ["path", "offset", "size", "raw"]
318
+ selection = [refs[c][ri] if c in refs else None for c in columns]
319
+ raw = selection[-1]
320
+ if raw is not None:
321
+ return raw
322
+ if selection[0] is None:
323
+ raise KeyError("This reference does not exist or has been deleted")
324
+ if selection[1:3] == [0, 0]:
325
+ # URL only
326
+ return selection[:1]
327
+ # URL, offset, size
328
+ return selection[:3]
329
+
330
+ @lru_cache(4096)
331
+ def _key_to_record(self, key):
332
+ """Details needed to construct a reference for one key"""
333
+ field, chunk = key.rsplit("/", 1)
334
+ chunk_sizes = self._get_chunk_sizes(field)
335
+ if len(chunk_sizes) == 0:
336
+ return 0, 0, 0
337
+ chunk_idx = [int(c) for c in chunk.split(".")]
338
+ chunk_number = ravel_multi_index(chunk_idx, chunk_sizes)
339
+ record = chunk_number // self.record_size
340
+ ri = chunk_number % self.record_size
341
+ return record, ri, len(chunk_sizes)
342
+
343
+ def _get_chunk_sizes(self, field):
344
+ """The number of chunks along each axis for a given field"""
345
+ if field not in self.chunk_sizes:
346
+ zarray = self.zmetadata[f"{field}/.zarray"]
347
+ size_ratio = [
348
+ math.ceil(s / c) for s, c in zip(zarray["shape"], zarray["chunks"])
349
+ ]
350
+ self.chunk_sizes[field] = size_ratio or [1]
351
+ return self.chunk_sizes[field]
352
+
353
+ def _generate_record(self, field, record):
354
+ """The references for a given parquet file of a given field"""
355
+ refs = self.open_refs(field, record)
356
+ it = iter(zip(*refs.values()))
357
+ if len(refs) == 3:
358
+ # All urls
359
+ return (list(t) for t in it)
360
+ elif len(refs) == 1:
361
+ # All raws
362
+ return refs["raw"]
363
+ else:
364
+ # Mix of urls and raws
365
+ return (list(t[:3]) if not t[3] else t[3] for t in it)
366
+
367
+ def _generate_all_records(self, field):
368
+ """Load all the references within a field by iterating over the parquet files"""
369
+ nrec = 1
370
+ for ch in self._get_chunk_sizes(field):
371
+ nrec *= ch
372
+ nrec = math.ceil(nrec / self.record_size)
373
+ for record in range(nrec):
374
+ yield from self._generate_record(field, record)
375
+
376
+ def values(self):
377
+ return RefsValuesView(self)
378
+
379
+ def items(self):
380
+ return RefsItemsView(self)
381
+
382
+ def __hash__(self):
383
+ return id(self)
384
+
385
+ def __getitem__(self, key):
386
+ return self._load_one_key(key)
387
+
388
+ def __setitem__(self, key, value):
389
+ if "/" in key and not self._is_meta(key):
390
+ field, chunk = key.rsplit("/", 1)
391
+ record, i, _ = self._key_to_record(key)
392
+ subdict = self._items.setdefault((field, record), {})
393
+ subdict[i] = value
394
+ if len(subdict) == self.record_size:
395
+ self.write(field, record)
396
+ else:
397
+ # metadata or top-level
398
+ if hasattr(value, "to_bytes"):
399
+ val = value.to_bytes().decode()
400
+ elif isinstance(value, bytes):
401
+ val = value.decode()
402
+ else:
403
+ val = value
404
+ self._items[key] = val
405
+ new_value = json.loads(val)
406
+ self.zmetadata[key] = {**self.zmetadata.get(key, {}), **new_value}
407
+
408
+ @staticmethod
409
+ def _is_meta(key):
410
+ return key.startswith(".z") or "/.z" in key
411
+
412
+ def __delitem__(self, key):
413
+ if key in self._items:
414
+ del self._items[key]
415
+ elif key in self.zmetadata:
416
+ del self.zmetadata[key]
417
+ else:
418
+ if "/" in key and not self._is_meta(key):
419
+ field, _ = key.rsplit("/", 1)
420
+ record, i, _ = self._key_to_record(key)
421
+ subdict = self._items.setdefault((field, record), {})
422
+ subdict[i] = None
423
+ if len(subdict) == self.record_size:
424
+ self.write(field, record)
425
+ else:
426
+ # metadata or top-level
427
+ self._items[key] = None
428
+
429
+ def write(self, field, record, base_url=None, storage_options=None):
430
+ # extra requirements if writing
431
+ import kerchunk.df
432
+ import numpy as np
433
+ import pandas as pd
434
+
435
+ partition = self._items[(field, record)]
436
+ original = False
437
+ if len(partition) < self.record_size:
438
+ try:
439
+ original = self.open_refs(field, record)
440
+ except OSError:
441
+ pass
442
+
443
+ if original:
444
+ paths = original["path"]
445
+ offsets = original["offset"]
446
+ sizes = original["size"]
447
+ raws = original["raw"]
448
+ else:
449
+ paths = np.full(self.record_size, np.nan, dtype="O")
450
+ offsets = np.zeros(self.record_size, dtype="int64")
451
+ sizes = np.zeros(self.record_size, dtype="int64")
452
+ raws = np.full(self.record_size, np.nan, dtype="O")
453
+ for j, data in partition.items():
454
+ if isinstance(data, list):
455
+ if (
456
+ str(paths.dtype) == "category"
457
+ and data[0] not in paths.dtype.categories
458
+ ):
459
+ paths = paths.add_categories(data[0])
460
+ paths[j] = data[0]
461
+ if len(data) > 1:
462
+ offsets[j] = data[1]
463
+ sizes[j] = data[2]
464
+ elif data is None:
465
+ # delete
466
+ paths[j] = None
467
+ offsets[j] = 0
468
+ sizes[j] = 0
469
+ raws[j] = None
470
+ else:
471
+ # this is the only call into kerchunk, could remove
472
+ raws[j] = kerchunk.df._proc_raw(data)
473
+ # TODO: only save needed columns
474
+ df = pd.DataFrame(
475
+ {
476
+ "path": paths,
477
+ "offset": offsets,
478
+ "size": sizes,
479
+ "raw": raws,
480
+ },
481
+ copy=False,
482
+ )
483
+ if df.path.count() / (df.path.nunique() or 1) > self.cat_thresh:
484
+ df["path"] = df["path"].astype("category")
485
+ object_encoding = {"raw": "bytes", "path": "utf8"}
486
+ has_nulls = ["path", "raw"]
487
+
488
+ fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq"
489
+ self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True)
490
+
491
+ if self.engine == "pyarrow":
492
+ df_backend_kwargs = {"write_statistics": False}
493
+ elif self.engine == "fastparquet":
494
+ df_backend_kwargs = {
495
+ "stats": False,
496
+ "object_encoding": object_encoding,
497
+ "has_nulls": has_nulls,
498
+ }
499
+ else:
500
+ raise NotImplementedError(f"{self.engine} not supported")
501
+
502
+ df.to_parquet(
503
+ fn,
504
+ engine=self.engine,
505
+ storage_options=storage_options
506
+ or getattr(self.fs, "storage_options", None),
507
+ compression="zstd",
508
+ index=False,
509
+ **df_backend_kwargs,
510
+ )
511
+
512
+ partition.clear()
513
+ self._items.pop((field, record))
514
+
515
+ def flush(self, base_url=None, storage_options=None):
516
+ """Output any modified or deleted keys
517
+
518
+ Parameters
519
+ ----------
520
+ base_url: str
521
+ Location of the output
522
+ """
523
+
524
+ # write what we have so far and clear sub chunks
525
+ for thing in list(self._items):
526
+ if isinstance(thing, tuple):
527
+ field, record = thing
528
+ self.write(
529
+ field,
530
+ record,
531
+ base_url=base_url,
532
+ storage_options=storage_options,
533
+ )
534
+
535
+ # gather .zmetadata from self._items and write that too
536
+ for k in list(self._items):
537
+ if k != ".zmetadata" and ".z" in k:
538
+ self.zmetadata[k] = json.loads(self._items.pop(k))
539
+ met = {"metadata": self.zmetadata, "record_size": self.record_size}
540
+ self._items.clear()
541
+ self._items[".zmetadata"] = json.dumps(met).encode()
542
+ self.fs.pipe(
543
+ "/".join([base_url or self.out_root, ".zmetadata"]),
544
+ self._items[".zmetadata"],
545
+ )
546
+
547
+ # TODO: only clear those that we wrote to?
548
+ self.open_refs.cache_clear()
549
+
550
+ def __len__(self):
551
+ # Caveat: This counts expected references, not actual - but is fast
552
+ count = 0
553
+ for field in self.listdir():
554
+ if field.startswith("."):
555
+ count += 1
556
+ else:
557
+ count += math.prod(self._get_chunk_sizes(field))
558
+ count += len(self.zmetadata) # all metadata keys
559
+ # any other files not in reference partitions
560
+ count += sum(1 for _ in self._items if not isinstance(_, tuple))
561
+ return count
562
+
563
+ def __iter__(self):
564
+ # Caveat: returns only existing keys, so the number of these does not
565
+ # match len(self)
566
+ metas = set(self.zmetadata)
567
+ metas.update(self._items)
568
+ for bit in metas:
569
+ if isinstance(bit, str):
570
+ yield bit
571
+ for field in self.listdir():
572
+ for k in self._keys_in_field(field):
573
+ if k in self:
574
+ yield k
575
+
576
+ def __contains__(self, item):
577
+ try:
578
+ self._load_one_key(item)
579
+ return True
580
+ except KeyError:
581
+ return False
582
+
583
+ def _keys_in_field(self, field):
584
+ """List key names in given field
585
+
586
+ Produces strings like "field/x.y" appropriate from the chunking of the array
587
+ """
588
+ chunk_sizes = self._get_chunk_sizes(field)
589
+ if len(chunk_sizes) == 0:
590
+ yield field + "/0"
591
+ return
592
+ inds = itertools.product(*(range(i) for i in chunk_sizes))
593
+ for ind in inds:
594
+ yield field + "/" + ".".join([str(c) for c in ind])
595
+
596
+
597
+ class ReferenceFileSystem(AsyncFileSystem):
598
+ """View byte ranges of some other file as a file system
599
+ Initial version: single file system target, which must support
600
+ async, and must allow start and end args in _cat_file. Later versions
601
+ may allow multiple arbitrary URLs for the targets.
602
+ This FileSystem is read-only. It is designed to be used with async
603
+ targets (for now). We do not get original file details from the target FS.
604
+ Configuration is by passing a dict of references at init, or a URL to
605
+ a JSON file containing the same; this dict
606
+ can also contain concrete data for some set of paths.
607
+ Reference dict format:
608
+ {path0: bytes_data, path1: (target_url, offset, size)}
609
+ https://github.com/fsspec/kerchunk/blob/main/README.md
610
+ """
611
+
612
+ protocol = "reference"
613
+ cachable = False
614
+
615
+ def __init__(
616
+ self,
617
+ fo,
618
+ target=None,
619
+ ref_storage_args=None,
620
+ target_protocol=None,
621
+ target_options=None,
622
+ remote_protocol=None,
623
+ remote_options=None,
624
+ fs=None,
625
+ template_overrides=None,
626
+ simple_templates=True,
627
+ max_gap=64_000,
628
+ max_block=256_000_000,
629
+ cache_size=128,
630
+ **kwargs,
631
+ ):
632
+ """
633
+ Parameters
634
+ ----------
635
+ fo : dict or str
636
+ The set of references to use for this instance, with a structure as above.
637
+ If str referencing a JSON file, will use fsspec.open, in conjunction
638
+ with target_options and target_protocol to open and parse JSON at this
639
+ location. If a directory, then assume references are a set of parquet
640
+ files to be loaded lazily.
641
+ target : str
642
+ For any references having target_url as None, this is the default file
643
+ target to use
644
+ ref_storage_args : dict
645
+ If references is a str, use these kwargs for loading the JSON file.
646
+ Deprecated: use target_options instead.
647
+ target_protocol : str
648
+ Used for loading the reference file, if it is a path. If None, protocol
649
+ will be derived from the given path
650
+ target_options : dict
651
+ Extra FS options for loading the reference file ``fo``, if given as a path
652
+ remote_protocol : str
653
+ The protocol of the filesystem on which the references will be evaluated
654
+ (unless fs is provided). If not given, will be derived from the first
655
+ URL that has a protocol in the templates or in the references, in that
656
+ order.
657
+ remote_options : dict
658
+ kwargs to go with remote_protocol
659
+ fs : AbstractFileSystem | dict(str, (AbstractFileSystem | dict))
660
+ Directly provide a file system(s):
661
+ - a single filesystem instance
662
+ - a dict of protocol:filesystem, where each value is either a filesystem
663
+ instance, or a dict of kwargs that can be used to create in
664
+ instance for the given protocol
665
+
666
+ If this is given, remote_options and remote_protocol are ignored.
667
+ template_overrides : dict
668
+ Swap out any templates in the references file with these - useful for
669
+ testing.
670
+ simple_templates: bool
671
+ Whether templates can be processed with simple replace (True) or if
672
+ jinja is needed (False, much slower). All reference sets produced by
673
+ ``kerchunk`` are simple in this sense, but the spec allows for complex.
674
+ max_gap, max_block: int
675
+ For merging multiple concurrent requests to the same remote file.
676
+ Neighboring byte ranges will only be merged when their
677
+ inter-range gap is <= ``max_gap``. Default is 64KB. Set to 0
678
+ to only merge when it requires no extra bytes. Pass a negative
679
+ number to disable merging, appropriate for local target files.
680
+ Neighboring byte ranges will only be merged when the size of
681
+ the aggregated range is <= ``max_block``. Default is 256MB.
682
+ cache_size : int
683
+ Maximum size of LRU cache, where cache_size*record_size denotes
684
+ the total number of references that can be loaded in memory at once.
685
+ Only used for lazily loaded references.
686
+ kwargs : passed to parent class
687
+ """
688
+ super().__init__(**kwargs)
689
+ self.target = target
690
+ self.template_overrides = template_overrides
691
+ self.simple_templates = simple_templates
692
+ self.templates = {}
693
+ self.fss = {}
694
+ self._dircache = {}
695
+ self.max_gap = max_gap
696
+ self.max_block = max_block
697
+ if isinstance(fo, str):
698
+ dic = dict(
699
+ **(ref_storage_args or target_options or {}), protocol=target_protocol
700
+ )
701
+ ref_fs, fo2 = fsspec.core.url_to_fs(fo, **dic)
702
+ if ref_fs.isfile(fo2):
703
+ # text JSON
704
+ with fsspec.open(fo, "rb", **dic) as f:
705
+ logger.info("Read reference from URL %s", fo)
706
+ text = json.load(f)
707
+ self._process_references(text, template_overrides)
708
+ else:
709
+ # Lazy parquet refs
710
+ logger.info("Open lazy reference dict from URL %s", fo)
711
+ self.references = LazyReferenceMapper(
712
+ fo2,
713
+ fs=ref_fs,
714
+ cache_size=cache_size,
715
+ )
716
+ else:
717
+ # dictionaries
718
+ self._process_references(fo, template_overrides)
719
+ if isinstance(fs, dict):
720
+ self.fss = {
721
+ k: (
722
+ fsspec.filesystem(k.split(":", 1)[0], **opts)
723
+ if isinstance(opts, dict)
724
+ else opts
725
+ )
726
+ for k, opts in fs.items()
727
+ }
728
+ if None not in self.fss:
729
+ self.fss[None] = filesystem("file")
730
+ return
731
+ if fs is not None:
732
+ # single remote FS
733
+ remote_protocol = (
734
+ fs.protocol[0] if isinstance(fs.protocol, tuple) else fs.protocol
735
+ )
736
+ self.fss[remote_protocol] = fs
737
+
738
+ if remote_protocol is None:
739
+ # get single protocol from any templates
740
+ for ref in self.templates.values():
741
+ if callable(ref):
742
+ ref = ref()
743
+ protocol, _ = fsspec.core.split_protocol(ref)
744
+ if protocol and protocol not in self.fss:
745
+ fs = filesystem(protocol, **(remote_options or {}))
746
+ self.fss[protocol] = fs
747
+ if remote_protocol is None:
748
+ # get single protocol from references
749
+ # TODO: warning here, since this can be very expensive?
750
+ for ref in self.references.values():
751
+ if callable(ref):
752
+ ref = ref()
753
+ if isinstance(ref, list) and ref[0]:
754
+ protocol, _ = fsspec.core.split_protocol(ref[0])
755
+ if protocol not in self.fss:
756
+ fs = filesystem(protocol, **(remote_options or {}))
757
+ self.fss[protocol] = fs
758
+ # only use first remote URL
759
+ break
760
+
761
+ if remote_protocol and remote_protocol not in self.fss:
762
+ fs = filesystem(remote_protocol, **(remote_options or {}))
763
+ self.fss[remote_protocol] = fs
764
+
765
+ self.fss[None] = fs or filesystem("file") # default one
766
+ # Wrap any non-async filesystems to ensure async methods are available below
767
+ for k, f in self.fss.items():
768
+ if not f.async_impl:
769
+ self.fss[k] = AsyncFileSystemWrapper(f)
770
+ elif self.asynchronous ^ f.asynchronous:
771
+ raise ValueError(
772
+ "Reference-FS's target filesystem must have same value"
773
+ "of asynchronous"
774
+ )
775
+
776
+ def _cat_common(self, path, start=None, end=None):
777
+ path = self._strip_protocol(path)
778
+ logger.debug(f"cat: {path}")
779
+ try:
780
+ part = self.references[path]
781
+ except KeyError as exc:
782
+ raise FileNotFoundError(path) from exc
783
+ if isinstance(part, str):
784
+ part = part.encode()
785
+ if hasattr(part, "to_bytes"):
786
+ part = part.to_bytes()
787
+ if isinstance(part, bytes):
788
+ logger.debug(f"Reference: {path}, type bytes")
789
+ if part.startswith(b"base64:"):
790
+ part = base64.b64decode(part[7:])
791
+ return part, None, None
792
+
793
+ if len(part) == 1:
794
+ logger.debug(f"Reference: {path}, whole file => {part}")
795
+ url = part[0]
796
+ start1, end1 = start, end
797
+ else:
798
+ url, start0, size = part
799
+ logger.debug(f"Reference: {path} => {url}, offset {start0}, size {size}")
800
+ end0 = start0 + size
801
+
802
+ if start is not None:
803
+ if start >= 0:
804
+ start1 = start0 + start
805
+ else:
806
+ start1 = end0 + start
807
+ else:
808
+ start1 = start0
809
+ if end is not None:
810
+ if end >= 0:
811
+ end1 = start0 + end
812
+ else:
813
+ end1 = end0 + end
814
+ else:
815
+ end1 = end0
816
+ if url is None:
817
+ url = self.target
818
+ return url, start1, end1
819
+
820
+ async def _cat_file(self, path, start=None, end=None, **kwargs):
821
+ part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
822
+ if isinstance(part_or_url, bytes):
823
+ return part_or_url[start:end]
824
+ protocol, _ = split_protocol(part_or_url)
825
+ try:
826
+ return await self.fss[protocol]._cat_file(
827
+ part_or_url, start=start0, end=end0
828
+ )
829
+ except Exception as e:
830
+ raise ReferenceNotReachable(path, part_or_url) from e
831
+
832
+ def cat_file(self, path, start=None, end=None, **kwargs):
833
+ part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
834
+ if isinstance(part_or_url, bytes):
835
+ return part_or_url[start:end]
836
+ protocol, _ = split_protocol(part_or_url)
837
+ try:
838
+ return self.fss[protocol].cat_file(part_or_url, start=start0, end=end0)
839
+ except Exception as e:
840
+ raise ReferenceNotReachable(path, part_or_url) from e
841
+
842
+ def pipe_file(self, path, value, **_):
843
+ """Temporarily add binary data or reference as a file"""
844
+ self.references[path] = value
845
+
846
+ async def _get_file(self, rpath, lpath, **kwargs):
847
+ if self.isdir(rpath):
848
+ return os.makedirs(lpath, exist_ok=True)
849
+ data = await self._cat_file(rpath)
850
+ with open(lpath, "wb") as f:
851
+ f.write(data)
852
+
853
+ def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, **kwargs):
854
+ if self.isdir(rpath):
855
+ return os.makedirs(lpath, exist_ok=True)
856
+ data = self.cat_file(rpath, **kwargs)
857
+ callback.set_size(len(data))
858
+ if isfilelike(lpath):
859
+ lpath.write(data)
860
+ else:
861
+ with open(lpath, "wb") as f:
862
+ f.write(data)
863
+ callback.absolute_update(len(data))
864
+
865
+ def get(self, rpath, lpath, recursive=False, **kwargs):
866
+ if recursive:
867
+ # trigger directory build
868
+ self.ls("")
869
+ rpath = self.expand_path(rpath, recursive=recursive)
870
+ fs = fsspec.filesystem("file", auto_mkdir=True)
871
+ targets = other_paths(rpath, lpath)
872
+ if recursive:
873
+ data = self.cat([r for r in rpath if not self.isdir(r)])
874
+ else:
875
+ data = self.cat(rpath)
876
+ for remote, local in zip(rpath, targets):
877
+ if remote in data:
878
+ fs.pipe_file(local, data[remote])
879
+
880
    def cat(self, path, recursive=False, on_error="raise", **kwargs):
        """Fetch (potentially multiple) paths' contents, grouped by protocol.

        Byte ranges targeting the same underlying URL are merged (subject to
        ``max_gap``/``max_block``) and fetched in consolidated requests, then
        sliced back out per key. ``on_error`` may be "raise", "omit", or
        anything else to return the exception in place of data.

        Recursive/glob listings are not supported here.
        """
        if isinstance(path, str) and recursive:
            raise NotImplementedError
        if isinstance(path, list) and (recursive or any("*" in p for p in path)):
            raise NotImplementedError
        # TODO: if references is lazy, pre-fetch all paths in batch before access
        proto_dict = _protocol_groups(path, self.references)
        out = {}
        for proto, paths in proto_dict.items():
            fs = self.fss[proto]
            urls, starts, ends, valid_paths = [], [], [], []
            for p in paths:
                # find references or label not-found. Early exit if any not
                # found and on_error is "raise"
                try:
                    u, s, e = self._cat_common(p)
                    if not isinstance(u, (bytes, str)):
                        # nan/None from parquet
                        continue
                except FileNotFoundError as err:
                    if on_error == "raise":
                        raise
                    if on_error != "omit":
                        out[p] = err
                else:
                    urls.append(u)
                    starts.append(s)
                    ends.append(e)
                    valid_paths.append(p)

            # process references into form for merging
            urls2 = []
            starts2 = []
            ends2 = []
            paths2 = []
            whole_files = set()
            for u, s, e, p in zip(urls, starts, ends, valid_paths):
                if isinstance(u, bytes):
                    # data
                    out[p] = u
                elif s is None:
                    # whole file - limits are None, None, but no further
                    # entries take for this file
                    whole_files.add(u)
                    urls2.append(u)
                    starts2.append(s)
                    ends2.append(e)
                    paths2.append(p)
            for u, s, e, p in zip(urls, starts, ends, valid_paths):
                # second run to account for files that are to be loaded whole
                if s is not None and u not in whole_files:
                    urls2.append(u)
                    starts2.append(s)
                    ends2.append(e)
                    paths2.append(p)

            # merge and fetch consolidated ranges
            new_paths, new_starts, new_ends = merge_offset_ranges(
                list(urls2),
                list(starts2),
                list(ends2),
                sort=True,
                max_gap=self.max_gap,
                max_block=self.max_block,
            )
            bytes_out = fs.cat_ranges(new_paths, new_starts, new_ends)

            # unbundle from merged bytes - simple approach
            for u, s, e, p in zip(urls, starts, ends, valid_paths):
                if p in out:
                    continue  # was bytes, already handled
                for np, ns, ne, b in zip(new_paths, new_starts, new_ends, bytes_out):
                    if np == u and (ns is None or ne is None):
                        # whole-file fetch covers this key
                        if isinstance(b, Exception):
                            out[p] = b
                        else:
                            out[p] = b[s:e]
                    elif np == u and s >= ns and e <= ne:
                        # key's range sits inside this merged chunk
                        if isinstance(b, Exception):
                            out[p] = b
                        else:
                            out[p] = b[s - ns : (e - ne) or None]

        for k, v in out.copy().items():
            # these were valid references, but fetch failed, so transform exc
            if isinstance(v, Exception) and k in self.references:
                ex = out[k]
                new_ex = ReferenceNotReachable(k, self.references[k])
                new_ex.__cause__ = ex
                if on_error == "raise":
                    raise new_ex
                elif on_error != "omit":
                    out[k] = new_ex

        if len(out) == 1 and isinstance(path, str) and "*" not in path:
            # single plain path in -> bare bytes out, matching fsspec convention
            return _first(out)
        return out
977
+
978
+ def _process_references(self, references, template_overrides=None):
979
+ vers = references.get("version", None)
980
+ if vers is None:
981
+ self._process_references0(references)
982
+ elif vers == 1:
983
+ self._process_references1(references, template_overrides=template_overrides)
984
+ else:
985
+ raise ValueError(f"Unknown reference spec version: {vers}")
986
+ # TODO: we make dircache by iterating over all entries, but for Spec >= 1,
987
+ # can replace with programmatic. Is it even needed for mapper interface?
988
+
989
+ def _process_references0(self, references):
990
+ """Make reference dict for Spec Version 0"""
991
+ if isinstance(references, dict):
992
+ # do not do this for lazy/parquet backend, which will not make dicts,
993
+ # but must remain writable in the original object
994
+ references = {
995
+ key: json.dumps(val) if isinstance(val, dict) else val
996
+ for key, val in references.items()
997
+ }
998
+ self.references = references
999
+
1000
+ def _process_references1(self, references, template_overrides=None):
1001
+ if not self.simple_templates or self.templates:
1002
+ import jinja2
1003
+ self.references = {}
1004
+ self._process_templates(references.get("templates", {}))
1005
+
1006
+ @lru_cache(1000)
1007
+ def _render_jinja(u):
1008
+ return jinja2.Template(u).render(**self.templates)
1009
+
1010
+ for k, v in references.get("refs", {}).items():
1011
+ if isinstance(v, str):
1012
+ if v.startswith("base64:"):
1013
+ self.references[k] = base64.b64decode(v[7:])
1014
+ self.references[k] = v
1015
+ elif isinstance(v, dict):
1016
+ self.references[k] = json.dumps(v)
1017
+ elif self.templates:
1018
+ u = v[0]
1019
+ if "{{" in u:
1020
+ if self.simple_templates:
1021
+ u = (
1022
+ u.replace("{{", "{")
1023
+ .replace("}}", "}")
1024
+ .format(**self.templates)
1025
+ )
1026
+ else:
1027
+ u = _render_jinja(u)
1028
+ self.references[k] = [u] if len(v) == 1 else [u, v[1], v[2]]
1029
+ else:
1030
+ self.references[k] = v
1031
+ self.references.update(self._process_gen(references.get("gen", [])))
1032
+
1033
+ def _process_templates(self, tmp):
1034
+ self.templates = {}
1035
+ if self.template_overrides is not None:
1036
+ tmp.update(self.template_overrides)
1037
+ for k, v in tmp.items():
1038
+ if "{{" in v:
1039
+ import jinja2
1040
+
1041
+ self.templates[k] = lambda temp=v, **kwargs: jinja2.Template(
1042
+ temp
1043
+ ).render(**kwargs)
1044
+ else:
1045
+ self.templates[k] = v
1046
+
1047
    def _process_gen(self, gens):
        """Expand Spec-1 "gen" entries into concrete references.

        Each generator declares dimensions (lists or start/stop/step ranges);
        the cartesian product of dimension values is rendered through the
        jinja ``key``/``url`` (and optional ``offset``/``length``) templates.

        Returns a dict of key -> [url] or [url, offset, length].
        """
        out = {}
        for gen in gens:
            dimension = {
                k: (
                    v
                    if isinstance(v, list)
                    else range(v.get("start", 0), v["stop"], v.get("step", 1))
                )
                for k, v in gen["dimensions"].items()
            }
            # one dict of dimension-name -> value per point of the product
            products = (
                dict(zip(dimension.keys(), values))
                for values in itertools.product(*dimension.values())
            )
            for pr in products:
                import jinja2

                key = jinja2.Template(gen["key"]).render(**pr, **self.templates)
                url = jinja2.Template(gen["url"]).render(**pr, **self.templates)
                if ("offset" in gen) and ("length" in gen):
                    offset = int(
                        jinja2.Template(gen["offset"]).render(**pr, **self.templates)
                    )
                    length = int(
                        jinja2.Template(gen["length"]).render(**pr, **self.templates)
                    )
                    out[key] = [url, offset, length]
                elif ("offset" in gen) ^ ("length" in gen):
                    raise ValueError(
                        "Both 'offset' and 'length' are required for a "
                        "reference generator entry if either is provided."
                    )
                else:
                    # neither given: reference the whole file
                    out[key] = [url]
        return out
1083
+
1084
    def _dircache_from_items(self):
        """Build the directory listing cache from the flat reference keys.

        Walks every key, infers its size (inline data -> len; 1-element
        reference -> unknown; 3-element -> explicit size), and registers each
        previously-unseen parent directory on the way up to the root.
        """
        self.dircache = {"": []}
        it = self.references.items()
        for path, part in it:
            if isinstance(part, (bytes, str)) or hasattr(part, "to_bytes"):
                # inline data: size is its length
                size = len(part)
            elif len(part) == 1:
                # whole-file reference: size unknown until queried
                size = None
            else:
                _, _, size = part
            par = path.rsplit("/", 1)[0] if "/" in path else ""
            par0 = par
            subdirs = [par0]
            while par0 and par0 not in self.dircache:
                # collect parent directories
                par0 = self._parent(par0)
                subdirs.append(par0)

            subdirs.reverse()
            for parent, child in zip(subdirs, subdirs[1:]):
                # register newly discovered directories
                assert child not in self.dircache
                assert parent in self.dircache
                self.dircache[parent].append(
                    {"name": child, "type": "directory", "size": 0}
                )
                self.dircache[child] = []

            self.dircache[par].append({"name": path, "type": "file", "size": size})
1113
+
1114
    def _open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
        """Open one reference key as a file-like object.

        Inline data becomes an in-memory ``BytesIO``; whole-file references
        open the target directly on the backing filesystem; ranged references
        are wrapped in ``ReferenceFile``.
        """
        part_or_url, start0, end0 = self._cat_common(path)
        # This logic is kept outside `ReferenceFile` to avoid unnecessary redirection.
        # That does mean `_cat_common` gets called twice if it eventually reaches `ReferenceFile`.
        if isinstance(part_or_url, bytes):
            return io.BytesIO(part_or_url[start0:end0])

        protocol, _ = split_protocol(part_or_url)
        if start0 is None and end0 is None:
            # whole-file reference: delegate straight to the backing FS
            return self.fss[protocol]._open(
                part_or_url,
                mode,
                block_size=block_size,
                cache_options=cache_options,
                **kwargs,
            )

        return ReferenceFile(
            self,
            path,
            mode,
            block_size=block_size,
            cache_options=cache_options,
            **kwargs,
        )
1139
+
1140
    def ls(self, path, detail=True, **kwargs):
        """List a directory of reference keys.

        Lazy (parquet-backed) reference sets answer directly; otherwise the
        in-memory dircache is built on first use and consulted.
        """
        logger.debug("list %s", path)
        path = self._strip_protocol(path)
        if isinstance(self.references, LazyReferenceMapper):
            try:
                return self.references.ls(path, detail)
            except KeyError:
                pass
            raise FileNotFoundError(f"'{path}' is not a known key")
        if not self.dircache:
            self._dircache_from_items()
        out = self._ls_from_cache(path)
        if out is None:
            raise FileNotFoundError(path)
        if detail:
            return out
        return [o["name"] for o in out]
1157
+
1158
+ def exists(self, path, **kwargs): # overwrite auto-sync version
1159
+ return self.isdir(path) or self.isfile(path)
1160
+
1161
+ def isdir(self, path): # overwrite auto-sync version
1162
+ if self.dircache:
1163
+ return path in self.dircache
1164
+ elif isinstance(self.references, LazyReferenceMapper):
1165
+ return path in self.references.listdir()
1166
+ else:
1167
+ # this may be faster than building dircache for single calls, but
1168
+ # by looping will be slow for many calls; could cache it?
1169
+ return any(_.startswith(f"{path}/") for _ in self.references)
1170
+
1171
+ def isfile(self, path): # overwrite auto-sync version
1172
+ return path in self.references
1173
+
1174
    async def _ls(self, path, detail=True, **kwargs):  # calls fast sync code
        """Async wrapper: listing is in-memory, so just call the sync ``ls``."""
        return self.ls(path, detail, **kwargs)
1176
+
1177
    def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
        """List all file keys under ``path``.

        Without ``withdirs`` this is a simple sorted prefix scan over the
        reference keys; with ``withdirs`` (or for detail entries) the generic
        implementation / dircache is used.
        """
        if withdirs:
            return super().find(
                path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs
            )
        if path:
            path = self._strip_protocol(path)
            r = sorted(k for k in self.references if k.startswith(path))
        else:
            r = sorted(self.references)
        if detail:
            if not self.dircache:
                self._dircache_from_items()
            return {k: self._ls_from_cache(k)[0] for k in r}
        else:
            return r
1193
+
1194
    def info(self, path, **kwargs):
        """Details for one key: file (with size when known) or directory.

        For whole-file references (single-element list), the size is resolved
        by asking the backing filesystem for the target's size.
        """
        out = self.references.get(path)
        if out is not None:
            if isinstance(out, (str, bytes)):
                # decode base64 here
                return {"name": path, "type": "file", "size": len(out)}
            elif len(out) > 1:
                # [url, offset, size] reference: size is explicit
                return {"name": path, "type": "file", "size": out[2]}
            else:
                # whole-file reference: size unknown until queried below
                out0 = [{"name": path, "type": "file", "size": None}]
        else:
            # not a file key: try it as a directory listing
            out = self.ls(path, True)
            out0 = [o for o in out if o["name"] == path]
            if not out0:
                return {"name": path, "type": "directory", "size": 0}
        if out0[0]["size"] is None:
            # if this is a whole remote file, update size using remote FS
            prot, _ = split_protocol(self.references[path][0])
            out0[0]["size"] = self.fss[prot].size(self.references[path][0])
        return out0[0]
1214
+
1215
    async def _info(self, path, **kwargs):  # calls fast sync code
        """Async wrapper over the in-memory sync ``info``."""
        return self.info(path)
1217
+
1218
    async def _rm_file(self, path, **kwargs):
        """Drop one key from the reference set and invalidate the dircache."""
        self.references.pop(
            path, None
        )  # ignores FileNotFound, just as well for directories
        self.dircache.clear()  # this is a bit heavy handed
1223
+
1224
    async def _pipe_file(self, path, data, mode="overwrite", **kwargs):
        """Store ``data`` under ``path`` in the reference set.

        With ``mode="create"``, refuse to overwrite an existing key.
        """
        if mode == "create" and self.exists(path):
            raise FileExistsError
        # can be str or bytes
        self.references[path] = data
        self.dircache.clear()  # this is a bit heavy handed
1230
+
1231
    async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs):
        """Read a local file fully and store its bytes under ``rpath``.

        With ``mode="create"``, refuse to overwrite an existing key.
        """
        # puts binary
        if mode == "create" and self.exists(rpath):
            raise FileExistsError
        with open(lpath, "rb") as f:
            self.references[rpath] = f.read()
        self.dircache.clear()  # this is a bit heavy handed
1238
+
1239
    def save_json(self, url, **storage_options):
        """Write modified references into new location.

        Bytes values are stored as ASCII when possible, otherwise as
        ``base64:``-prefixed text; output is a version-1 JSON spec.
        """
        out = {}
        for k, v in self.references.items():
            if isinstance(v, bytes):
                try:
                    out[k] = v.decode("ascii")
                except UnicodeDecodeError:
                    # non-ASCII payload: round-trippable base64 form
                    out[k] = (b"base64:" + base64.b64encode(v)).decode()
            else:
                out[k] = v
        with fsspec.open(url, "wb", **storage_options) as f:
            f.write(json.dumps({"version": 1, "refs": out}).encode())
1252
+
1253
+
1254
class ReferenceFile(AbstractBufferedFile):
    """Buffered file view onto a ranged reference.

    Reads are translated into seeks/reads on a lazily-opened file of the
    underlying target URL, offset by the reference's (start, end) window.
    """

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        size=None,
        **kwargs,
    ):
        super().__init__(
            fs,
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            size=size,
            cache_type=cache_type,
            cache_options=cache_options,
            **kwargs,
        )
        # resolve the reference to its target URL and byte window
        part_or_url, self.start, self.end = self.fs._cat_common(self.path)
        protocol, _ = split_protocol(part_or_url)
        self.src_fs = self.fs.fss[protocol]
        self.src_path = part_or_url
        self._f = None

    @property
    def f(self):
        # lazily (re)open the underlying file; no caching on the inner file,
        # this wrapper's own cache handles read-ahead
        if self._f is None or self._f.closed:
            self._f = self.src_fs._open(
                self.src_path,
                mode=self.mode,
                block_size=self.blocksize,
                autocommit=self.autocommit,
                cache_type="none",
                **self.kwargs,
            )
        return self._f

    def close(self):
        if self._f is not None:
            self._f.close()
        return super().close()

    def _fetch_range(self, start, end):
        # shift the requested window by the reference offset and clamp to
        # the reference's end
        start = start + self.start
        end = min(end + self.start, self.end)
        self.f.seek(start)
        return self.f.read(end - start)
.venv/lib/python3.11/site-packages/fsspec/implementations/sftp.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import logging
3
+ import os
4
+ import types
5
+ import uuid
6
+ from stat import S_ISDIR, S_ISLNK
7
+
8
+ import paramiko
9
+
10
+ from .. import AbstractFileSystem
11
+ from ..utils import infer_storage_options
12
+
13
+ logger = logging.getLogger("fsspec.sftp")
14
+
15
+
16
class SFTPFileSystem(AbstractFileSystem):
    """Files over SFTP/SSH

    Peer-to-peer filesystem over SSH using paramiko.

    Note: if using this with the ``open`` or ``open_files``, with full URLs,
    there is no way to tell if a path is relative, so all paths are assumed
    to be absolute.
    """

    protocol = "sftp", "ssh"

    def __init__(self, host, **ssh_kwargs):
        """
        Parameters
        ----------
        host: str
            Hostname or IP as a string
        temppath: str
            Location on the server to put files, when within a transaction
        ssh_kwargs: dict
            Parameters passed on to connection. See details in
            https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
            May include port, username, password...
        """
        if self._cached:
            # instance reused from the caching layer; already connected
            return
        super().__init__(**ssh_kwargs)
        self.temppath = ssh_kwargs.pop("temppath", "/tmp")  # remote temp directory
        self.host = host
        self.ssh_kwargs = ssh_kwargs
        self._connect()

    def _connect(self):
        """Open the SSH session and an SFTP channel on it."""
        logger.debug("Connecting to SFTP server %s", self.host)
        self.client = paramiko.SSHClient()
        self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.client.connect(self.host, **self.ssh_kwargs)
        self.ftp = self.client.open_sftp()

    @classmethod
    def _strip_protocol(cls, path):
        return infer_storage_options(path)["path"]

    @staticmethod
    def _get_kwargs_from_urls(urlpath):
        # URL components other than the path become constructor kwargs
        out = infer_storage_options(urlpath)
        out.pop("path", None)
        out.pop("protocol", None)
        return out

    def mkdir(self, path, create_parents=True, mode=511):
        """Create a directory; with ``create_parents`` missing ancestors are
        made too.

        NOTE(review): when ``create_parents`` is true, ``makedirs`` is called
        without forwarding ``mode`` — confirm whether that is intended.
        """
        logger.debug("Creating folder %s", path)
        if self.exists(path):
            raise FileExistsError(f"File exists: {path}")

        if create_parents:
            self.makedirs(path)
        else:
            self.ftp.mkdir(path, mode)

    def makedirs(self, path, exist_ok=False, mode=511):
        """Create a directory and any missing parents."""
        if self.exists(path) and not exist_ok:
            raise FileExistsError(f"File exists: {path}")

        parts = path.split("/")
        new_path = "/" if path[:1] == "/" else ""

        for part in parts:
            if part:
                # NOTE(review): for absolute paths this yields a leading
                # double slash ("//a"); most servers treat it as "/a" —
                # confirm against target servers.
                new_path = f"{new_path}/{part}" if new_path else part
                if not self.exists(new_path):
                    self.ftp.mkdir(new_path, mode)

    def rmdir(self, path):
        """Remove an (empty) remote directory."""
        logger.debug("Removing folder %s", path)
        self.ftp.rmdir(path)

    def info(self, path):
        """Stat one remote path; returns an fsspec-style info dict."""
        stat = self._decode_stat(self.ftp.stat(path))
        stat["name"] = path
        return stat

    @staticmethod
    def _decode_stat(stat, parent_path=None):
        """Convert a paramiko stat result into an fsspec info dict.

        ``parent_path`` (from ``ls``) is joined with the entry's filename to
        build the full name; otherwise the caller fills in ``name``.
        """
        if S_ISDIR(stat.st_mode):
            t = "directory"
        elif S_ISLNK(stat.st_mode):
            t = "link"
        else:
            t = "file"
        out = {
            "name": "",
            "size": stat.st_size,
            "type": t,
            "uid": stat.st_uid,
            "gid": stat.st_gid,
            "time": datetime.datetime.fromtimestamp(
                stat.st_atime, tz=datetime.timezone.utc
            ),
            "mtime": datetime.datetime.fromtimestamp(
                stat.st_mtime, tz=datetime.timezone.utc
            ),
        }
        if parent_path:
            out["name"] = "/".join([parent_path.rstrip("/"), stat.filename])
        return out

    def ls(self, path, detail=False):
        """List a remote directory; sorted names, or info dicts with
        ``detail``."""
        logger.debug("Listing folder %s", path)
        stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
        if detail:
            return stats
        else:
            paths = [stat["name"] for stat in stats]
            return sorted(paths)

    def put(self, lpath, rpath, callback=None, **kwargs):
        """Upload one local file to ``rpath`` (``callback`` is accepted but
        not wired to paramiko's progress hook)."""
        logger.debug("Put file %s into %s", lpath, rpath)
        self.ftp.put(lpath, rpath)

    def get_file(self, rpath, lpath, **kwargs):
        """Download one remote file, or create the local directory for a
        remote directory entry."""
        if self.isdir(rpath):
            os.makedirs(lpath, exist_ok=True)
        else:
            self.ftp.get(self._strip_protocol(rpath), lpath)

    def _open(self, path, mode="rb", block_size=None, **kwargs):
        """
        block_size: int or None
            If 0, no buffering, if 1, line buffering, if >1, buffer that many
            bytes, if None use default from paramiko.
        """
        logger.debug("Opening file %s", path)
        if kwargs.get("autocommit", True) is False:
            # writes to temporary file, move on commit
            path2 = "/".join([self.temppath, str(uuid.uuid4())])
            f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
            f.temppath = path2
            f.targetpath = path
            f.fs = self
            f.commit = types.MethodType(commit_a_file, f)
            f.discard = types.MethodType(discard_a_file, f)
        else:
            f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
        return f

    def _rm(self, path):
        """Remove one file or (empty) directory."""
        if self.isdir(path):
            self.ftp.rmdir(path)
        else:
            self.ftp.remove(path)

    def mv(self, old, new):
        """Rename a remote path (POSIX rename: atomically replaces target)."""
        logger.debug("Renaming %s into %s", old, new)
        self.ftp.posix_rename(old, new)
173
+
174
+
175
def commit_a_file(self):
    """Finalise a deferred write: move the temp file to its target path."""
    source, destination = self.temppath, self.targetpath
    self.fs.mv(source, destination)
177
+
178
+
179
+ def discard_a_file(self):
180
+ self.fs._rm(self.temppath)
.venv/lib/python3.11/site-packages/fsspec/implementations/tar.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import tarfile
3
+
4
+ import fsspec
5
+ from fsspec.archive import AbstractArchiveFileSystem
6
+ from fsspec.compression import compr
7
+ from fsspec.utils import infer_compression
8
+
9
+ typemap = {b"0": "file", b"5": "directory"}
10
+
11
+ logger = logging.getLogger("tar")
12
+
13
+
14
class TarFileSystem(AbstractArchiveFileSystem):
    """Compressed Tar archives as a file-system (read-only)

    Supports the following formats:
    tar.gz, tar.bz2, tar.xz
    """

    root_marker = ""
    protocol = "tar"
    cachable = False

    def __init__(
        self,
        fo="",
        index_store=None,
        target_options=None,
        target_protocol=None,
        compression=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        fo: str or file-like
            Target archive: a URL (opened via fsspec with
            ``target_protocol``/``target_options``) or an already-open
            file-like object.
        index_store:
            Reserved for persisting the member index (currently unused).
        compression: str or None
            Explicit compression; when None it is inferred from the file
            name if one can be determined.
        """
        super().__init__(**kwargs)
        target_options = target_options or {}

        if isinstance(fo, str):
            self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
            fo = self.of.open()  # keep the reference

        # Try to infer compression.
        if compression is None:
            name = None

            # Try different ways to get hold of the filename. `fo` might either
            # be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
            # `fsspec.AbstractFileSystem` instance.
            try:
                # Amended io.BufferedReader or similar.
                # This uses a "protocol extension" where original filenames are
                # propagated to archive-like filesystems in order to let them
                # infer the right compression appropriately.
                if hasattr(fo, "original"):
                    name = fo.original

                # fsspec.LocalFileOpener
                elif hasattr(fo, "path"):
                    name = fo.path

                # io.BufferedReader
                elif hasattr(fo, "name"):
                    name = fo.name

                # fsspec.AbstractFileSystem
                elif hasattr(fo, "info"):
                    name = fo.info()["name"]

            except Exception as ex:
                logger.warning(
                    f"Unable to determine file name, not inferring compression: {ex}"
                )

            if name is not None:
                compression = infer_compression(name)
                logger.info(f"Inferred compression {compression} from file name {name}")

        if compression is not None:
            # TODO: tarfile already implements compression with modes like "'r:gz'",
            # but then would seek to offset in the file work?
            fo = compr[compression](fo)

        self._fo_ref = fo
        self.fo = fo  # the whole instance is a context
        self.tar = tarfile.TarFile(fileobj=self.fo)
        self.dir_cache = None

        self.index_store = index_store
        self.index = None
        self._index()

    def _index(self):
        """Build name -> (info, data offset) map over all archive members."""
        # TODO: load and set saved index, if exists
        out = {}
        for ti in self.tar:
            info = ti.get_info()
            info["type"] = typemap.get(info["type"], "file")
            name = ti.get_info()["name"].rstrip("/")
            out[name] = (info, ti.offset_data)

        self.index = out
        # TODO: save index to self.index_store here, if set

    def _get_dirs(self):
        """Populate ``dir_cache`` with directory entries plus every member."""
        if self.dir_cache is not None:
            return

        # This enables ls to get directories as children as well as files
        self.dir_cache = {
            dirname: {"name": dirname, "size": 0, "type": "directory"}
            for dirname in self._all_dirnames(self.tar.getnames())
        }
        for member in self.tar.getmembers():
            info = member.get_info()
            info["name"] = info["name"].rstrip("/")
            info["type"] = typemap.get(info["type"], "file")
            self.dir_cache[info["name"]] = info

    def _open(self, path, mode="rb", **kwargs):
        """Open one regular member for reading via ``tarfile.extractfile``."""
        if mode != "rb":
            raise ValueError("Read-only filesystem implementation")
        details, offset = self.index[path]
        if details["type"] != "file":
            raise ValueError("Can only handle regular files")
        return self.tar.extractfile(path)
.venv/lib/python3.11/site-packages/fsspec/implementations/webhdfs.py ADDED
@@ -0,0 +1,485 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
2
+
3
+ import logging
4
+ import os
5
+ import secrets
6
+ import shutil
7
+ import tempfile
8
+ import uuid
9
+ from contextlib import suppress
10
+ from urllib.parse import quote
11
+
12
+ import requests
13
+
14
+ from ..spec import AbstractBufferedFile, AbstractFileSystem
15
+ from ..utils import infer_storage_options, tokenize
16
+
17
+ logger = logging.getLogger("webhdfs")
18
+
19
+
20
+ class WebHDFS(AbstractFileSystem):
21
+ """
22
+ Interface to HDFS over HTTP using the WebHDFS API. Supports also HttpFS gateways.
23
+
24
+ Four auth mechanisms are supported:
25
+
26
+ insecure: no auth is done, and the user is assumed to be whoever they
27
+ say they are (parameter ``user``), or a predefined value such as
28
+ "dr.who" if not given
29
+ spnego: when kerberos authentication is enabled, auth is negotiated by
30
+ requests_kerberos https://github.com/requests/requests-kerberos .
31
+ This establishes a session based on existing kinit login and/or
32
+ specified principal/password; parameters are passed with ``kerb_kwargs``
33
+ token: uses an existing Hadoop delegation token from another secured
34
+ service. Indeed, this client can also generate such tokens when
35
+ not insecure. Note that tokens expire, but can be renewed (by a
36
+ previously specified user) and may allow for proxying.
37
+ basic-auth: used when both parameter ``user`` and parameter ``password``
38
+ are provided.
39
+
40
+ """
41
+
42
+ tempdir = str(tempfile.gettempdir())
43
+ protocol = "webhdfs", "webHDFS"
44
+
45
    def __init__(
        self,
        host,
        port=50070,
        kerberos=False,
        token=None,
        user=None,
        password=None,
        proxy_to=None,
        kerb_kwargs=None,
        data_proxy=None,
        use_https=False,
        session_cert=None,
        session_verify=True,
        **kwargs,
    ):
        """
        Parameters
        ----------
        host: str
            Name-node address
        port: int
            Port for webHDFS
        kerberos: bool
            Whether to authenticate with kerberos for this connection
        token: str or None
            If given, use this token on every call to authenticate. A user
            and user-proxy may be encoded in the token and should not be also
            given
        user: str or None
            If given, assert the user name to connect with
        password: str or None
            If given, assert the password to use for basic auth. If password
            is provided, user must be provided also
        proxy_to: str or None
            If given, the user has the authority to proxy, and this value is
            the user in who's name actions are taken
        kerb_kwargs: dict
            Any extra arguments for HTTPKerberosAuth, see
            `<https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py>`_
        data_proxy: dict, callable or None
            If given, map data-node addresses. This can be necessary if the
            HDFS cluster is behind a proxy, running on Docker or otherwise has
            a mismatch between the host-names given by the name-node and the
            address by which to refer to them from the client. If a dict,
            maps host names ``host->data_proxy[host]``; if a callable, full
            URLs are passed, and function must conform to
            ``url->data_proxy(url)``.
        use_https: bool
            Whether to connect to the Name-node using HTTPS instead of HTTP
        session_cert: str or Tuple[str, str] or None
            Path to a certificate file, or tuple of (cert, key) files to use
            for the requests.Session
        session_verify: str, bool or None
            Path to a certificate file to use for verifying the requests.Session.
        kwargs
        """
        if self._cached:
            # instance reused from the caching layer; already connected
            return
        super().__init__(**kwargs)
        self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"
        self.kerb = kerberos
        self.kerb_kwargs = kerb_kwargs or {}
        self.pars = {}  # query parameters sent with every request
        self.proxy = data_proxy or {}
        if token is not None:
            if user is not None or proxy_to is not None:
                raise ValueError(
                    "If passing a delegation token, must not set "
                    "user or proxy_to, as these are encoded in the"
                    " token"
                )
            self.pars["delegation"] = token
        self.user = user
        self.password = password

        if password is not None:
            if user is None:
                raise ValueError(
                    "If passing a password, the user must also be"
                    "set in order to set up the basic-auth"
                )
        else:
            if user is not None:
                self.pars["user.name"] = user

        if proxy_to is not None:
            self.pars["doas"] = proxy_to
        if kerberos and user is not None:
            raise ValueError(
                "If using Kerberos auth, do not specify the "
                "user, this is handled by kinit."
            )

        self.session_cert = session_cert
        self.session_verify = session_verify

        self._connect()

        self._fsid = f"webhdfs_{tokenize(host, port)}"
145
+
146
    @property
    def fsid(self):
        """Stable identifier for this filesystem instance (host+port hash)."""
        return self._fsid
149
+
150
    def _connect(self):
        """Create the requests session and attach TLS and auth settings.

        Basic auth (user+password) takes precedence over kerberos when both
        are configured, since it is assigned last.
        """
        self.session = requests.Session()

        if self.session_cert:
            self.session.cert = self.session_cert

        self.session.verify = self.session_verify

        if self.kerb:
            from requests_kerberos import HTTPKerberosAuth

            self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)

        if self.user is not None and self.password is not None:
            from requests.auth import HTTPBasicAuth

            self.session.auth = HTTPBasicAuth(self.user, self.password)
167
+
168
    def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
        """Issue one WebHDFS REST request.

        ``op`` is the WebHDFS operation name (upper-cased into the query);
        error responses carrying a RemoteException are mapped onto Python
        exception types, anything else falls through to
        ``raise_for_status``.
        """
        path = self._strip_protocol(path) if path is not None else ""
        url = self._apply_proxy(self.url + quote(path, safe="/="))
        args = kwargs.copy()
        args.update(self.pars)
        args["op"] = op.upper()
        logger.debug("sending %s with %s", url, method)
        out = self.session.request(
            method=method.upper(),
            url=url,
            params=args,
            data=data,
            allow_redirects=redirect,
        )
        if out.status_code in [400, 401, 403, 404, 500]:
            try:
                err = out.json()
                msg = err["RemoteException"]["message"]
                exp = err["RemoteException"]["exception"]
            except (ValueError, KeyError):
                # body was not a RemoteException payload; defer to
                # raise_for_status below
                pass
            else:
                if exp in ["IllegalArgumentException", "UnsupportedOperationException"]:
                    raise ValueError(msg)
                elif exp in ["SecurityException", "AccessControlException"]:
                    raise PermissionError(msg)
                elif exp in ["FileNotFoundException"]:
                    raise FileNotFoundError(msg)
                else:
                    raise RuntimeError(msg)
        out.raise_for_status()
        return out
200
+
201
    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        replication=None,
        permissions=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        path: str
            File location
        mode: str
            'rb', 'wb', etc.
        block_size: int
            Client buffer size for read-ahead or write buffer
        autocommit: bool
            If False, writes to temporary file that only gets put in final
            location upon commit
        replication: int
            Number of copies of file on the cluster, write mode only
        permissions: str or int
            posix permissions, write mode only
        kwargs

        Returns
        -------
        WebHDFile instance
        """
        block_size = block_size or self.blocksize
        return WebHDFile(
            self,
            path,
            mode=mode,
            block_size=block_size,
            tempdir=self.tempdir,
            autocommit=autocommit,
            replication=replication,
            permissions=permissions,
        )
245
+
246
+ @staticmethod
247
+ def _process_info(info):
248
+ info["type"] = info["type"].lower()
249
+ info["size"] = info["length"]
250
+ return info
251
+
252
    @classmethod
    def _strip_protocol(cls, path):
        # Reduce e.g. "webhdfs://host:port/file" to the bare "/file" path
        return infer_storage_options(path)["path"]
255
+
256
+ @staticmethod
257
+ def _get_kwargs_from_urls(urlpath):
258
+ out = infer_storage_options(urlpath)
259
+ out.pop("path", None)
260
+ out.pop("protocol", None)
261
+ if "username" in out:
262
+ out["user"] = out.pop("username")
263
+ return out
264
+
265
+ def info(self, path):
266
+ out = self._call("GETFILESTATUS", path=path)
267
+ info = out.json()["FileStatus"]
268
+ info["name"] = path
269
+ return self._process_info(info)
270
+
271
+ def ls(self, path, detail=False):
272
+ out = self._call("LISTSTATUS", path=path)
273
+ infos = out.json()["FileStatuses"]["FileStatus"]
274
+ for info in infos:
275
+ self._process_info(info)
276
+ info["name"] = path.rstrip("/") + "/" + info["pathSuffix"]
277
+ if detail:
278
+ return sorted(infos, key=lambda i: i["name"])
279
+ else:
280
+ return sorted(info["name"] for info in infos)
281
+
282
    def content_summary(self, path):
        """Total numbers of files, directories and bytes under path

        Returns the server's ``ContentSummary`` dict as-is.
        """
        out = self._call("GETCONTENTSUMMARY", path=path)
        return out.json()["ContentSummary"]
286
+
287
+ def ukey(self, path):
288
+ """Checksum info of file, giving method and result"""
289
+ out = self._call("GETFILECHECKSUM", path=path, redirect=False)
290
+ if "Location" in out.headers:
291
+ location = self._apply_proxy(out.headers["Location"])
292
+ out2 = self.session.get(location)
293
+ out2.raise_for_status()
294
+ return out2.json()["FileChecksum"]
295
+ else:
296
+ out.raise_for_status()
297
+ return out.json()["FileChecksum"]
298
+
299
    def home_directory(self):
        """Get user's home directory

        Returns the server-reported ``Path`` string.
        """
        out = self._call("GETHOMEDIRECTORY")
        return out.json()["Path"]
303
+
304
+ def get_delegation_token(self, renewer=None):
305
+ """Retrieve token which can give the same authority to other uses
306
+
307
+ Parameters
308
+ ----------
309
+ renewer: str or None
310
+ User who may use this token; if None, will be current user
311
+ """
312
+ if renewer:
313
+ out = self._call("GETDELEGATIONTOKEN", renewer=renewer)
314
+ else:
315
+ out = self._call("GETDELEGATIONTOKEN")
316
+ t = out.json()["Token"]
317
+ if t is None:
318
+ raise ValueError("No token available for this user/security context")
319
+ return t["urlString"]
320
+
321
    def renew_delegation_token(self, token):
        """Make token live longer. Returns new expiry time"""
        # Server responds with {"long": <new expiry, epoch ms>}
        out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
        return out.json()["long"]
325
+
326
    def cancel_delegation_token(self, token):
        """Stop the token from being useful"""
        self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
329
+
330
    def chmod(self, path, mod):
        """Set the permission at path

        Parameters
        ----------
        path: str
            location to set (file or directory)
        mod: str or int
            posix representation of permission, give as oct string, e.g, '777'
            or 0o777
        """
        self._call("SETPERMISSION", method="put", path=path, permission=mod)
342
+
343
+ def chown(self, path, owner=None, group=None):
344
+ """Change owning user and/or group"""
345
+ kwargs = {}
346
+ if owner is not None:
347
+ kwargs["owner"] = owner
348
+ if group is not None:
349
+ kwargs["group"] = group
350
+ self._call("SETOWNER", method="put", path=path, **kwargs)
351
+
352
    def set_replication(self, path, replication):
        """
        Set file replication factor

        Parameters
        ----------
        path: str
            File location (not for directories)
        replication: int
            Number of copies of file on the cluster. Should be smaller than
            number of data nodes; normally 3 on most systems.
        """
        self._call("SETREPLICATION", path=path, method="put", replication=replication)
365
+
366
    def mkdir(self, path, **kwargs):
        # MKDIRS also creates missing parents; extra kwargs are accepted for
        # fsspec API compatibility but are not forwarded to the server
        self._call("MKDIRS", method="put", path=path)
368
+
369
    def makedirs(self, path, exist_ok=False):
        # NOTE(review): exists() followed by mkdir() is not atomic — a
        # concurrent creator can win between the two calls
        if exist_ok is False and self.exists(path):
            raise FileExistsError(path)
        self.mkdir(path)
373
+
374
    def mv(self, path1, path2, **kwargs):
        # Server-side RENAME; also used by WebHDFile.commit / cp_file for
        # atomic moves into the final location
        self._call("RENAME", method="put", path=path1, destination=path2)
376
+
377
+ def rm(self, path, recursive=False, **kwargs):
378
+ self._call(
379
+ "DELETE",
380
+ method="delete",
381
+ path=path,
382
+ recursive="true" if recursive else "false",
383
+ )
384
+
385
    def rm_file(self, path, **kwargs):
        # Single-file deletion; delegates to rm (non-recursive)
        self.rm(path)
387
+
388
    def cp_file(self, lpath, rpath, **kwargs):
        """Copy a file within the filesystem via a temporary sibling path.

        The content is streamed into ``.tmp.<random>`` next to the target
        and renamed into place, so readers never see a partially-written
        destination file.
        """
        with self.open(lpath) as lstream:
            tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"])
            # Perform an atomic copy (stream to a temporary file and
            # move it to the actual destination).
            try:
                with self.open(tmp_fname, "wb") as rstream:
                    shutil.copyfileobj(lstream, rstream)
                self.mv(tmp_fname, rpath)
            except BaseException:
                # Best-effort cleanup of the partial temp file, then re-raise
                with suppress(FileNotFoundError):
                    self.rm(tmp_fname)
                raise
401
+
402
+ def _apply_proxy(self, location):
403
+ if self.proxy and callable(self.proxy):
404
+ location = self.proxy(location)
405
+ elif self.proxy:
406
+ # as a dict
407
+ for k, v in self.proxy.items():
408
+ location = location.replace(k, v, 1)
409
+ return location
410
+
411
+
412
class WebHDFile(AbstractBufferedFile):
    """A file living in HDFS over webHDFS"""

    def __init__(self, fs, path, **kwargs):
        # fs: the owning WebHDFS filesystem; path: target file path.
        # Relevant kwargs: permissions, replication, tempdir, autocommit.
        super().__init__(fs, path, **kwargs)
        kwargs = kwargs.copy()
        # Drop explicit None values so defaults apply below
        if kwargs.get("permissions", None) is None:
            kwargs.pop("permissions", None)
        if kwargs.get("replication", None) is None:
            kwargs.pop("replication", None)
        self.permissions = kwargs.pop("permissions", 511)  # 511 == 0o777
        tempdir = kwargs.pop("tempdir")
        if kwargs.pop("autocommit", False) is False:
            # Deferred-commit mode: write to a temporary path and only move
            # it into place when commit() is called
            self.target = self.path
            self.path = os.path.join(tempdir, str(uuid.uuid4()))

    def _upload_chunk(self, final=False):
        """Write one part of a multi-block file upload

        Parameters
        ==========
        final: bool
            This is the last block, so should complete file, if
            self.autocommit is True.
        """
        # self.location is the data-node append URL set by _initiate_upload
        out = self.fs.session.post(
            self.location,
            data=self.buffer.getvalue(),
            headers={"content-type": "application/octet-stream"},
        )
        out.raise_for_status()
        return True

    def _initiate_upload(self):
        """Create remote file/upload"""
        kwargs = self.kwargs.copy()
        if "a" in self.mode:
            op, method = "APPEND", "POST"
        else:
            op, method = "CREATE", "PUT"
            kwargs["overwrite"] = "true"
        # With redirect=False, the name node replies with the data-node URL
        # in the Location header rather than forwarding us automatically
        out = self.fs._call(op, method, self.path, redirect=False, **kwargs)
        location = self.fs._apply_proxy(out.headers["Location"])
        if "w" in self.mode:
            # create empty file to append to
            out2 = self.fs.session.put(
                location, headers={"content-type": "application/octet-stream"}
            )
            out2.raise_for_status()
            # after creating empty file, change location to append to
            out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs)
            self.location = self.fs._apply_proxy(out2.headers["Location"])

    def _fetch_range(self, start, end):
        # Clamp the requested window to [0, file size)
        start = max(start, 0)
        end = min(self.size, end)
        if start >= end or start >= self.size:
            return b""
        out = self.fs._call(
            "OPEN", path=self.path, offset=start, length=end - start, redirect=False
        )
        out.raise_for_status()
        if "Location" in out.headers:
            # Redirected: fetch the actual bytes from the data node
            location = out.headers["Location"]
            out2 = self.fs.session.get(self.fs._apply_proxy(location))
            return out2.content
        else:
            return out.content

    def commit(self):
        # Move the temp file into its final location (autocommit=False mode)
        self.fs.mv(self.path, self.target)

    def discard(self):
        # Drop the uncommitted temp file
        self.fs.rm(self.path)
.venv/lib/python3.11/site-packages/fsspec/json.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from contextlib import suppress
3
+ from pathlib import PurePath
4
+ from typing import (
5
+ Any,
6
+ Callable,
7
+ ClassVar,
8
+ Dict,
9
+ List,
10
+ Mapping,
11
+ Optional,
12
+ Sequence,
13
+ Tuple,
14
+ )
15
+
16
+ from .registry import _import_class, get_filesystem_class
17
+ from .spec import AbstractFileSystem
18
+
19
+
20
class FilesystemJSONEncoder(json.JSONEncoder):
    """JSON encoder that understands filesystem instances and path objects."""

    # Whether serialized filesystems should embed credentials
    include_password: ClassVar[bool] = True

    def default(self, o: Any) -> Any:
        """Encode filesystems via ``to_dict()`` and paths as {cls, str}."""
        if isinstance(o, AbstractFileSystem):
            return o.to_dict(include_password=self.include_password)
        if isinstance(o, PurePath):
            path_type = type(o)
            qualname = f"{path_type.__module__}.{path_type.__name__}"
            return {"cls": qualname, "str": str(o)}

        return super().default(o)

    def make_serializable(self, obj: Any) -> Any:
        """
        Recursively converts an object so that it can be JSON serialized via
        :func:`json.dumps` and :func:`json.dump`, without actually calling
        said functions.
        """
        if isinstance(obj, (str, int, float, bool)):
            return obj
        if isinstance(obj, Mapping):
            return {key: self.make_serializable(val) for key, val in obj.items()}
        if isinstance(obj, Sequence):
            return [self.make_serializable(item) for item in obj]

        return self.default(obj)
46
+
47
+
48
class FilesystemJSONDecoder(json.JSONDecoder):
    """JSON decoder that re-instantiates filesystems and path objects.

    Any user-supplied ``object_hook`` is wrapped so that dicts written by
    :class:`FilesystemJSONEncoder` are resolved into live objects first.
    """

    def __init__(
        self,
        *,
        object_hook: Optional[Callable[[Dict[str, Any]], Any]] = None,
        parse_float: Optional[Callable[[str], Any]] = None,
        parse_int: Optional[Callable[[str], Any]] = None,
        parse_constant: Optional[Callable[[str], Any]] = None,
        strict: bool = True,
        object_pairs_hook: Optional[Callable[[List[Tuple[str, Any]]], Any]] = None,
    ) -> None:
        # Keep the caller's hook; custom_object_hook chains to it
        self.original_object_hook = object_hook

        super().__init__(
            object_hook=self.custom_object_hook,
            parse_float=parse_float,
            parse_int=parse_int,
            parse_constant=parse_constant,
            strict=strict,
            object_pairs_hook=object_pairs_hook,
        )

    @classmethod
    def try_resolve_path_cls(cls, dct: Dict[str, Any]):
        """Return the PurePath subclass named in ``dct["cls"]``, else None.

        Any failure (missing key, import error, wrong base class) is
        silently suppressed and yields None.
        """
        with suppress(Exception):
            fqp = dct["cls"]

            path_cls = _import_class(fqp)

            if issubclass(path_cls, PurePath):
                return path_cls

        return None

    @classmethod
    def try_resolve_fs_cls(cls, dct: Dict[str, Any]):
        """Return the filesystem class described by ``dct``, else None.

        If the recorded class path cannot be imported, fall back to the
        registry lookup by ``protocol``; any other error is suppressed by
        the outer ``suppress`` and yields None.
        """
        with suppress(Exception):
            if "cls" in dct:
                try:
                    fs_cls = _import_class(dct["cls"])
                    if issubclass(fs_cls, AbstractFileSystem):
                        return fs_cls
                except Exception:
                    if "protocol" in dct:  # Fallback if cls cannot be imported
                        return get_filesystem_class(dct["protocol"])

                    # No fallback possible: let the outer suppress absorb it
                    raise

        return None

    def custom_object_hook(self, dct: Dict[str, Any]):
        # Resolution order: filesystem dict, then path dict, then the
        # user-supplied hook, then the raw dict unchanged
        if "cls" in dct:
            if (obj_cls := self.try_resolve_fs_cls(dct)) is not None:
                return AbstractFileSystem.from_dict(dct)
            if (obj_cls := self.try_resolve_path_cls(dct)) is not None:
                return obj_cls(dct["str"])

        if self.original_object_hook is not None:
            return self.original_object_hook(dct)

        return dct

    def unmake_serializable(self, obj: Any) -> Any:
        """
        Inverse function of :meth:`FilesystemJSONEncoder.make_serializable`.
        """
        if isinstance(obj, dict):
            obj = self.custom_object_hook(obj)
        if isinstance(obj, dict):
            return {k: self.unmake_serializable(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [self.unmake_serializable(v) for v in obj]

        return obj
.venv/lib/python3.11/site-packages/fsspec/mapping.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import array
2
+ import logging
3
+ import posixpath
4
+ import warnings
5
+ from collections.abc import MutableMapping
6
+ from functools import cached_property
7
+
8
+ from fsspec.core import url_to_fs
9
+
10
+ logger = logging.getLogger("fsspec.mapping")
11
+
12
+
13
class FSMap(MutableMapping):
    """Wrap a FileSystem instance as a mutable wrapping.

    The keys of the mapping become files under the given root, and the
    values (which must be bytes) the contents of those files.

    Parameters
    ----------
    root: string
        prefix for all the files
    fs: FileSystem instance
    check: bool (=True)
        performs a touch at the location, to check for write access.

    Examples
    --------
    >>> fs = FileSystem(**parameters)  # doctest: +SKIP
    >>> d = FSMap('my-data/path/', fs)  # doctest: +SKIP
    or, more likely
    >>> d = fs.get_mapper('my-data/path/')

    >>> d['loc1'] = b'Hello World'  # doctest: +SKIP
    >>> list(d.keys())  # doctest: +SKIP
    ['loc1']
    >>> d['loc1']  # doctest: +SKIP
    b'Hello World'
    """

    def __init__(self, root, fs, check=False, create=False, missing_exceptions=None):
        self.fs = fs
        self.root = fs._strip_protocol(root)
        # Root plus trailing separator, as normalised by the backend itself
        # (join a dummy name, strip the protocol, then drop the dummy)
        self._root_key_to_str = fs._strip_protocol(posixpath.join(root, "x"))[:-1]
        if missing_exceptions is None:
            missing_exceptions = (
                FileNotFoundError,
                IsADirectoryError,
                NotADirectoryError,
            )
        self.missing_exceptions = missing_exceptions
        self.check = check
        self.create = create
        if create:
            if not self.fs.exists(root):
                self.fs.mkdir(root)
        if check:
            if not self.fs.exists(root):
                raise ValueError(
                    f"Path {root} does not exist. Create "
                    f" with the ``create=True`` keyword"
                )
            # probe write access with a throwaway file
            self.fs.touch(root + "/a")
            self.fs.rm(root + "/a")

    @cached_property
    def dirfs(self):
        """dirfs instance that can be used with the same keys as the mapper"""
        from .implementations.dirfs import DirFileSystem

        return DirFileSystem(path=self._root_key_to_str, fs=self.fs)

    def clear(self):
        """Remove all keys below root - empties out mapping"""
        logger.info("Clear mapping at %s", self.root)
        try:
            self.fs.rm(self.root, True)
            self.fs.mkdir(self.root)
        except Exception:
            # Best-effort: ignore backend failures, but (unlike the previous
            # bare ``except:``) let SystemExit/KeyboardInterrupt propagate
            pass

    def getitems(self, keys, on_error="raise"):
        """Fetch multiple items from the store

        If the backend is async-able, this might proceed concurrently

        Parameters
        ----------
        keys: list(str)
            They keys to be fetched
        on_error : "raise", "omit", "return"
            If raise, an underlying exception will be raised (converted to KeyError
            if the type is in self.missing_exceptions); if omit, keys with exception
            will simply not be included in the output; if "return", all keys are
            included in the output, but the value will be bytes or an exception
            instance.

        Returns
        -------
        dict(key, bytes|exception)
        """
        keys2 = [self._key_to_str(k) for k in keys]
        oe = on_error if on_error == "raise" else "return"
        try:
            out = self.fs.cat(keys2, on_error=oe)
            if isinstance(out, bytes):
                out = {keys2[0]: out}
        except self.missing_exceptions as e:
            # include the requested keys so the KeyError is actionable
            raise KeyError(keys) from e
        out = {
            k: (KeyError() if isinstance(v, self.missing_exceptions) else v)
            for k, v in out.items()
        }
        return {
            key: out[k2] if on_error == "raise" else out.get(k2, KeyError(k2))
            for key, k2 in zip(keys, keys2)
            if on_error == "return" or not isinstance(out[k2], BaseException)
        }

    def setitems(self, values_dict):
        """Set the values of multiple items in the store

        Parameters
        ----------
        values_dict: dict(str, bytes)
        """
        values = {self._key_to_str(k): maybe_convert(v) for k, v in values_dict.items()}
        self.fs.pipe(values)

    def delitems(self, keys):
        """Remove multiple keys from the store"""
        self.fs.rm([self._key_to_str(k) for k in keys])

    def _key_to_str(self, key):
        """Generate full path for the key"""
        if not isinstance(key, str):
            # raise TypeError("key must be of type `str`, got `{type(key).__name__}`"
            warnings.warn(
                "from fsspec 2023.5 onward FSMap non-str keys will raise TypeError",
                DeprecationWarning,
            )
            if isinstance(key, list):
                key = tuple(key)
            key = str(key)
        return f"{self._root_key_to_str}{key}".rstrip("/")

    def _str_to_key(self, s):
        """Strip path of to leave key name"""
        return s[len(self.root) :].lstrip("/")

    def __getitem__(self, key, default=None):
        """Retrieve data"""
        k = self._key_to_str(key)
        try:
            result = self.fs.cat(k)
        except self.missing_exceptions as exc:
            if default is not None:
                return default
            raise KeyError(key) from exc
        return result

    def pop(self, key, default=None):
        """Pop data"""
        result = self.__getitem__(key, default)
        try:
            del self[key]
        except KeyError:
            pass
        return result

    def __setitem__(self, key, value):
        """Store value in key"""
        key = self._key_to_str(key)
        self.fs.mkdirs(self.fs._parent(key), exist_ok=True)
        self.fs.pipe_file(key, maybe_convert(value))

    def __iter__(self):
        return (self._str_to_key(x) for x in self.fs.find(self.root))

    def __len__(self):
        return len(self.fs.find(self.root))

    def __delitem__(self, key):
        """Remove key"""
        try:
            self.fs.rm(self._key_to_str(key))
        except Exception as exc:
            # carry the offending key so the error is actionable
            raise KeyError(key) from exc

    def __contains__(self, key):
        """Does key exist in mapping?"""
        path = self._key_to_str(key)
        return self.fs.isfile(path)

    def __reduce__(self):
        # re-create without check/create side effects on unpickling
        return FSMap, (self.root, self.fs, False, False, self.missing_exceptions)
197
+
198
+
199
def maybe_convert(value):
    """Coerce array-like values to raw bytes; pass anything else through."""
    is_arraylike = isinstance(value, array.array) or hasattr(value, "__array__")
    if not is_arraylike:
        return value
    # bytes-like things
    if hasattr(value, "dtype") and value.dtype.kind in "Mm":
        # The buffer interface doesn't support datetime64/timedelta64
        # numpy arrays, so reinterpret as raw int64 first
        value = value.view("int64")
    return bytes(memoryview(value))
208
+
209
+
210
def get_mapper(
    url="",
    check=False,
    create=False,
    missing_exceptions=None,
    alternate_root=None,
    **kwargs,
):
    """Create key-value interface for given URL and options

    The URL will be of the form "protocol://location" and point to the root
    of the mapper required. All keys will be file-names below this location,
    and their values the contents of each key.

    Also accepts compound URLs like zip::s3://bucket/file.zip , see ``fsspec.open``.

    Parameters
    ----------
    url: str
        Root URL of mapping
    check: bool
        Whether to attempt to read from the location before instantiation, to
        check that the mapping does exist
    create: bool
        Whether to make the directory corresponding to the root before
        instantiating
    missing_exceptions: None or tuple
        If given, these exception types will be regarded as missing keys and
        return KeyError when trying to read data. By default, you get
        (FileNotFoundError, IsADirectoryError, NotADirectoryError)
    alternate_root: None or str
        In cases of complex URLs, the parser may fail to pick the correct part
        for the mapper root, so this arg can override

    Returns
    -------
    ``FSMap`` instance, the dict-like key-value store.
    """
    # Removing protocol here - could defer to each open() on the backend
    fs, urlpath = url_to_fs(url, **kwargs)
    root = alternate_root if alternate_root is not None else urlpath
    return FSMap(root, fs, check, create, missing_exceptions=missing_exceptions)
.venv/lib/python3.11/site-packages/fsspec/parquet.py ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import json
3
+ import warnings
4
+
5
+ from .core import url_to_fs
6
+ from .utils import merge_offset_ranges
7
+
8
+ # Parquet-Specific Utilities for fsspec
9
+ #
10
+ # Most of the functions defined in this module are NOT
11
+ # intended for public consumption. The only exception
12
+ # to this is `open_parquet_file`, which should be used
13
+ # place of `fs.open()` to open parquet-formatted files
14
+ # on remote file systems.
15
+
16
+
17
def open_parquet_file(
    path,
    mode="rb",
    fs=None,
    metadata=None,
    columns=None,
    row_groups=None,
    storage_options=None,
    strict=False,
    engine="auto",
    max_gap=64_000,
    max_block=256_000_000,
    footer_sample_size=1_000_000,
    **kwargs,
):
    """
    Return a file-like object for a single Parquet file.

    The specified parquet `engine` will be used to parse the
    footer metadata, and determine the required byte ranges
    from the file. The target path will then be opened with
    the "parts" (`KnownPartsOfAFile`) caching strategy.

    Note that this method is intended for usage with remote
    file systems, and is unlikely to improve parquet-read
    performance on local file systems.

    Parameters
    ----------
    path: str
        Target file path.
    mode: str, optional
        Mode option to be passed through to `fs.open`. Default is "rb".
    metadata: Any, optional
        Parquet metadata object. Object type must be supported
        by the backend parquet engine. For now, only the "fastparquet"
        engine supports an explicit `ParquetFile` metadata object.
        If a metadata object is supplied, the remote footer metadata
        will not need to be transferred into local memory.
    fs: AbstractFileSystem, optional
        Filesystem object to use for opening the file. If nothing is
        specified, an `AbstractFileSystem` object will be inferred.
    engine : str, default "auto"
        Parquet engine to use for metadata parsing. Allowed options
        include "fastparquet", "pyarrow", and "auto". The specified
        engine must be installed in the current environment. If
        "auto" is specified, and both engines are installed,
        "fastparquet" will take precedence over "pyarrow".
    columns: list, optional
        List of all column names that may be read from the file.
    row_groups : list, optional
        List of all row-groups that may be read from the file. This
        may be a list of row-group indices (integers), or it may be
        a list of `RowGroup` metadata objects (if the "fastparquet"
        engine is used).
    storage_options : dict, optional
        Used to generate an `AbstractFileSystem` object if `fs` was
        not specified.
    strict : bool, optional
        Whether the resulting `KnownPartsOfAFile` cache should
        fetch reads that go beyond a known byte-range boundary.
        If `False` (the default), any read that ends outside a
        known part will be zero padded. Note that using
        `strict=True` may be useful for debugging.
    max_gap : int, optional
        Neighboring byte ranges will only be merged when their
        inter-range gap is <= `max_gap`. Default is 64KB.
    max_block : int, optional
        Neighboring byte ranges will only be merged when the size of
        the aggregated range is <= `max_block`. Default is 256MB.
    footer_sample_size : int, optional
        Number of bytes to read from the end of the path to look
        for the footer metadata. If the sampled bytes do not contain
        the footer, a second read request will be required, and
        performance will suffer. Default is 1MB.
    **kwargs :
        Optional key-word arguments to pass to `fs.open`
    """

    # Make sure we have an `AbstractFileSystem` object
    # to work with
    if fs is None:
        fs = url_to_fs(path, **(storage_options or {}))[0]

    # For now, `columns == []` not supported. Just use
    # default `open` command with `path` input
    if columns is not None and len(columns) == 0:
        return fs.open(path, mode=mode)

    # Set the engine
    engine = _set_engine(engine)

    # Fetch the known byte ranges needed to read
    # `columns` and/or `row_groups`
    data = _get_parquet_byte_ranges(
        [path],
        fs,
        metadata=metadata,
        columns=columns,
        row_groups=row_groups,
        engine=engine,
        max_gap=max_gap,
        max_block=max_block,
        footer_sample_size=footer_sample_size,
    )

    # Extract file name from `data`
    fn = next(iter(data)) if data else path

    # Call self.open with "parts" caching
    # (pre-fetched byte ranges are handed to the cache as `data`)
    options = kwargs.pop("cache_options", {}).copy()
    return fs.open(
        fn,
        mode=mode,
        cache_type="parts",
        cache_options={
            **options,
            "data": data.get(fn, {}),
            "strict": strict,
        },
        **kwargs,
    )
139
+
140
+
141
def _get_parquet_byte_ranges(
    paths,
    fs,
    metadata=None,
    columns=None,
    row_groups=None,
    max_gap=64_000,
    max_block=256_000_000,
    footer_sample_size=1_000_000,
    engine="auto",
):
    """Get a dictionary of the known byte ranges needed
    to read a specific column/row-group selection from a
    Parquet dataset. Each value in the output dictionary
    is intended for use as the `data` argument for the
    `KnownPartsOfAFile` caching strategy of a single path.

    Returns
    -------
    dict
        ``{path: {(start, stop): bytes, ...}, ...}``
    """

    # Set engine if necessary
    if isinstance(engine, str):
        engine = _set_engine(engine)

    # Pass to specialized function if metadata is defined
    if metadata is not None:
        # Use the provided parquet metadata object
        # to avoid transferring/parsing footer metadata
        return _get_parquet_byte_ranges_from_metadata(
            metadata,
            fs,
            engine,
            columns=columns,
            row_groups=row_groups,
            max_gap=max_gap,
            max_block=max_block,
        )

    # Get file sizes asynchronously
    file_sizes = fs.sizes(paths)

    # Populate global paths, starts, & ends
    result = {}
    data_paths = []
    data_starts = []
    data_ends = []
    add_header_magic = True
    if columns is None and row_groups is None:
        # We are NOT selecting specific columns or row-groups.
        #
        # We can avoid sampling the footers, and just transfer
        # all file data with cat_ranges
        for i, path in enumerate(paths):
            result[path] = {}
            for b in range(0, file_sizes[i], max_block):
                data_paths.append(path)
                data_starts.append(b)
                data_ends.append(min(b + max_block, file_sizes[i]))
        add_header_magic = False  # "Magic" should already be included
    else:
        # We ARE selecting specific columns or row-groups.
        #
        # Gather file footers.
        # We just take the last `footer_sample_size` bytes of each
        # file (or the entire file if it is smaller than that)
        footer_starts = []
        footer_ends = []
        for i, path in enumerate(paths):
            footer_ends.append(file_sizes[i])
            sample_size = max(0, file_sizes[i] - footer_sample_size)
            footer_starts.append(sample_size)
        footer_samples = fs.cat_ranges(paths, footer_starts, footer_ends)

        # Check our footer samples and re-sample if necessary.
        missing_footer_starts = footer_starts.copy()
        large_footer = 0
        for i, path in enumerate(paths):
            # Parquet layout: the 4 bytes before the trailing "PAR1" magic
            # hold the footer length (little-endian)
            footer_size = int.from_bytes(footer_samples[i][-8:-4], "little")
            real_footer_start = file_sizes[i] - (footer_size + 8)
            if real_footer_start < footer_starts[i]:
                missing_footer_starts[i] = real_footer_start
                large_footer = max(large_footer, (footer_size + 8))
        if large_footer:
            warnings.warn(
                f"Not enough data was used to sample the parquet footer. "
                f"Try setting footer_sample_size >= {large_footer}."
            )
        # Fetch the missing head of any under-sampled footer and prepend it
        for i, block in enumerate(
            fs.cat_ranges(
                paths,
                missing_footer_starts,
                footer_starts,
            )
        ):
            footer_samples[i] = block + footer_samples[i]
            footer_starts[i] = missing_footer_starts[i]

        # Calculate required byte ranges for each path
        for i, path in enumerate(paths):
            # Deal with small-file case.
            # Just include all remaining bytes of the file
            # in a single range.
            if file_sizes[i] < max_block:
                if footer_starts[i] > 0:
                    # Only need to transfer the data if the
                    # footer sample isn't already the whole file
                    data_paths.append(path)
                    data_starts.append(0)
                    data_ends.append(footer_starts[i])
                continue

            # Use "engine" to collect data byte ranges
            path_data_starts, path_data_ends = engine._parquet_byte_ranges(
                columns,
                row_groups=row_groups,
                footer=footer_samples[i],
                footer_start=footer_starts[i],
            )

            data_paths += [path] * len(path_data_starts)
            data_starts += path_data_starts
            data_ends += path_data_ends

        # Merge adjacent offset ranges
        data_paths, data_starts, data_ends = merge_offset_ranges(
            data_paths,
            data_starts,
            data_ends,
            max_gap=max_gap,
            max_block=max_block,
            sort=False,  # Should already be sorted
        )

        # Start by populating `result` with footer samples
        for i, path in enumerate(paths):
            result[path] = {(footer_starts[i], footer_ends[i]): footer_samples[i]}

    # Transfer the data byte-ranges into local memory
    _transfer_ranges(fs, result, data_paths, data_starts, data_ends)

    # Add b"PAR1" to header if necessary
    if add_header_magic:
        _add_header_magic(result)

    return result
284
+
285
+
286
def _get_parquet_byte_ranges_from_metadata(
    metadata,
    fs,
    engine,
    columns=None,
    row_groups=None,
    max_gap=64_000,
    max_block=256_000_000,
):
    """Simplified version of `_get_parquet_byte_ranges` for
    the case that an engine-specific `metadata` object is
    provided, and the remote footer metadata does not need to
    be transferred before calculating the required byte ranges.
    """

    # The engine knows how to translate (columns, row_groups) plus the
    # pre-parsed metadata into per-file byte ranges.
    paths, starts, ends = engine._parquet_byte_ranges(
        columns,
        row_groups=row_groups,
        metadata=metadata,
    )

    # Coalesce nearby/overlapping ranges to reduce request count.
    paths, starts, ends = merge_offset_ranges(
        paths,
        starts,
        ends,
        max_gap=max_gap,
        max_block=max_block,
        sort=False,  # Should be sorted
    )

    # Fetch all ranges into local memory, grouped per file path.
    result = {path: {} for path in set(paths)}
    _transfer_ranges(fs, result, paths, starts, ends)

    # Make sure every file mapping covers the b"PAR1" header bytes.
    _add_header_magic(result)

    return result
326
+
327
+
328
+ def _transfer_ranges(fs, blocks, paths, starts, ends):
329
+ # Use cat_ranges to gather the data byte_ranges
330
+ ranges = (paths, starts, ends)
331
+ for path, start, stop, data in zip(*ranges, fs.cat_ranges(*ranges)):
332
+ blocks[path][(start, stop)] = data
333
+
334
+
335
+ def _add_header_magic(data):
336
+ # Add b"PAR1" to file headers
337
+ for path in list(data.keys()):
338
+ add_magic = True
339
+ for k in data[path]:
340
+ if k[0] == 0 and k[1] >= 4:
341
+ add_magic = False
342
+ break
343
+ if add_magic:
344
+ data[path][(0, 4)] = b"PAR1"
345
+
346
+
347
+ def _set_engine(engine_str):
348
+ # Define a list of parquet engines to try
349
+ if engine_str == "auto":
350
+ try_engines = ("fastparquet", "pyarrow")
351
+ elif not isinstance(engine_str, str):
352
+ raise ValueError(
353
+ "Failed to set parquet engine! "
354
+ "Please pass 'fastparquet', 'pyarrow', or 'auto'"
355
+ )
356
+ elif engine_str not in ("fastparquet", "pyarrow"):
357
+ raise ValueError(f"{engine_str} engine not supported by `fsspec.parquet`")
358
+ else:
359
+ try_engines = [engine_str]
360
+
361
+ # Try importing the engines in `try_engines`,
362
+ # and choose the first one that succeeds
363
+ for engine in try_engines:
364
+ try:
365
+ if engine == "fastparquet":
366
+ return FastparquetEngine()
367
+ elif engine == "pyarrow":
368
+ return PyarrowEngine()
369
+ except ImportError:
370
+ pass
371
+
372
+ # Raise an error if a supported parquet engine
373
+ # was not found
374
+ raise ImportError(
375
+ f"The following parquet engines are not installed "
376
+ f"in your python environment: {try_engines}."
377
+ f"Please install 'fastparquert' or 'pyarrow' to "
378
+ f"utilize the `fsspec.parquet` module."
379
+ )
380
+
381
+
382
class FastparquetEngine:
    # The purpose of the FastparquetEngine class is
    # to check if fastparquet can be imported (on initialization)
    # and to define a `_parquet_byte_ranges` method. In the
    # future, this class may also be used to define other
    # methods/logic that are specific to fastparquet.

    def __init__(self):
        # Import here (not at module level) so fastparquet stays an
        # optional dependency; the ImportError only surfaces when this
        # engine is actually selected.
        import fastparquet as fp

        self.fp = fp

    def _row_group_filename(self, row_group, pf):
        # Resolve which physical file holds `row_group` (fastparquet
        # supports multi-file datasets via the ParquetFile object).
        return pf.row_group_filename(row_group)

    def _parquet_byte_ranges(
        self,
        columns,
        row_groups=None,
        metadata=None,
        footer=None,
        footer_start=None,
    ):
        """Compute the byte ranges needed for the requested columns/row-groups.

        Either ``metadata`` (a pre-parsed fastparquet ``ParquetFile``) or the
        raw ``footer`` bytes (with ``footer_start``, the file offset where
        those bytes begin) must be provided. When ``metadata`` is given, the
        return value also includes the per-range file paths (the metadata may
        span multiple files); otherwise only (starts, ends) are returned.
        """
        # Initialize offset ranges and define ParquetFile metadata
        pf = metadata
        data_paths, data_starts, data_ends = [], [], []
        if pf is None:
            # Parse the sampled footer bytes into a ParquetFile object
            pf = self.fp.ParquetFile(io.BytesIO(footer))

        # Convert columns to a set and add any index columns
        # specified in the pandas metadata (just in case)
        column_set = None if columns is None else set(columns)
        if column_set is not None and hasattr(pf, "pandas_metadata"):
            md_index = [
                ind
                for ind in pf.pandas_metadata.get("index_columns", [])
                # Ignore RangeIndex information
                if not isinstance(ind, dict)
            ]
            column_set |= set(md_index)

        # Check if row_groups is a list of integers
        # or a list of row-group metadata
        if row_groups and not isinstance(row_groups[0], int):
            # Input row_groups contains row-group metadata
            row_group_indices = None
        else:
            # Input row_groups contains row-group indices
            row_group_indices = row_groups
            row_groups = pf.row_groups

        # Loop through column chunks to add required byte ranges
        for r, row_group in enumerate(row_groups):
            # Skip this row-group if we are targeting
            # specific row-groups
            if row_group_indices is None or r in row_group_indices:
                # Find the target parquet-file path for `row_group`
                fn = self._row_group_filename(row_group, pf)

                for column in row_group.columns:
                    name = column.meta_data.path_in_schema[0]
                    # Skip this column if we are targeting a
                    # specific columns
                    if column_set is None or name in column_set:
                        # A column chunk starts at its dictionary page when
                        # present, otherwise at its first data page.
                        file_offset0 = column.meta_data.dictionary_page_offset
                        if file_offset0 is None:
                            file_offset0 = column.meta_data.data_page_offset
                        num_bytes = column.meta_data.total_compressed_size
                        if footer_start is None or file_offset0 < footer_start:
                            data_paths.append(fn)
                            data_starts.append(file_offset0)
                            # Cap the range at footer_start (the footer bytes
                            # were already sampled); if footer_start is falsy
                            # the uncapped end is used.
                            data_ends.append(
                                min(
                                    file_offset0 + num_bytes,
                                    footer_start or (file_offset0 + num_bytes),
                                )
                            )

        if metadata:
            # The metadata in this call may map to multiple
            # file paths. Need to include `data_paths`
            return data_paths, data_starts, data_ends
        return data_starts, data_ends
465
+
466
+
467
class PyarrowEngine:
    # The purpose of the PyarrowEngine class is
    # to check if pyarrow can be imported (on initialization)
    # and to define a `_parquet_byte_ranges` method. In the
    # future, this class may also be used to define other
    # methods/logic that are specific to pyarrow.

    def __init__(self):
        # Deferred import keeps pyarrow an optional dependency; the
        # ImportError only surfaces when this engine is selected.
        import pyarrow.parquet as pq

        self.pq = pq

    def _row_group_filename(self, row_group, metadata):
        # Multi-file metadata objects are not supported by this engine.
        raise NotImplementedError

    def _parquet_byte_ranges(
        self,
        columns,
        row_groups=None,
        metadata=None,
        footer=None,
        footer_start=None,
    ):
        """Compute (starts, ends) byte ranges for the requested
        columns/row-groups from raw ``footer`` bytes.

        Unlike the fastparquet engine, a pre-parsed ``metadata`` object is
        rejected, and ``footer_start`` is required (it is compared/used
        unconditionally below).
        """
        if metadata is not None:
            raise ValueError("metadata input not supported for PyarrowEngine")

        data_starts, data_ends = [], []
        md = self.pq.ParquetFile(io.BytesIO(footer)).metadata

        # Convert columns to a set and add any index columns
        # specified in the pandas metadata (just in case)
        column_set = None if columns is None else set(columns)
        if column_set is not None:
            schema = md.schema.to_arrow_schema()
            has_pandas_metadata = (
                schema.metadata is not None and b"pandas" in schema.metadata
            )
            if has_pandas_metadata:
                md_index = [
                    ind
                    for ind in json.loads(
                        schema.metadata[b"pandas"].decode("utf8")
                    ).get("index_columns", [])
                    # Ignore RangeIndex information
                    if not isinstance(ind, dict)
                ]
                column_set |= set(md_index)

        # Loop through column chunks to add required byte ranges
        for r in range(md.num_row_groups):
            # Skip this row-group if we are targeting
            # specific row-groups
            if row_groups is None or r in row_groups:
                row_group = md.row_group(r)
                for c in range(row_group.num_columns):
                    column = row_group.column(c)
                    name = column.path_in_schema
                    # Skip this column if we are targeting a
                    # specific columns
                    # (match either the full dotted path or its root field,
                    # so nested columns are selected by their top-level name)
                    split_name = name.split(".")[0]
                    if (
                        column_set is None
                        or name in column_set
                        or split_name in column_set
                    ):
                        # A column chunk starts at its dictionary page when
                        # present, otherwise at its first data page.
                        file_offset0 = column.dictionary_page_offset
                        if file_offset0 is None:
                            file_offset0 = column.data_page_offset
                        num_bytes = column.total_compressed_size
                        # Cap ranges at footer_start: the footer bytes were
                        # already sampled by the caller.
                        if file_offset0 < footer_start:
                            data_starts.append(file_offset0)
                            data_ends.append(
                                min(file_offset0 + num_bytes, footer_start)
                            )
        return data_starts, data_ends
.venv/lib/python3.11/site-packages/fsspec/registry.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import types
5
+ import warnings
6
+
7
+ __all__ = ["registry", "get_filesystem_class", "default"]
8
+
9
+ # internal, mutable
10
+ _registry: dict[str, type] = {}
11
+
12
+ # external, immutable
13
+ registry = types.MappingProxyType(_registry)
14
+ default = "file"
15
+
16
+
17
def register_implementation(name, cls, clobber=False, errtxt=None):
    """Add implementation class to the registry

    Parameters
    ----------
    name: str
        Protocol name to associate with the class
    cls: class or str
        if a class: fsspec-compliant implementation class (normally inherits from
        ``fsspec.AbstractFileSystem``, gets added straight to the registry. If a
        str, the full path to an implementation class like package.module.class,
        which gets added to known_implementations,
        so the import is deferred until the filesystem is actually used.
    clobber: bool (optional)
        Whether to overwrite a protocol with the same name; if False, will raise
        instead.
    errtxt: str (optional)
        If given, then a failure to import the given class will result in this
        text being given.
    """
    if isinstance(cls, str):
        # Deferred registration: only the import path is recorded.
        if name in known_implementations and clobber is False:
            # Re-registering the identical path is a silent no-op.
            if cls != known_implementations[name]["class"]:
                raise ValueError(
                    f"Name ({name}) already in the known_implementations and clobber "
                    f"is False"
                )
            return
        known_implementations[name] = {
            "class": cls,
            "err": errtxt or f"{cls} import failed for protocol {name}",
        }
        return

    # Direct registration of an already-imported implementation class.
    if name in registry and clobber is False:
        # Re-registering the same class object is a silent no-op.
        if _registry[name] is not cls:
            raise ValueError(
                f"Name ({name}) already in the registry and clobber is False"
            )
        return
    _registry[name] = cls
58
+
59
+
60
# protocols mapped to the class which implements them. This dict can be
# updated with register_implementation. Entries with no "err" key use
# classes shipped with fsspec itself; the others need an extra package.
known_implementations = {
    "abfs": {
        "class": "adlfs.AzureBlobFileSystem",
        "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
    },
    "adl": {
        "class": "adlfs.AzureDatalakeFileSystem",
        "err": "Install adlfs to access Azure Datalake Gen1",
    },
    "arrow_hdfs": {
        "class": "fsspec.implementations.arrow.HadoopFileSystem",
        "err": "pyarrow and local java libraries required for HDFS",
    },
    "asynclocal": {
        "class": "morefs.asyn_local.AsyncLocalFileSystem",
        "err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem",
    },
    "az": {
        "class": "adlfs.AzureBlobFileSystem",
        "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
    },
    "blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"},
    "box": {
        "class": "boxfs.BoxFileSystem",
        "err": "Please install boxfs to access BoxFileSystem",
    },
    "cached": {"class": "fsspec.implementations.cached.CachingFileSystem"},
    "dask": {
        "class": "fsspec.implementations.dask.DaskWorkerFileSystem",
        "err": "Install dask distributed to access worker file system",
    },
    "data": {"class": "fsspec.implementations.data.DataFileSystem"},
    "dbfs": {
        "class": "fsspec.implementations.dbfs.DatabricksFileSystem",
        "err": "Install the requests package to use the DatabricksFileSystem",
    },
    "dir": {"class": "fsspec.implementations.dirfs.DirFileSystem"},
    "dropbox": {
        "class": "dropboxdrivefs.DropboxDriveFileSystem",
        "err": (
            'DropboxFileSystem requires "dropboxdrivefs","requests" and "'
            '"dropbox" to be installed'
        ),
    },
    "dvc": {
        "class": "dvc.api.DVCFileSystem",
        "err": "Install dvc to access DVCFileSystem",
    },
    "file": {"class": "fsspec.implementations.local.LocalFileSystem"},
    "filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"},
    "ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"},
    "gcs": {
        "class": "gcsfs.GCSFileSystem",
        "err": "Please install gcsfs to access Google Storage",
    },
    "gdrive": {
        "class": "gdrivefs.GoogleDriveFileSystem",
        "err": "Please install gdrivefs for access to Google Drive",
    },
    "generic": {"class": "fsspec.generic.GenericFileSystem"},
    "git": {
        "class": "fsspec.implementations.git.GitFileSystem",
        "err": "Install pygit2 to browse local git repos",
    },
    "github": {
        "class": "fsspec.implementations.github.GithubFileSystem",
        "err": "Install the requests package to use the github FS",
    },
    "gs": {
        "class": "gcsfs.GCSFileSystem",
        "err": "Please install gcsfs to access Google Storage",
    },
    "hdfs": {
        "class": "fsspec.implementations.arrow.HadoopFileSystem",
        "err": "pyarrow and local java libraries required for HDFS",
    },
    "hf": {
        "class": "huggingface_hub.HfFileSystem",
        "err": "Install huggingface_hub to access HfFileSystem",
    },
    "http": {
        "class": "fsspec.implementations.http.HTTPFileSystem",
        "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
    },
    "https": {
        "class": "fsspec.implementations.http.HTTPFileSystem",
        "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
    },
    "jlab": {
        "class": "fsspec.implementations.jupyter.JupyterFileSystem",
        "err": "Jupyter FS requires requests to be installed",
    },
    "jupyter": {
        "class": "fsspec.implementations.jupyter.JupyterFileSystem",
        "err": "Jupyter FS requires requests to be installed",
    },
    "lakefs": {
        "class": "lakefs_spec.LakeFSFileSystem",
        "err": "Please install lakefs-spec to access LakeFSFileSystem",
    },
    "libarchive": {
        "class": "fsspec.implementations.libarchive.LibArchiveFileSystem",
        "err": "LibArchive requires to be installed",
    },
    "local": {"class": "fsspec.implementations.local.LocalFileSystem"},
    "memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"},
    "oci": {
        "class": "ocifs.OCIFileSystem",
        "err": "Install ocifs to access OCI Object Storage",
    },
    "ocilake": {
        "class": "ocifs.OCIFileSystem",
        "err": "Install ocifs to access OCI Data Lake",
    },
    "oss": {
        "class": "ossfs.OSSFileSystem",
        "err": "Install ossfs to access Alibaba Object Storage System",
    },
    "reference": {"class": "fsspec.implementations.reference.ReferenceFileSystem"},
    "root": {
        "class": "fsspec_xrootd.XRootDFileSystem",
        "err": (
            "Install fsspec-xrootd to access xrootd storage system. "
            "Note: 'root' is the protocol name for xrootd storage systems, "
            "not referring to root directories"
        ),
    },
    "s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
    "s3a": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
    "sftp": {
        "class": "fsspec.implementations.sftp.SFTPFileSystem",
        "err": 'SFTPFileSystem requires "paramiko" to be installed',
    },
    "simplecache": {"class": "fsspec.implementations.cached.SimpleCacheFileSystem"},
    "smb": {
        "class": "fsspec.implementations.smb.SMBFileSystem",
        "err": 'SMB requires "smbprotocol" or "smbprotocol[kerberos]" installed',
    },
    "ssh": {
        "class": "fsspec.implementations.sftp.SFTPFileSystem",
        "err": 'SFTPFileSystem requires "paramiko" to be installed',
    },
    "tar": {"class": "fsspec.implementations.tar.TarFileSystem"},
    "tosfs": {
        "class": "tosfs.TosFileSystem",
        "err": "Install tosfs to access ByteDance volcano engine Tinder Object Storage",
    },
    "wandb": {"class": "wandbfs.WandbFS", "err": "Install wandbfs to access wandb"},
    "webdav": {
        "class": "webdav4.fsspec.WebdavFileSystem",
        "err": "Install webdav4 to access WebDAV",
    },
    "webhdfs": {
        "class": "fsspec.implementations.webhdfs.WebHDFS",
        "err": 'webHDFS access requires "requests" to be installed',
    },
    "zip": {"class": "fsspec.implementations.zip.ZipFileSystem"},
}

# Import-time guard: keep the table alphabetically sorted so entries are
# easy to scan and merge conflicts stay local.
assert list(known_implementations) == sorted(known_implementations), (
    "Not in alphabetical order"
)
224
+
225
+
226
def get_filesystem_class(protocol):
    """Fetch named protocol implementation from the registry

    The dict ``known_implementations`` maps protocol names to the locations
    of classes implementing the corresponding file-system. When used for the
    first time, appropriate imports will happen and the class will be placed in
    the registry. All subsequent calls will fetch directly from the registry.

    Some protocol implementations require additional dependencies, and so the
    import may fail. In this case, the string in the "err" field of the
    ``known_implementations`` will be given as the error message.
    """
    protocol = protocol or default

    if protocol not in registry:
        # First use of this protocol: resolve and import its class.
        if protocol not in known_implementations:
            raise ValueError(f"Protocol not known: {protocol}")
        entry = known_implementations[protocol]
        try:
            register_implementation(protocol, _import_class(entry["class"]))
        except ImportError as e:
            raise ImportError(entry["err"]) from e

    cls = registry[protocol]
    if getattr(cls, "protocol", None) in ("abstract", None):
        # Stamp the concrete protocol onto classes that don't declare one.
        cls.protocol = protocol
    return cls
254
+
255
+
256
# Warning text emitted by `_import_class` when a very old s3fs install is
# detected (see the version check there).
s3_msg = """Your installed version of s3fs is very old and known to cause
severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.
"""
262
+
263
+
264
+ def _import_class(fqp: str):
265
+ """Take a fully-qualified path and return the imported class or identifier.
266
+
267
+ ``fqp`` is of the form "package.module.klass" or
268
+ "package.module:subobject.klass".
269
+
270
+ Warnings
271
+ --------
272
+ This can import arbitrary modules. Make sure you haven't installed any modules
273
+ that may execute malicious code at import time.
274
+ """
275
+ if ":" in fqp:
276
+ mod, name = fqp.rsplit(":", 1)
277
+ else:
278
+ mod, name = fqp.rsplit(".", 1)
279
+
280
+ is_s3 = mod == "s3fs"
281
+ mod = importlib.import_module(mod)
282
+ if is_s3 and mod.__version__.split(".") < ["0", "5"]:
283
+ warnings.warn(s3_msg)
284
+ for part in name.split("."):
285
+ mod = getattr(mod, part)
286
+
287
+ if not isinstance(mod, type):
288
+ raise TypeError(f"{fqp} is not a class")
289
+
290
+ return mod
291
+
292
+
293
def filesystem(protocol, **storage_options):
    """Instantiate filesystems for given protocol and arguments

    ``storage_options`` are specific to the protocol being chosen, and are
    passed directly to the class.
    """
    if protocol == "arrow_hdfs":
        # Legacy alias kept working, but steer callers toward "hdfs".
        warnings.warn(
            "The 'arrow_hdfs' protocol has been deprecated and will be "
            "removed in the future. Specify it as 'hdfs'.",
            DeprecationWarning,
        )

    return get_filesystem_class(protocol)(**storage_options)
308
+
309
+
310
def available_protocols():
    """Return a list of the implemented protocols.

    Note that any given protocol may require extra packages to be importable.
    """
    return [*known_implementations]
.venv/lib/python3.11/site-packages/fsspec/spec.py ADDED
@@ -0,0 +1,2242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import json
5
+ import logging
6
+ import os
7
+ import threading
8
+ import warnings
9
+ import weakref
10
+ from errno import ESPIPE
11
+ from glob import has_magic
12
+ from hashlib import sha256
13
+ from typing import Any, ClassVar
14
+
15
+ from .callbacks import DEFAULT_CALLBACK
16
+ from .config import apply_config, conf
17
+ from .dircache import DirCache
18
+ from .transaction import Transaction
19
+ from .utils import (
20
+ _unstrip_protocol,
21
+ glob_translate,
22
+ isfilelike,
23
+ other_paths,
24
+ read_block,
25
+ stringify_path,
26
+ tokenize,
27
+ )
28
+
29
+ logger = logging.getLogger("fsspec")
30
+
31
+
32
def make_instance(cls, args, kwargs):
    """Construct ``cls(*args, **kwargs)``; module-level so instances can be
    pickled via ``__reduce__``."""
    instance = cls(*args, **kwargs)
    return instance
34
+
35
+
36
class _Cached(type):
    """
    Metaclass for caching file system instances.

    Notes
    -----
    Instances are cached according to

    * The values of the class attributes listed in `_extra_tokenize_attributes`
    * The arguments passed to ``__init__``.

    This creates an additional reference to the filesystem, which prevents the
    filesystem from being garbage collected when all *user* references go away.
    A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also*
    be made for a filesystem instance to be garbage collected.
    """

    def __init__(cls, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Note: we intentionally create a reference here, to avoid garbage
        # collecting instances when all other references are gone. To really
        # delete a FileSystem, the cache must be cleared.
        if conf.get("weakref_instance_cache"):  # pragma: no cover
            # debug option for analysing fork/spawn conditions
            cls._cache = weakref.WeakValueDictionary()
        else:
            cls._cache = {}
        # Remember which process created the cache; see the pid check below.
        cls._pid = os.getpid()

    def __call__(cls, *args, **kwargs):
        kwargs = apply_config(cls, kwargs)
        extra_tokens = tuple(
            getattr(cls, attr, None) for attr in cls._extra_tokenize_attributes
        )
        # The cache key includes pid and thread id, so instances are not
        # shared across processes or threads.
        token = tokenize(
            cls, cls._pid, threading.get_ident(), *args, *extra_tokens, **kwargs
        )
        # Popped here so it is not forwarded to __init__ (note: it is popped
        # after tokenize, so it does participate in the cache key).
        skip = kwargs.pop("skip_instance_cache", False)
        if os.getpid() != cls._pid:
            # We are in a child process (e.g. after fork): cached instances
            # belong to the parent, so drop them and rebind the pid.
            cls._cache.clear()
            cls._pid = os.getpid()
        if not skip and cls.cachable and token in cls._cache:
            # Cache hit: reuse the existing instance.
            cls._latest = token
            return cls._cache[token]
        else:
            obj = super().__call__(*args, **kwargs)
            # Setting _fs_token here causes some static linters to complain.
            obj._fs_token_ = token
            obj.storage_args = args
            obj.storage_options = kwargs
            if obj.async_impl and obj.mirror_sync_methods:
                from .asyn import mirror_sync_methods

                mirror_sync_methods(obj)

            if cls.cachable and not skip:
                cls._latest = token
                cls._cache[token] = obj
            return obj
95
+
96
+
97
+ class AbstractFileSystem(metaclass=_Cached):
98
+ """
99
+ An abstract super-class for pythonic file-systems
100
+
101
+ Implementations are expected to be compatible with or, better, subclass
102
+ from here.
103
+ """
104
+
105
+ cachable = True # this class can be cached, instances reused
106
+ _cached = False
107
+ blocksize = 2**22
108
+ sep = "/"
109
+ protocol: ClassVar[str | tuple[str, ...]] = "abstract"
110
+ _latest = None
111
+ async_impl = False
112
+ mirror_sync_methods = False
113
+ root_marker = "" # For some FSs, may require leading '/' or other character
114
+ transaction_type = Transaction
115
+
116
+ #: Extra *class attributes* that should be considered when hashing.
117
+ _extra_tokenize_attributes = ()
118
+
119
+ # Set by _Cached metaclass
120
+ storage_args: tuple[Any, ...]
121
+ storage_options: dict[str, Any]
122
+
123
+ def __init__(self, *args, **storage_options):
124
+ """Create and configure file-system instance
125
+
126
+ Instances may be cachable, so if similar enough arguments are seen
127
+ a new instance is not required. The token attribute exists to allow
128
+ implementations to cache instances if they wish.
129
+
130
+ A reasonable default should be provided if there are no arguments.
131
+
132
+ Subclasses should call this method.
133
+
134
+ Parameters
135
+ ----------
136
+ use_listings_cache, listings_expiry_time, max_paths:
137
+ passed to ``DirCache``, if the implementation supports
138
+ directory listing caching. Pass use_listings_cache=False
139
+ to disable such caching.
140
+ skip_instance_cache: bool
141
+ If this is a cachable implementation, pass True here to force
142
+ creating a new instance even if a matching instance exists, and prevent
143
+ storing this instance.
144
+ asynchronous: bool
145
+ loop: asyncio-compatible IOLoop or None
146
+ """
147
+ if self._cached:
148
+ # reusing instance, don't change
149
+ return
150
+ self._cached = True
151
+ self._intrans = False
152
+ self._transaction = None
153
+ self._invalidated_caches_in_transaction = []
154
+ self.dircache = DirCache(**storage_options)
155
+
156
+ if storage_options.pop("add_docs", None):
157
+ warnings.warn("add_docs is no longer supported.", FutureWarning)
158
+
159
+ if storage_options.pop("add_aliases", None):
160
+ warnings.warn("add_aliases has been removed.", FutureWarning)
161
+ # This is set in _Cached
162
+ self._fs_token_ = None
163
+
164
+ @property
165
+ def fsid(self):
166
+ """Persistent filesystem id that can be used to compare filesystems
167
+ across sessions.
168
+ """
169
+ raise NotImplementedError
170
+
171
+ @property
172
+ def _fs_token(self):
173
+ return self._fs_token_
174
+
175
+ def __dask_tokenize__(self):
176
+ return self._fs_token
177
+
178
+ def __hash__(self):
179
+ return int(self._fs_token, 16)
180
+
181
+ def __eq__(self, other):
182
+ return isinstance(other, type(self)) and self._fs_token == other._fs_token
183
+
184
+ def __reduce__(self):
185
+ return make_instance, (type(self), self.storage_args, self.storage_options)
186
+
187
@classmethod
def _strip_protocol(cls, path):
    """Turn path from fully-qualified to file-system-specific

    May require FS-specific handling, e.g., for relative paths or links.
    """
    # Lists are handled element-wise.
    if isinstance(path, list):
        return [cls._strip_protocol(p) for p in path]
    path = stringify_path(path)
    # cls.protocol may be a single string or a tuple of aliases.
    protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol
    for protocol in protos:
        # strip "proto://" (URL style) or "proto::" (chained-FS style)
        if path.startswith(protocol + "://"):
            path = path[len(protocol) + 3 :]
        elif path.startswith(protocol + "::"):
            path = path[len(protocol) + 2 :]
    # trailing slashes are never significant for the stripped form
    path = path.rstrip("/")
    # use of root_marker to make minimum required path, e.g., "/"
    return path or cls.root_marker
205
+
206
def unstrip_protocol(self, name: str) -> str:
    """Format FS-specific path to generic, including protocol."""
    if isinstance(self.protocol, str):
        protos = (self.protocol,)
    else:
        protos = self.protocol
    # already qualified with any known alias -> leave untouched
    if name.startswith(tuple(f"{p}://" for p in protos)):
        return name
    # otherwise qualify with the primary protocol name
    return f"{protos[0]}://{name}"
213
+
214
+ @staticmethod
215
+ def _get_kwargs_from_urls(path):
216
+ """If kwargs can be encoded in the paths, extract them here
217
+
218
+ This should happen before instantiation of the class; incoming paths
219
+ then should be amended to strip the options in methods.
220
+
221
+ Examples may look like an sftp path "sftp://user@host:/my/path", where
222
+ the user and host should become kwargs and later get stripped.
223
+ """
224
+ # by default, nothing happens
225
+ return {}
226
+
227
@classmethod
def current(cls):
    """Return the most recently instantiated FileSystem

    If no instance has been created, then create one with defaults
    """
    try:
        return cls._cache[cls._latest]
    except KeyError:
        return cls()
236
+
237
@property
def transaction(self):
    """A context within which files are committed together upon exit

    Requires the file class to implement `.commit()` and `.discard()`
    for the normal and exception cases.
    """
    # Created lazily on first access and then reused.
    current = self._transaction
    if current is None:
        current = self.transaction_type(self)
        self._transaction = current
    return current
247
+
248
def start_transaction(self):
    """Begin write transaction for deferring files, non-context version"""
    # Install a fresh transaction object and flag that we are inside one.
    self._transaction = self.transaction_type(self)
    self._intrans = True
    return self.transaction
253
+
254
def end_transaction(self):
    """Finish write transaction, non-context version"""
    self.transaction.complete()
    self._transaction = None
    # Cache invalidations deferred during the transaction are applied
    # only once it has fully completed.
    pending = self._invalidated_caches_in_transaction
    for path in pending:
        self.invalidate_cache(path)
    pending.clear()
262
+
263
def invalidate_cache(self, path=None):
    """
    Discard any cached directory information

    Parameters
    ----------
    path: string or None
        If None, clear all listings cached else listings at or under given
        path.
    """
    # The base class holds no listing cache; subclasses that do should
    # call super().invalidate_cache() so that invalidations issued inside
    # a transaction are replayed when it completes (see end_transaction,
    # and the FTPFileSystem implementation in ftp.py).
    if self._intrans:
        self._invalidated_caches_in_transaction.append(path)
279
+
280
def mkdir(self, path, create_parents=True, **kwargs):
    """
    Create directory entry at path

    For systems that don't have true directories, may create an entry for
    this instance only and not touch the real filesystem

    Parameters
    ----------
    path: str
        location
    create_parents: bool
        if True, this is equivalent to ``makedirs``
    kwargs:
        may be permissions, etc.
    """
    # Intentionally a no-op: backends without real directories need not
    # implement this.
    pass
297
+
298
def makedirs(self, path, exist_ok=False):
    """Recursively make directories

    Creates directory at path and any intervening required directories.
    Raises exception if, for instance, the path already exists but is a
    file.

    Parameters
    ----------
    path: str
        leaf directory name
    exist_ok: bool (False)
        If False, will error if the target already exists
    """
    # Intentionally a no-op: backends without real directories need not
    # implement this.
    pass
313
+
314
def rmdir(self, path):
    """Remove a directory, if empty"""
    # Intentionally a no-op: backends without real directories need not
    # implement this.
    pass
317
+
318
def ls(self, path, detail=True, **kwargs):
    """List objects at path.

    This should include subdirectories and files at that location. The
    difference between a file and a directory must be clear when details
    are requested.

    The specific keys, or perhaps a FileInfo class, or similar, is TBD,
    but must be consistent across implementations.
    Must include:

    - full path to the entry (without protocol)
    - size of the entry, in bytes. If the value cannot be determined, will
      be ``None``.
    - type of entry, "file", "directory" or other

    Additional information
    may be present, appropriate to the file-system, e.g., generation,
    checksum, etc.

    May use refresh=True|False to allow use of self._ls_from_cache to
    check for a saved listing and avoid calling the backend. This would be
    common where listing may be expensive.

    Parameters
    ----------
    path: str
    detail: bool
        if True, gives a list of dictionaries, where each is the same as
        the result of ``info(path)``. If False, gives a list of paths
        (str).
    kwargs: may have additional backend-specific options, such as version
        information

    Returns
    -------
    List of strings if detail is False, or list of directory information
    dicts if detail is True.
    """
    # Abstract: every concrete filesystem must provide a listing.
    raise NotImplementedError
358
+
359
+ def _ls_from_cache(self, path):
360
+ """Check cache for listing
361
+
362
+ Returns listing, if found (may be empty list for a directly that exists
363
+ but contains nothing), None if not in cache.
364
+ """
365
+ parent = self._parent(path)
366
+ try:
367
+ return self.dircache[path.rstrip("/")]
368
+ except KeyError:
369
+ pass
370
+ try:
371
+ files = [
372
+ f
373
+ for f in self.dircache[parent]
374
+ if f["name"] == path
375
+ or (f["name"] == path.rstrip("/") and f["type"] == "directory")
376
+ ]
377
+ if len(files) == 0:
378
+ # parent dir was listed but did not contain this file
379
+ raise FileNotFoundError(path)
380
+ return files
381
+ except KeyError:
382
+ pass
383
+
384
def walk(self, path, maxdepth=None, topdown=True, on_error="omit", **kwargs):
    """Return all files under the given path.

    List all files, recursing into subdirectories; output is iterator-style,
    like ``os.walk()``. For a simple list of files, ``find()`` is available.

    When topdown is True, the caller can modify the dirnames list in-place (perhaps
    using del or slice assignment), and walk() will
    only recurse into the subdirectories whose names remain in dirnames;
    this can be used to prune the search, impose a specific order of visiting,
    or even to inform walk() about directories the caller creates or renames before
    it resumes walk() again.
    Modifying dirnames when topdown is False has no effect. (see os.walk)

    Note that the "files" outputted will include anything that is not
    a directory, such as links.

    Parameters
    ----------
    path: str
        Root to recurse into
    maxdepth: int
        Maximum recursion depth. None means limitless, but not recommended
        on link-based file-systems.
    topdown: bool (True)
        Whether to walk the directory tree from the top downwards or from
        the bottom upwards.
    on_error: "omit", "raise", a callable
        if omit (default), path with exception will simply be empty;
        If raise, an underlying exception will be raised;
        if callable, it will be called with a single OSError instance as argument
    kwargs: passed to ``ls``
    """
    if maxdepth is not None and maxdepth < 1:
        raise ValueError("maxdepth must be at least 1")

    path = self._strip_protocol(path)
    full_dirs = {}  # basename -> full path, used for recursion targets
    dirs = {}  # basename -> info for directories at this level
    files = {}  # basename -> info for non-directories at this level

    detail = kwargs.pop("detail", False)
    try:
        listing = self.ls(path, detail=True, **kwargs)
    except (FileNotFoundError, OSError) as e:
        if on_error == "raise":
            raise
        if callable(on_error):
            on_error(e)
        # "omit": unreadable path simply yields nothing
        return

    for info in listing:
        # each info name must be at least [path]/part , but here
        # we check also for names like [path]/part/
        pathname = info["name"].rstrip("/")
        name = pathname.rsplit("/", 1)[-1]
        if info["type"] == "directory" and pathname != path:
            # do not include "self" path
            full_dirs[name] = pathname
            dirs[name] = info
        elif pathname == path:
            # file-like with same name as give path
            files[""] = info
        else:
            files[name] = info

    if not detail:
        # callers get (mutable) lists of names; for topdown walks the dirs
        # list may be pruned in place to limit recursion (os.walk contract)
        dirs = list(dirs)
        files = list(files)

    if topdown:
        # Yield before recursion if walking top down
        yield path, dirs, files

    if maxdepth is not None:
        maxdepth -= 1
        if maxdepth < 1:
            if not topdown:
                yield path, dirs, files
            return

    for d in dirs:
        yield from self.walk(
            full_dirs[d],
            maxdepth=maxdepth,
            detail=detail,
            topdown=topdown,
            **kwargs,
        )

    if not topdown:
        # Yield after recursion if walking bottom up
        yield path, dirs, files
477
+
478
def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
    """List all files below path.

    Like posix ``find`` command without conditions

    Parameters
    ----------
    path : str
    maxdepth: int or None
        If not None, the maximum number of levels to descend
    withdirs: bool
        Whether to include directory paths in the output. This is True
        when used by glob, but users usually only want files.
    detail: bool
        If True, return a ``{name: info}`` dict instead of a sorted list
        of names.
    kwargs are passed to ``ls``.
    """
    # TODO: allow equivalent of -name parameter
    path = self._strip_protocol(path)
    out = {}

    # Add the root directory if withdirs is requested
    # This is needed for posix glob compliance
    if withdirs and path != "" and self.isdir(path):
        out[path] = self.info(path)

    for _, dirs, files in self.walk(path, maxdepth, detail=True, **kwargs):
        if withdirs:
            files.update(dirs)
        out.update({info["name"]: info for name, info in files.items()})
    if not out and self.isfile(path):
        # walk works on directories, but find should also return [path]
        # when path happens to be a file
        out[path] = {}
    names = sorted(out)
    if not detail:
        return names
    else:
        return {name: out[name] for name in names}
515
+
516
def du(self, path, total=True, maxdepth=None, withdirs=False, **kwargs):
    """Space used by files and optionally directories within a path

    Directory size does not include the size of its contents.

    Parameters
    ----------
    path: str
    total: bool
        Whether to sum all the file sizes
    maxdepth: int or None
        Maximum number of directory levels to descend, None for unlimited.
    withdirs: bool
        Whether to include directory paths in the output.
    kwargs: passed to ``find``

    Returns
    -------
    Dict of {path: size} if total=False, or int otherwise, where numbers
    refer to bytes used.
    """
    sizes = {}
    if withdirs and self.isdir(path):
        # Include top-level directory in output
        top = self.info(path)
        sizes[top["name"]] = top["size"]
    for fname in self.find(path, maxdepth=maxdepth, withdirs=withdirs, **kwargs):
        entry = self.info(fname)
        sizes[entry["name"]] = entry["size"]
    return sum(sizes.values()) if total else sizes
549
+
550
def glob(self, path, maxdepth=None, **kwargs):
    """
    Find files by glob-matching.

    If the path ends with '/', only folders are returned.

    We support ``"**"``,
    ``"?"`` and ``"[..]"``. We do not support ^ for pattern negation.

    The `maxdepth` option is applied on the first `**` found in the path.

    kwargs are passed to ``ls``.
    """
    if maxdepth is not None and maxdepth < 1:
        raise ValueError("maxdepth must be at least 1")

    import re

    seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
    ends_with_sep = path.endswith(seps)  # _strip_protocol strips trailing slash
    path = self._strip_protocol(path)
    # A pattern ending in "/" or "/**" should only match directories; we
    # emulate that by matching dirnames with a trailing slash appended.
    append_slash_to_dirname = ends_with_sep or path.endswith(
        tuple(sep + "**" for sep in seps)
    )
    # Position of the first glob metacharacter (len(path) if absent).
    idx_star = path.find("*") if path.find("*") >= 0 else len(path)
    idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
    idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

    min_idx = min(idx_star, idx_qmark, idx_brace)

    detail = kwargs.pop("detail", False)

    if not has_magic(path):
        # Literal path: existence check only.
        if self.exists(path, **kwargs):
            if not detail:
                return [path]
            else:
                return {path: self.info(path, **kwargs)}
        else:
            if not detail:
                return []  # glob of non-existent returns empty
            else:
                return {}
    elif "/" in path[:min_idx]:
        # Walk from the deepest literal directory before the first wildcard.
        min_idx = path[:min_idx].rindex("/")
        root = path[: min_idx + 1]
        depth = path[min_idx + 1 :].count("/") + 1
    else:
        root = ""
        depth = path[min_idx + 1 :].count("/") + 1

    if "**" in path:
        if maxdepth is not None:
            # maxdepth applies from the first "**"; rescale the find depth
            idx_double_stars = path.find("**")
            depth_double_stars = path[idx_double_stars:].count("/") + 1
            depth = depth - depth_double_stars + maxdepth
        else:
            depth = None

    allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)

    pattern = glob_translate(path + ("/" if ends_with_sep else ""))
    pattern = re.compile(pattern)

    out = {
        p: info
        for p, info in sorted(allpaths.items())
        if pattern.match(
            p + "/"
            if append_slash_to_dirname and info["type"] == "directory"
            else p
        )
    }

    if detail:
        return out
    else:
        return list(out)
628
+
629
def exists(self, path, **kwargs):
    """Is there a file at the given path"""
    try:
        self.info(path, **kwargs)
    except:  # noqa: E722
        # We cannot know every exception type a backend's info() may raise,
        # so any failure is treated as "does not exist".
        return False
    return True
637
+
638
def lexists(self, path, **kwargs):
    """If there is a file at the given path (including
    broken links)

    Fix: ``**kwargs`` were previously accepted but silently discarded;
    they are now forwarded to ``exists`` (and on to ``info``), matching
    the behavior of the other query methods.
    """
    return self.exists(path, **kwargs)
642
+
643
def info(self, path, **kwargs):
    """Give details of entry at path

    Returns a single dictionary, with exactly the same information as ``ls``
    would with ``detail=True``.

    The default implementation calls ls and could be overridden by a
    shortcut. kwargs are passed on to ```ls()``.

    Some file systems might not be able to measure the file's size, in
    which case, the returned dict will include ``'size': None``.

    Returns
    -------
    dict with keys: name (full path in the FS), size (in bytes), type (file,
    directory, or something else) and other FS-specific keys.
    """
    path = self._strip_protocol(path)
    # First look for the entry in a listing of its parent directory.
    out = self.ls(self._parent(path), detail=True, **kwargs)
    out = [o for o in out if o["name"].rstrip("/") == path]
    if out:
        return out[0]
    # Fall back to listing the path itself.
    out = self.ls(path, detail=True, **kwargs)
    path = path.rstrip("/")
    out1 = [o for o in out if o["name"].rstrip("/") == path]
    if len(out1) == 1:
        if "size" not in out1[0]:
            out1[0]["size"] = None
        return out1[0]
    elif len(out1) > 1 or out:
        # Non-empty listing that is not just the path itself: treat the
        # path as a directory.
        return {"name": path, "size": 0, "type": "directory"}
    else:
        raise FileNotFoundError(path)
676
+
677
def checksum(self, path):
    """Unique value for current version of file

    If the checksum is the same from one moment to another, the contents
    are guaranteed to be the same. If the checksum changes, the contents
    *might* have changed.

    This should normally be overridden; default will probably capture
    creation/modification timestamp (which would be good) or maybe
    access timestamp (which would be bad)
    """
    entry = self.info(path)
    return int(tokenize(entry), 16)
689
+
690
def size(self, path):
    """Size in bytes of file"""
    # info() may omit "size"; report None in that case
    return self.info(path).get("size")
693
+
694
def sizes(self, paths):
    """Size in bytes of each file in a list of paths"""
    return list(map(self.size, paths))
697
+
698
def isdir(self, path):
    """Is this entry directory-like?"""
    try:
        entry = self.info(path)
    except OSError:
        # missing / inaccessible entries are simply not directories
        return False
    return entry["type"] == "directory"
704
+
705
def isfile(self, path):
    """Is this entry file-like?"""
    # Both the lookup and the "type" access stay inside the guard, so a
    # malformed info dict also yields False (as in other query helpers).
    try:
        result = self.info(path)["type"] == "file"
    except:  # noqa: E722
        result = False
    return result
711
+
712
def read_text(self, path, encoding=None, errors=None, newline=None, **kwargs):
    """Get the contents of the file as a string.

    Parameters
    ----------
    path: str
        URL of file on this filesystems
    encoding, errors, newline: same as `open`.
    """
    text_opts = dict(encoding=encoding, errors=errors, newline=newline)
    with self.open(path, mode="r", **text_opts, **kwargs) as f:
        return f.read()
730
+
731
def write_text(
    self, path, value, encoding=None, errors=None, newline=None, **kwargs
):
    """Write the text to the given file.

    An existing file will be overwritten.

    Parameters
    ----------
    path: str
        URL of file on this filesystems
    value: str
        Text to write.
    encoding, errors, newline: same as `open`.
    """
    text_opts = dict(encoding=encoding, errors=errors, newline=newline)
    with self.open(path, mode="w", **text_opts, **kwargs) as f:
        return f.write(value)
755
+
756
def cat_file(self, path, start=None, end=None, **kwargs):
    """Get the content of a file

    Parameters
    ----------
    path: URL of file on this filesystems
    start, end: int
        Bytes limits of the read. If negative, backwards from end,
        like usual python slices. Either can be None for start or
        end of file, respectively
    kwargs: passed to ``open()``.
    """
    # explicitly set buffering off?
    with self.open(path, "rb", **kwargs) as f:
        if start is not None:
            # a negative start counts backwards from the end of the file,
            # clamped so we never seek before position 0
            f.seek(start if start >= 0 else max(0, f.size + start))
        if end is None:
            return f.read()
        if end < 0:
            end = f.size + end
        return f.read(end - f.tell())
780
+
781
def pipe_file(self, path, value, mode="overwrite", **kwargs):
    """Set the bytes of given file"""
    if mode == "create":
        # non-atomic but simple way; or could use "xb" in open(), which is
        # likely not as well supported across backends
        if self.exists(path):
            raise FileExistsError
    with self.open(path, "wb", **kwargs) as f:
        f.write(value)
789
+
790
def pipe(self, path, value=None, **kwargs):
    """Put value into path

    (counterpart to ``cat``)

    Parameters
    ----------
    path: string or dict(str, bytes)
        If a string, a single remote location to put ``value`` bytes; if a dict,
        a mapping of {path: bytesvalue}.
    value: bytes, optional
        If using a single path, these are the bytes to put there. Ignored if
        ``path`` is a dict
    """
    if isinstance(path, dict):
        for target, data in path.items():
            self.pipe_file(self._strip_protocol(target), data, **kwargs)
    elif isinstance(path, str):
        self.pipe_file(self._strip_protocol(path), value, **kwargs)
    else:
        raise ValueError("path must be str or dict")
811
+
812
def cat_ranges(
    self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
):
    """Get the contents of byte ranges from one or more files

    Parameters
    ----------
    paths: list
        A list of filepaths on this filesystems
    starts, ends: int or list
        Bytes limits of the read. If using a single int, the same value will be
        used to read all the specified files.
    """
    if max_gap is not None:
        raise NotImplementedError
    if not isinstance(paths, list):
        raise TypeError
    n = len(paths)
    # scalar limits are broadcast across all paths
    if not isinstance(starts, list):
        starts = [starts] * n
    if not isinstance(ends, list):
        ends = [ends] * n
    if len(starts) != n or len(ends) != n:
        raise ValueError
    result = []
    for p, s, e in zip(paths, starts, ends):
        try:
            result.append(self.cat_file(p, s, e))
        except Exception as exc:
            if on_error != "return":
                raise
            # "return": embed the exception instance in the output
            result.append(exc)
    return result
845
+
846
def cat(self, path, recursive=False, on_error="raise", **kwargs):
    """Fetch (potentially multiple) paths' contents

    Parameters
    ----------
    recursive: bool
        If True, assume the path(s) are directories, and get all the
        contained files
    on_error : "raise", "omit", "return"
        If raise, an underlying exception will be raised (converted to KeyError
        if the type is in self.missing_exceptions); if omit, keys with exception
        will simply not be included in the output; if "return", all keys are
        included in the output, but the value will be bytes or an exception
        instance.
    kwargs: passed to cat_file

    Returns
    -------
    dict of {path: contents} if there are multiple paths
    or the path has been otherwise expanded
    """
    paths = self.expand_path(path, recursive=recursive)
    if (
        len(paths) > 1
        or isinstance(path, list)
        or paths[0] != self._strip_protocol(path)
    ):
        # multiple targets (or an expanded single target): return a mapping
        out = {}
        for path in paths:
            try:
                out[path] = self.cat_file(path, **kwargs)
            except Exception as e:
                if on_error == "raise":
                    raise
                if on_error == "return":
                    out[path] = e
                # "omit": key simply left out of the result
        return out
    else:
        # exactly one literal path: return the bytes directly
        return self.cat_file(paths[0], **kwargs)
885
+
886
def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, outfile=None, **kwargs):
    """Copy single remote file to local"""
    from .implementations.local import LocalFileSystem

    if isfilelike(lpath):
        # caller supplied an open file-like object rather than a path
        outfile = lpath
    elif self.isdir(rpath):
        # remote directory: just mirror it locally, nothing to download
        os.makedirs(lpath, exist_ok=True)
        return None

    fs = LocalFileSystem(auto_mkdir=True)
    fs.makedirs(fs._parent(lpath), exist_ok=True)

    with self.open(rpath, "rb", **kwargs) as f1:
        if outfile is None:
            outfile = open(lpath, "wb")

        try:
            callback.set_size(getattr(f1, "size", None))
            # chunked copy loop; stops on the first empty read
            data = True
            while data:
                data = f1.read(self.blocksize)
                segment_len = outfile.write(data)
                if segment_len is None:
                    # some file-likes return None from write()
                    segment_len = len(data)
                callback.relative_update(segment_len)
        finally:
            # only close handles we opened ourselves
            if not isfilelike(lpath):
                outfile.close()
915
+
916
def get(
    self,
    rpath,
    lpath,
    recursive=False,
    callback=DEFAULT_CALLBACK,
    maxdepth=None,
    **kwargs,
):
    """Copy file(s) to local.

    Copies a specific file or tree of files (if recursive=True). If lpath
    ends with a "/", it will be assumed to be a directory, and target files
    will go within. Can submit a list of paths, which may be glob-patterns
    and will be expanded.

    Calls get_file for each source.
    """
    if isinstance(lpath, list) and isinstance(rpath, list):
        # No need to expand paths when both source and destination
        # are provided as lists
        rpaths = rpath
        lpaths = lpath
    else:
        from .implementations.local import (
            LocalFileSystem,
            make_path_posix,
            trailing_sep,
        )

        source_is_str = isinstance(rpath, str)
        rpaths = self.expand_path(rpath, recursive=recursive, maxdepth=maxdepth)
        if source_is_str and (not recursive or maxdepth is not None):
            # Non-recursive glob does not copy directories
            rpaths = [p for p in rpaths if not (trailing_sep(p) or self.isdir(p))]
            if not rpaths:
                return

        if isinstance(lpath, str):
            lpath = make_path_posix(lpath)

        source_is_file = len(rpaths) == 1
        dest_is_dir = isinstance(lpath, str) and (
            trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
        )

        # whether destination names should be treated as already existing
        # directories when pairing sources to targets
        exists = source_is_str and (
            (has_magic(rpath) and source_is_file)
            or (not has_magic(rpath) and dest_is_dir and not trailing_sep(rpath))
        )
        lpaths = other_paths(
            rpaths,
            lpath,
            exists=exists,
            flatten=not source_is_str,
        )

    callback.set_size(len(lpaths))
    for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
        # per-file progress is reported through a branched child callback
        with callback.branched(rpath, lpath) as child:
            self.get_file(rpath, lpath, callback=child, **kwargs)
977
+
978
def put_file(
    self, lpath, rpath, callback=DEFAULT_CALLBACK, mode="overwrite", **kwargs
):
    """Copy single file to remote"""
    # "create" refuses to clobber an existing remote file (check-then-write,
    # so not atomic, but portable across backends)
    if mode == "create" and self.exists(rpath):
        raise FileExistsError
    if os.path.isdir(lpath):
        # local directory: mirror it remotely, nothing to upload
        self.makedirs(rpath, exist_ok=True)
        return None

    with open(lpath, "rb") as f1:
        # seek-to-end gives the total size for progress reporting
        size = f1.seek(0, 2)
        callback.set_size(size)
        f1.seek(0)

        self.mkdirs(self._parent(os.fspath(rpath)), exist_ok=True)
        with self.open(rpath, "wb", **kwargs) as f2:
            while f1.tell() < size:
                data = f1.read(self.blocksize)
                segment_len = f2.write(data)
                if segment_len is None:
                    # some file-likes return None from write()
                    segment_len = len(data)
                callback.relative_update(segment_len)
1001
+
1002
def put(
    self,
    lpath,
    rpath,
    recursive=False,
    callback=DEFAULT_CALLBACK,
    maxdepth=None,
    **kwargs,
):
    """Copy file(s) from local.

    Copies a specific file or tree of files (if recursive=True). If rpath
    ends with a "/", it will be assumed to be a directory, and target files
    will go within.

    Calls put_file for each source.
    """
    if isinstance(lpath, list) and isinstance(rpath, list):
        # No need to expand paths when both source and destination
        # are provided as lists
        rpaths = rpath
        lpaths = lpath
    else:
        from .implementations.local import (
            LocalFileSystem,
            make_path_posix,
            trailing_sep,
        )

        source_is_str = isinstance(lpath, str)
        if source_is_str:
            lpath = make_path_posix(lpath)
        fs = LocalFileSystem()
        lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
        if source_is_str and (not recursive or maxdepth is not None):
            # Non-recursive glob does not copy directories
            lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
            if not lpaths:
                return

        source_is_file = len(lpaths) == 1
        dest_is_dir = isinstance(rpath, str) and (
            trailing_sep(rpath) or self.isdir(rpath)
        )

        rpath = (
            self._strip_protocol(rpath)
            if isinstance(rpath, str)
            else [self._strip_protocol(p) for p in rpath]
        )
        # whether destination names should be treated as already existing
        # directories when pairing sources to targets
        exists = source_is_str and (
            (has_magic(lpath) and source_is_file)
            or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
        )
        rpaths = other_paths(
            lpaths,
            rpath,
            exists=exists,
            flatten=not source_is_str,
        )

    callback.set_size(len(rpaths))
    for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
        # per-file progress is reported through a branched child callback
        with callback.branched(lpath, rpath) as child:
            self.put_file(lpath, rpath, callback=child, **kwargs)
1067
+
1068
def head(self, path, size=1024):
    """Get the first ``size`` bytes from file"""
    with self.open(path, "rb") as f:
        data = f.read(size)
    return data
1072
+
1073
def tail(self, path, size=1024):
    """Get the last ``size`` bytes from file"""
    with self.open(path, "rb") as f:
        # seek back from the end, but never before the start of the file
        f.seek(-min(size, f.size), 2)
        return f.read()
1078
+
1079
def cp_file(self, path1, path2, **kwargs):
    """Copy a single file within this filesystem (abstract)."""
    raise NotImplementedError
1081
+
1082
def copy(
    self, path1, path2, recursive=False, maxdepth=None, on_error=None, **kwargs
):
    """Copy within two locations in the filesystem

    on_error : "raise", "ignore"
        If raise, any not-found exceptions will be raised; if ignore any
        not-found exceptions will cause the path to be skipped; defaults to
        raise unless recursive is true, where the default is ignore
    """
    if on_error is None and recursive:
        on_error = "ignore"
    elif on_error is None:
        on_error = "raise"

    if isinstance(path1, list) and isinstance(path2, list):
        # No need to expand paths when both source and destination
        # are provided as lists
        paths1 = path1
        paths2 = path2
    else:
        from .implementations.local import trailing_sep

        source_is_str = isinstance(path1, str)
        paths1 = self.expand_path(path1, recursive=recursive, maxdepth=maxdepth)
        if source_is_str and (not recursive or maxdepth is not None):
            # Non-recursive glob does not copy directories
            paths1 = [p for p in paths1 if not (trailing_sep(p) or self.isdir(p))]
            if not paths1:
                return

        source_is_file = len(paths1) == 1
        dest_is_dir = isinstance(path2, str) and (
            trailing_sep(path2) or self.isdir(path2)
        )

        # whether destination names should be treated as already existing
        # directories when pairing sources to targets
        exists = source_is_str and (
            (has_magic(path1) and source_is_file)
            or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
        )
        paths2 = other_paths(
            paths1,
            path2,
            exists=exists,
            flatten=not source_is_str,
        )

    for p1, p2 in zip(paths1, paths2):
        try:
            self.cp_file(p1, p2, **kwargs)
        except FileNotFoundError:
            if on_error == "raise":
                raise
            # "ignore": skip sources that vanished during the copy
1135
+
1136
def expand_path(self, path, recursive=False, maxdepth=None, **kwargs):
    """Turn one or more globs or directories into a list of all matching paths
    to files or directories.

    kwargs are passed to ``glob`` or ``find``, which may in turn call ``ls``
    """

    if maxdepth is not None and maxdepth < 1:
        raise ValueError("maxdepth must be at least 1")

    if isinstance(path, (str, os.PathLike)):
        # normalize the scalar case to the list case
        out = self.expand_path([path], recursive, maxdepth)
    else:
        out = set()
        path = [self._strip_protocol(p) for p in path]
        for p in path:
            if has_magic(p):
                bit = set(self.glob(p, maxdepth=maxdepth, **kwargs))
                out |= bit
                if recursive:
                    # glob call above expanded one depth so if maxdepth is defined
                    # then decrement it in expand_path call below. If it is zero
                    # after decrementing then avoid expand_path call.
                    if maxdepth is not None and maxdepth <= 1:
                        continue
                    out |= set(
                        self.expand_path(
                            list(bit),
                            recursive=recursive,
                            maxdepth=maxdepth - 1 if maxdepth is not None else None,
                            **kwargs,
                        )
                    )
                continue
            elif recursive:
                rec = set(
                    self.find(
                        p, maxdepth=maxdepth, withdirs=True, detail=False, **kwargs
                    )
                )
                out |= rec
            if p not in out and (recursive is False or self.exists(p)):
                # should only check once, for the root
                out.add(p)
    if not out:
        raise FileNotFoundError(path)
    return sorted(out)
1183
+
1184
def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
    """Move file(s) from one location to another

    Implemented as copy-then-delete; the copy is forced to raise on any
    not-found error so the source is never removed after a partial copy.

    Fix: this previously passed ``onerror="raise"``, but ``copy``'s
    parameter is named ``on_error`` — the typo was swallowed by
    ``**kwargs`` and forwarded to ``cp_file``, so recursive moves silently
    ran with the default ``on_error="ignore"``, defeating the intent.
    """
    if path1 == path2:
        logger.debug("%s mv: The paths are the same, so no files were moved.", self)
    else:
        # explicitly raise exception to prevent data corruption
        self.copy(
            path1, path2, recursive=recursive, maxdepth=maxdepth, on_error="raise"
        )
        self.rm(path1, recursive=recursive)
1194
+
1195
+ def rm_file(self, path):
1196
+ """Delete a file"""
1197
+ self._rm(path)
1198
+
1199
+ def _rm(self, path):
1200
+ """Delete one file"""
1201
+ # this is the old name for the method, prefer rm_file
1202
+ raise NotImplementedError
1203
+
1204
+ def rm(self, path, recursive=False, maxdepth=None):
1205
+ """Delete files.
1206
+
1207
+ Parameters
1208
+ ----------
1209
+ path: str or list of str
1210
+ File(s) to delete.
1211
+ recursive: bool
1212
+ If file(s) are directories, recursively delete contents and then
1213
+ also remove the directory
1214
+ maxdepth: int or None
1215
+ Depth to pass to walk for finding files to delete, if recursive.
1216
+ If None, there will be no limit and infinite recursion may be
1217
+ possible.
1218
+ """
1219
+ path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
1220
+ for p in reversed(path):
1221
+ self.rm_file(p)
1222
+
1223
+ @classmethod
1224
+ def _parent(cls, path):
1225
+ path = cls._strip_protocol(path)
1226
+ if "/" in path:
1227
+ parent = path.rsplit("/", 1)[0].lstrip(cls.root_marker)
1228
+ return cls.root_marker + parent
1229
+ else:
1230
+ return cls.root_marker
1231
+
1232
    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Return raw bytes-mode file-like from the file-system

        Default implementation builds an ``AbstractBufferedFile`` over this
        filesystem; backends usually override this with a native file object.
        """
        return AbstractBufferedFile(
            self,
            path,
            mode,
            block_size,
            autocommit,
            cache_options=cache_options,
            **kwargs,
        )
1251
+
1252
    def open(
        self,
        path,
        mode="rb",
        block_size=None,
        cache_options=None,
        compression=None,
        **kwargs,
    ):
        """
        Return a file-like object from the filesystem

        The resultant instance must function correctly in a context ``with``
        block.

        Parameters
        ----------
        path: str
            Target file
        mode: str like 'rb', 'w'
            See builtin ``open()``
            Mode "x" (exclusive write) may be implemented by the backend. Even if
            it is, whether it is checked up front or on commit, and whether it is
            atomic is implementation-dependent.
        block_size: int
            Some indication of buffering - this is a value in bytes
        cache_options : dict, optional
            Extra arguments to pass through to the cache.
        compression: string or None
            If given, open file using compression codec. Can either be a compression
            name (a key in ``fsspec.compression.compr``) or "infer" to guess the
            compression from the filename suffix.
        encoding, errors, newline: passed on to TextIOWrapper for text mode
        """
        import io

        path = self._strip_protocol(path)
        if "b" not in mode:
            # text mode: open the equivalent binary file and wrap it in a
            # TextIOWrapper carrying the text-specific keyword arguments
            mode = mode.replace("t", "") + "b"

            text_kwargs = {
                k: kwargs.pop(k)
                for k in ["encoding", "errors", "newline"]
                if k in kwargs
            }
            return io.TextIOWrapper(
                self.open(
                    path,
                    mode,
                    block_size=block_size,
                    cache_options=cache_options,
                    compression=compression,
                    **kwargs,
                ),
                **text_kwargs,
            )
        else:
            # inside a transaction, files default to deferred commit
            ac = kwargs.pop("autocommit", not self._intrans)
            f = self._open(
                path,
                mode=mode,
                block_size=block_size,
                autocommit=ac,
                cache_options=cache_options,
                **kwargs,
            )
            if compression is not None:
                from fsspec.compression import compr
                from fsspec.core import get_compression

                # "infer" resolves the codec from the filename suffix
                compression = get_compression(path, compression)
                compress = compr[compression]
                f = compress(f, mode=mode[0])

            if not ac and "r" not in mode:
                # register write handle so the transaction can commit/discard it
                self.transaction.files.append(f)
            return f
1329
+
1330
+ def touch(self, path, truncate=True, **kwargs):
1331
+ """Create empty file, or update timestamp
1332
+
1333
+ Parameters
1334
+ ----------
1335
+ path: str
1336
+ file location
1337
+ truncate: bool
1338
+ If True, always set file size to 0; if False, update timestamp and
1339
+ leave file unchanged, if backend allows this
1340
+ """
1341
+ if truncate or not self.exists(path):
1342
+ with self.open(path, "wb", **kwargs):
1343
+ pass
1344
+ else:
1345
+ raise NotImplementedError # update timestamp, if possible
1346
+
1347
+ def ukey(self, path):
1348
+ """Hash of file properties, to tell if it has changed"""
1349
+ return sha256(str(self.info(path)).encode()).hexdigest()
1350
+
1351
    def read_block(self, fn, offset, length, delimiter=None):
        """Read a block of bytes from

        Starting at ``offset`` of the file, read ``length`` bytes. If
        ``delimiter`` is set then we ensure that the read starts and stops at
        delimiter boundaries that follow the locations ``offset`` and ``offset
        + length``. If ``offset`` is zero then we start at zero. The
        bytestring returned WILL include the end delimiter string.

        If offset+length is beyond the eof, reads to eof.

        Parameters
        ----------
        fn: string
            Path to filename
        offset: int
            Byte offset to start read
        length: int
            Number of bytes to read. If None, read to end.
        delimiter: bytes (optional)
            Ensure reading starts and stops at delimiter bytestring

        Examples
        --------
        >>> fs.read_block('data/file.csv', 0, 13)  # doctest: +SKIP
        b'Alice, 100\\nBo'
        >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\n'

        Use ``length=None`` to read to the end of the file.
        >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\nCharlie, 300'

        See Also
        --------
        :func:`fsspec.utils.read_block`
        """
        with self.open(fn, "rb") as f:
            size = f.size
            if length is None:
                length = size
            if size is not None and offset + length > size:
                # clamp so we never request bytes past end-of-file
                length = size - offset
            # delimiter-aware logic lives in the fsspec.utils.read_block helper
            return read_block(f, offset, length, delimiter)
1395
+
1396
    def to_json(self, *, include_password: bool = True) -> str:
        """
        JSON representation of this filesystem instance.

        Parameters
        ----------
        include_password: bool, default True
            Whether to include the password (if any) in the output.

        Returns
        -------
        JSON string with keys ``cls`` (the python location of this class),
        protocol (text name of this class's protocol, first one in case of
        multiple), ``args`` (positional args, usually empty), and all other
        keyword arguments as their own keys.

        Warnings
        --------
        Serialized filesystems may contain sensitive information which have been
        passed to the constructor, such as passwords and tokens. Make sure you
        store and send them in a secure environment!
        """
        from .json import FilesystemJSONEncoder

        # build a throwaway encoder subclass so ``include_password`` can be
        # threaded through json.dumps' class-based ``cls`` interface
        return json.dumps(
            self,
            cls=type(
                "_FilesystemJSONEncoder",
                (FilesystemJSONEncoder,),
                {"include_password": include_password},
            ),
        )
1428
+
1429
+ @staticmethod
1430
+ def from_json(blob: str) -> AbstractFileSystem:
1431
+ """
1432
+ Recreate a filesystem instance from JSON representation.
1433
+
1434
+ See ``.to_json()`` for the expected structure of the input.
1435
+
1436
+ Parameters
1437
+ ----------
1438
+ blob: str
1439
+
1440
+ Returns
1441
+ -------
1442
+ file system instance, not necessarily of this particular class.
1443
+
1444
+ Warnings
1445
+ --------
1446
+ This can import arbitrary modules (as determined by the ``cls`` key).
1447
+ Make sure you haven't installed any modules that may execute malicious code
1448
+ at import time.
1449
+ """
1450
+ from .json import FilesystemJSONDecoder
1451
+
1452
+ return json.loads(blob, cls=FilesystemJSONDecoder)
1453
+
1454
    def to_dict(self, *, include_password: bool = True) -> dict[str, Any]:
        """
        JSON-serializable dictionary representation of this filesystem instance.

        Parameters
        ----------
        include_password: bool, default True
            Whether to include the password (if any) in the output.

        Returns
        -------
        Dictionary with keys ``cls`` (the python location of this class),
        protocol (text name of this class's protocol, first one in case of
        multiple), ``args`` (positional args, usually empty), and all other
        keyword arguments as their own keys.

        Warnings
        --------
        Serialized filesystems may contain sensitive information which have been
        passed to the constructor, such as passwords and tokens. Make sure you
        store and send them in a secure environment!
        """
        from .json import FilesystemJSONEncoder

        json_encoder = FilesystemJSONEncoder()

        cls = type(self)
        proto = self.protocol

        # copy so the instance's own storage_options are never mutated
        storage_options = dict(self.storage_options)
        if not include_password:
            storage_options.pop("password", None)

        return dict(
            cls=f"{cls.__module__}:{cls.__name__}",
            protocol=proto[0] if isinstance(proto, (tuple, list)) else proto,
            args=json_encoder.make_serializable(self.storage_args),
            **json_encoder.make_serializable(storage_options),
        )
1493
+
1494
    @staticmethod
    def from_dict(dct: dict[str, Any]) -> AbstractFileSystem:
        """
        Recreate a filesystem instance from dictionary representation.

        See ``.to_dict()`` for the expected structure of the input.

        Parameters
        ----------
        dct: Dict[str, Any]

        Returns
        -------
        file system instance, not necessarily of this particular class.

        Warnings
        --------
        This can import arbitrary modules (as determined by the ``cls`` key).
        Make sure you haven't installed any modules that may execute malicious code
        at import time.
        """
        from .json import FilesystemJSONDecoder

        json_decoder = FilesystemJSONDecoder()

        dct = dict(dct)  # Defensive copy

        # resolve the target class from the "cls" key (may import modules)
        cls = FilesystemJSONDecoder.try_resolve_fs_cls(dct)
        if cls is None:
            raise ValueError("Not a serialized AbstractFileSystem")

        # remaining keys become constructor kwargs
        dct.pop("cls", None)
        dct.pop("protocol", None)

        return cls(
            *json_decoder.unmake_serializable(dct.pop("args", ())),
            **json_decoder.unmake_serializable(dct),
        )
1532
+
1533
+ def _get_pyarrow_filesystem(self):
1534
+ """
1535
+ Make a version of the FS instance which will be acceptable to pyarrow
1536
+ """
1537
+ # all instances already also derive from pyarrow
1538
+ return self
1539
+
1540
+ def get_mapper(self, root="", check=False, create=False, missing_exceptions=None):
1541
+ """Create key/value store based on this file-system
1542
+
1543
+ Makes a MutableMapping interface to the FS at the given root path.
1544
+ See ``fsspec.mapping.FSMap`` for further details.
1545
+ """
1546
+ from .mapping import FSMap
1547
+
1548
+ return FSMap(
1549
+ root,
1550
+ self,
1551
+ check=check,
1552
+ create=create,
1553
+ missing_exceptions=missing_exceptions,
1554
+ )
1555
+
1556
    @classmethod
    def clear_instance_cache(cls):
        """
        Clear the cache of filesystem instances.

        Notes
        -----
        Unless overridden by setting the ``cachable`` class attribute to False,
        the filesystem class stores a reference to newly created instances. This
        prevents Python's normal rules around garbage collection from working,
        since the instances refcount will not drop to zero until
        ``clear_instance_cache`` is called.
        """
        # drop all cached instances so they can be garbage collected
        cls._cache.clear()
1570
+
1571
+ def created(self, path):
1572
+ """Return the created timestamp of a file as a datetime.datetime"""
1573
+ raise NotImplementedError
1574
+
1575
+ def modified(self, path):
1576
+ """Return the modified timestamp of a file as a datetime.datetime"""
1577
+ raise NotImplementedError
1578
+
1579
    def tree(
        self,
        path: str = "/",
        recursion_limit: int = 2,
        max_display: int = 25,
        display_size: bool = False,
        prefix: str = "",
        is_last: bool = True,
        first: bool = True,
        indent_size: int = 4,
    ) -> str:
        """
        Return a tree-like structure of the filesystem starting from the given path as a string.

        Parameters
        ----------
        path: Root path to start traversal from
        recursion_limit: Maximum depth of directory traversal
        max_display: Maximum number of items to display per directory
        display_size: Whether to display file sizes
        prefix: Current line prefix for visual tree structure
        is_last: Whether current item is last in its level
        first: Whether this is the first call (displays root path)
        indent_size: Number of spaces by indent

        Returns
        -------
        str: A string representing the tree structure.

        Example
        -------
        >>> from fsspec import filesystem

        >>> fs = filesystem('ftp', host='test.rebex.net', user='demo', password='password')
        >>> tree = fs.tree(display_size=True, recursion_limit=3, indent_size=8, max_display=10)
        >>> print(tree)
        """

        def format_bytes(n: int) -> str:
            """Format bytes as text."""
            # NOTE: loop variable deliberately local; it shadows the outer
            # ``prefix`` parameter only inside this helper
            for prefix, k in (
                ("P", 2**50),
                ("T", 2**40),
                ("G", 2**30),
                ("M", 2**20),
                ("k", 2**10),
            ):
                if n >= 0.9 * k:
                    return f"{n / k:.2f} {prefix}b"
            return f"{n}B"

        result = []

        if first:
            # only the outermost call prints the root path itself
            result.append(path)

        if recursion_limit:
            indent = " " * indent_size
            contents = self.ls(path, detail=True)
            # directories first, then alphabetical within each group
            contents.sort(
                key=lambda x: (x.get("type") != "directory", x.get("name", ""))
            )

            if max_display is not None and len(contents) > max_display:
                displayed_contents = contents[:max_display]
                remaining_count = len(contents) - max_display
            else:
                displayed_contents = contents
                remaining_count = 0

            for i, item in enumerate(displayed_contents):
                # the truncation notice (if any) counts as a trailing sibling
                is_last_item = (i == len(displayed_contents) - 1) and (
                    remaining_count == 0
                )

                branch = (
                    "└" + ("─" * (indent_size - 2))
                    if is_last_item
                    else "├" + ("─" * (indent_size - 2))
                )
                branch += " "
                new_prefix = prefix + (
                    indent if is_last_item else "│" + " " * (indent_size - 1)
                )

                name = os.path.basename(item.get("name", ""))

                if display_size and item.get("type") == "directory":
                    # summarise a directory as counts of files/subfolders
                    sub_contents = self.ls(item.get("name", ""), detail=True)
                    num_files = sum(
                        1 for sub_item in sub_contents if sub_item.get("type") == "file"
                    )
                    num_folders = sum(
                        1
                        for sub_item in sub_contents
                        if sub_item.get("type") == "directory"
                    )

                    if num_files == 0 and num_folders == 0:
                        size = " (empty folder)"
                    elif num_files == 0:
                        size = f" ({num_folders} subfolder{'s' if num_folders > 1 else ''})"
                    elif num_folders == 0:
                        size = f" ({num_files} file{'s' if num_files > 1 else ''})"
                    else:
                        size = f" ({num_files} file{'s' if num_files > 1 else ''}, {num_folders} subfolder{'s' if num_folders > 1 else ''})"
                elif display_size and item.get("type") == "file":
                    size = f" ({format_bytes(item.get('size', 0))})"
                else:
                    size = ""

                result.append(f"{prefix}{branch}{name}{size}")

                if item.get("type") == "directory" and recursion_limit > 0:
                    # recurse with one less level of depth allowed
                    result.append(
                        self.tree(
                            path=item.get("name", ""),
                            recursion_limit=recursion_limit - 1,
                            max_display=max_display,
                            display_size=display_size,
                            prefix=new_prefix,
                            is_last=is_last_item,
                            first=False,
                            indent_size=indent_size,
                        )
                    )

            if remaining_count > 0:
                more_message = f"{remaining_count} more item(s) not displayed."
                result.append(
                    f"{prefix}{'└' + ('─' * (indent_size - 2))} {more_message}"
                )

        # drop empty strings produced by empty sub-trees
        return "\n".join(_ for _ in result if _)
1713
+
1714
    # ------------------------------------------------------------------------
    # Aliases
    #
    # Convenience names mirroring the primary API above; each simply forwards
    # to the canonical method of the same behavior.

    def read_bytes(self, path, start=None, end=None, **kwargs):
        """Alias of `AbstractFileSystem.cat_file`."""
        return self.cat_file(path, start=start, end=end, **kwargs)

    def write_bytes(self, path, value, **kwargs):
        """Alias of `AbstractFileSystem.pipe_file`."""
        self.pipe_file(path, value, **kwargs)

    def makedir(self, path, create_parents=True, **kwargs):
        """Alias of `AbstractFileSystem.mkdir`."""
        return self.mkdir(path, create_parents=create_parents, **kwargs)

    def mkdirs(self, path, exist_ok=False):
        """Alias of `AbstractFileSystem.makedirs`."""
        return self.makedirs(path, exist_ok=exist_ok)

    def listdir(self, path, detail=True, **kwargs):
        """Alias of `AbstractFileSystem.ls`."""
        return self.ls(path, detail=detail, **kwargs)

    def cp(self, path1, path2, **kwargs):
        """Alias of `AbstractFileSystem.copy`."""
        return self.copy(path1, path2, **kwargs)

    def move(self, path1, path2, **kwargs):
        """Alias of `AbstractFileSystem.mv`."""
        return self.mv(path1, path2, **kwargs)

    def stat(self, path, **kwargs):
        """Alias of `AbstractFileSystem.info`."""
        return self.info(path, **kwargs)

    def disk_usage(self, path, total=True, maxdepth=None, **kwargs):
        """Alias of `AbstractFileSystem.du`."""
        return self.du(path, total=total, maxdepth=maxdepth, **kwargs)

    def rename(self, path1, path2, **kwargs):
        """Alias of `AbstractFileSystem.mv`."""
        return self.mv(path1, path2, **kwargs)

    def delete(self, path, recursive=False, maxdepth=None):
        """Alias of `AbstractFileSystem.rm`."""
        return self.rm(path, recursive=recursive, maxdepth=maxdepth)

    def upload(self, lpath, rpath, recursive=False, **kwargs):
        """Alias of `AbstractFileSystem.put`."""
        return self.put(lpath, rpath, recursive=recursive, **kwargs)

    def download(self, rpath, lpath, recursive=False, **kwargs):
        """Alias of `AbstractFileSystem.get`."""
        return self.get(rpath, lpath, recursive=recursive, **kwargs)

    def sign(self, path, expiration=100, **kwargs):
        """Create a signed URL representing the given path

        Some implementations allow temporary URLs to be generated, as a
        way of delegating credentials.

        Parameters
        ----------
        path : str
            The path on the filesystem
        expiration : int
            Number of seconds to enable the URL for (if supported)

        Returns
        -------
        URL : str
            The signed URL

        Raises
        ------
        NotImplementedError : if method is not implemented for a filesystem
        """
        raise NotImplementedError("Sign is not implemented for this filesystem")

    def _isfilestore(self):
        # Originally inherited from pyarrow DaskFileSystem. Keeping this
        # here for backwards compatibility as long as pyarrow uses its
        # legacy fsspec-compatible filesystems and thus accepts fsspec
        # filesystems as well
        return False
1799
+
1800
+
1801
class AbstractBufferedFile(io.IOBase):
    """Convenient class to derive from to provide buffering

    In the case that the backend does not provide a pythonic file-like object
    already, this class contains much of the logic to build one. The only
    methods that need to be overridden are ``_upload_chunk``,
    ``_initiate_upload`` and ``_fetch_range``.
    """

    DEFAULT_BLOCK_SIZE = 5 * 2**20  # 5 MiB read/write buffer
    _details = None  # lazily-populated ``fs.info`` result; see ``details``

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        size=None,
        **kwargs,
    ):
        """
        Template for files with buffered reading and writing

        Parameters
        ----------
        fs: instance of FileSystem
        path: str
            location in file-system
        mode: str
            Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file
            systems may be read-only, and some may not support append.
        block_size: int
            Buffer size for reading or writing, 'default' for class default
        autocommit: bool
            Whether to write to final destination; may only impact what
            happens when file is being closed.
        cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead"
            Caching policy in read mode. See the definitions in ``core``.
        cache_options : dict
            Additional options passed to the constructor for the cache specified
            by `cache_type`.
        size: int
            If given and in read mode, suppressed having to look up the file size
        kwargs:
            Gets stored as self.kwargs
        """
        from .core import caches

        self.path = path
        self.fs = fs
        self.mode = mode
        self.blocksize = (
            self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size
        )
        self.loc = 0
        self.autocommit = autocommit
        self.end = None
        self.start = None
        self.closed = False

        if cache_options is None:
            cache_options = {}

        if "trim" in kwargs:
            warnings.warn(
                "Passing 'trim' to control the cache behavior has been deprecated. "
                "Specify it within the 'cache_options' argument instead.",
                FutureWarning,
            )
            cache_options["trim"] = kwargs.pop("trim")

        self.kwargs = kwargs

        if mode not in {"ab", "rb", "wb", "xb"}:
            raise NotImplementedError("File mode not supported")
        if mode == "rb":
            # read mode: need the file size and a block cache
            if size is not None:
                self.size = size
            else:
                self.size = self.details["size"]
            self.cache = caches[cache_type](
                self.blocksize, self._fetch_range, self.size, **cache_options
            )
        else:
            # write/append mode: accumulate into an in-memory buffer
            self.buffer = io.BytesIO()
            self.offset = None
            self.forced = False
            self.location = None

    @property
    def details(self):
        # lazily fetch and memoise the info dict for this path
        if self._details is None:
            self._details = self.fs.info(self.path)
        return self._details

    @details.setter
    def details(self, value):
        self._details = value
        self.size = value["size"]

    @property
    def full_name(self):
        # path re-qualified with the filesystem's protocol prefix
        return _unstrip_protocol(self.path, self.fs)

    @property
    def closed(self):
        # get around this attr being read-only in IOBase
        # use getattr here, since this can be called during del
        return getattr(self, "_closed", True)

    @closed.setter
    def closed(self, c):
        self._closed = c

    def __hash__(self):
        if "w" in self.mode:
            # write handles are only identical to themselves
            return id(self)
        else:
            return int(tokenize(self.details), 16)

    def __eq__(self, other):
        """Files are equal if they have the same checksum, only in read mode"""
        if self is other:
            return True
        return (
            isinstance(other, type(self))
            and self.mode == "rb"
            and other.mode == "rb"
            and hash(self) == hash(other)
        )

    def commit(self):
        """Move from temp to final destination"""

    def discard(self):
        """Throw away temporary file"""

    def info(self):
        """File information about this path"""
        if self.readable():
            return self.details
        else:
            raise ValueError("Info not available while writing")

    def tell(self):
        """Current file location"""
        return self.loc

    def seek(self, loc, whence=0):
        """Set current file location

        Parameters
        ----------
        loc: int
            byte location
        whence: {0, 1, 2}
            from start of file, current location or end of file, resp.
        """
        loc = int(loc)
        if not self.mode == "rb":
            raise OSError(ESPIPE, "Seek only available in read mode")
        if whence == 0:
            nloc = loc
        elif whence == 1:
            nloc = self.loc + loc
        elif whence == 2:
            nloc = self.size + loc
        else:
            raise ValueError(f"invalid whence ({whence}, should be 0, 1 or 2)")
        if nloc < 0:
            raise ValueError("Seek before start of file")
        self.loc = nloc
        return self.loc

    def write(self, data):
        """
        Write data to buffer.

        Buffer only sent on flush() or if buffer is greater than
        or equal to blocksize.

        Parameters
        ----------
        data: bytes
            Set of bytes to be written.
        """
        if not self.writable():
            raise ValueError("File not in write mode")
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if self.forced:
            raise ValueError("This file has been force-flushed, can only close")
        out = self.buffer.write(data)
        self.loc += out
        if self.buffer.tell() >= self.blocksize:
            # buffer is full enough to ship a chunk upstream
            self.flush()
        return out

    def flush(self, force=False):
        """
        Write buffered data to backend store.

        Writes the current buffer, if it is larger than the block-size, or if
        the file is being closed.

        Parameters
        ----------
        force: bool
            When closing, write the last block even if it is smaller than
            blocks are allowed to be. Disallows further writing to this file.
        """

        if self.closed:
            raise ValueError("Flush on closed file")
        if force and self.forced:
            raise ValueError("Force flush cannot be called more than once")
        if force:
            self.forced = True

        if self.readable():
            # no-op to flush on read-mode
            return

        if not force and self.buffer.tell() < self.blocksize:
            # Defer write on small block
            return

        if self.offset is None:
            # Initialize a multipart upload
            self.offset = 0
            try:
                self._initiate_upload()
            except:
                self.closed = True
                raise

        if self._upload_chunk(final=force) is not False:
            # chunk accepted: advance offset and start a fresh buffer
            self.offset += self.buffer.seek(0, 2)
            self.buffer = io.BytesIO()

    def _upload_chunk(self, final=False):
        """Write one part of a multi-block file upload

        Parameters
        ==========
        final: bool
            This is the last block, so should complete file, if
            self.autocommit is True.
        """
        # may not yet have been initialized, may need to call _initialize_upload

    def _initiate_upload(self):
        """Create remote file/upload"""
        pass

    def _fetch_range(self, start, end):
        """Get the specified set of bytes from remote"""
        return self.fs.cat_file(self.path, start=start, end=end)

    def read(self, length=-1):
        """
        Return data from cache, or fetch pieces as necessary

        Parameters
        ----------
        length: int (-1)
            Number of bytes to read; if <0, all remaining bytes.
        """
        length = -1 if length is None else int(length)
        if self.mode != "rb":
            raise ValueError("File not in read mode")
        if length < 0:
            length = self.size - self.loc
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if length == 0:
            # don't even bother calling fetch
            return b""
        out = self.cache._fetch(self.loc, self.loc + length)

        logger.debug(
            "%s read: %i - %i %s",
            self,
            self.loc,
            self.loc + length,
            self.cache._log_stats(),
        )
        self.loc += len(out)
        return out

    def readinto(self, b):
        """mirrors builtin file's readinto method

        https://docs.python.org/3/library/io.html#io.RawIOBase.readinto
        """
        out = memoryview(b).cast("B")
        data = self.read(out.nbytes)
        out[: len(data)] = data
        return len(data)

    def readuntil(self, char=b"\n", blocks=None):
        """Return data between current position and first occurrence of char

        char is included in the output, except if the end of the tile is
        encountered first.

        Parameters
        ----------
        char: bytes
            Thing to find
        blocks: None or int
            How much to read in each go. Defaults to file blocksize - which may
            mean a new read on every call.
        """
        out = []
        while True:
            start = self.tell()
            part = self.read(blocks or self.blocksize)
            if len(part) == 0:
                break
            found = part.find(char)
            if found > -1:
                # rewind so the file position sits just past the delimiter
                out.append(part[: found + len(char)])
                self.seek(start + found + len(char))
                break
            out.append(part)
        return b"".join(out)

    def readline(self):
        """Read until and including the first occurrence of newline character

        Note that, because of character encoding, this is not necessarily a
        true line ending.
        """
        return self.readuntil(b"\n")

    def __next__(self):
        out = self.readline()
        if out:
            return out
        raise StopIteration

    def __iter__(self):
        return self

    def readlines(self):
        """Return all data, split by the newline character, including the newline character"""
        data = self.read()
        lines = data.split(b"\n")
        out = [l + b"\n" for l in lines[:-1]]
        if data.endswith(b"\n"):
            return out
        else:
            return out + [lines[-1]]
        # return list(self) ???

    def readinto1(self, b):
        return self.readinto(b)

    def close(self):
        """Close file

        Finalizes writes, discards cache
        """
        if getattr(self, "_unclosable", False):
            return
        if self.closed:
            return
        try:
            if self.mode == "rb":
                self.cache = None
            else:
                if not self.forced:
                    # push out any remaining buffered bytes
                    self.flush(force=True)

                if self.fs is not None:
                    # listing/info caches are now stale for this path
                    self.fs.invalidate_cache(self.path)
                    self.fs.invalidate_cache(self.fs._parent(self.path))
        finally:
            self.closed = True

    def readable(self):
        """Whether opened for reading"""
        return "r" in self.mode and not self.closed

    def seekable(self):
        """Whether is seekable (only in read mode)"""
        return self.readable()

    def writable(self):
        """Whether opened for writing"""
        return self.mode in {"wb", "ab", "xb"} and not self.closed

    def __reduce__(self):
        if self.mode != "rb":
            raise RuntimeError("Pickling a writeable file is not supported")

        # reconstruct via module-level ``reopen`` so unpickling re-opens the
        # backend file and restores the read position
        return reopen, (
            self.fs,
            self.path,
            self.mode,
            self.blocksize,
            self.loc,
            self.size,
            self.autocommit,
            self.cache.name if self.cache else "none",
            self.kwargs,
        )

    def __del__(self):
        if not self.closed:
            self.close()

    def __str__(self):
        return f"<File-like object {type(self.fs).__name__}, {self.path}>"

    __repr__ = __str__

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
2228
+
2229
+
2230
def reopen(fs, path, mode, blocksize, loc, size, autocommit, cache_type, kwargs):
    """Re-open a file on ``fs`` (unpickling helper for AbstractBufferedFile).

    Opens ``path`` with the recorded settings and restores the previous
    read position ``loc``.
    """
    handle = fs.open(
        path,
        mode=mode,
        block_size=blocksize,
        autocommit=autocommit,
        cache_type=cache_type,
        size=size,
        **kwargs,
    )
    if loc > 0:
        # restore where the pickled file had read up to
        handle.seek(loc)
    return handle
.venv/lib/python3.11/site-packages/fsspec/transaction.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import deque
2
+
3
+
4
class Transaction:
    """Filesystem transaction write context

    Gathers files for deferred commit or discard, so that several write
    operations can be finalized semi-atomically. This works by having this
    instance as the ``.transaction`` attribute of the given filesystem
    """

    def __init__(self, fs, **kwargs):
        """
        Parameters
        ----------
        fs: FileSystem instance
        """
        self.fs = fs
        # Files are committed/discarded in FIFO order.
        self.files = deque()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """End transaction and commit, if exit is not due to exception"""
        # Commit only on a clean exit; any exception triggers a discard.
        self.complete(commit=exc_type is None)
        if self.fs:
            self.fs._intrans = False
            self.fs._transaction = None
            self.fs = None

    def start(self):
        """Start a transaction on this FileSystem"""
        # Drop anything left over from a previously failed completion.
        self.files = deque()
        self.fs._intrans = True

    def complete(self, commit=True):
        """Finish transaction: commit or discard all deferred files"""
        while self.files:
            pending = self.files.popleft()
            action = pending.commit if commit else pending.discard
            action()
        self.fs._intrans = False
        self.fs._transaction = None
        self.fs = None
50
+
51
+
52
class FileActor:
    """Holds a list of deferred files and commits/discards them in bulk.

    Plain container used as a remote (Dask) actor so that all workers share
    one set of pending files.
    """

    def __init__(self):
        self.files = []

    def commit(self):
        """Commit every tracked file, then forget them all."""
        for pending in self.files:
            pending.commit()
        self.files.clear()

    def discard(self):
        """Discard every tracked file, then forget them all."""
        for pending in self.files:
            pending.discard()
        self.files.clear()

    def append(self, f):
        """Track one more file for later commit/discard."""
        self.files.append(f)
68
+
69
+
70
class DaskTransaction(Transaction):
    """Transaction whose file bookkeeping lives in a remote Dask actor."""

    def __init__(self, fs):
        """
        Parameters
        ----------
        fs: FileSystem instance
        """
        import distributed

        super().__init__(fs)
        client = distributed.default_client()
        # Replace the local deque with a FileActor living on the cluster,
        # so pending files are shared across workers.
        self.files = client.submit(FileActor, actor=True).result()

    def complete(self, commit=True):
        """Finish transaction: commit or discard all deferred files"""
        # The commit/discard runs remotely on the actor; block until done.
        pending = self.files.commit() if commit else self.files.discard()
        pending.result()
        self.fs._intrans = False
        self.fs = None
.venv/lib/python3.11/site-packages/fsspec/utils.py ADDED
@@ -0,0 +1,739 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import logging
5
+ import math
6
+ import os
7
+ import re
8
+ import sys
9
+ import tempfile
10
+ from functools import partial
11
+ from hashlib import md5
12
+ from importlib.metadata import version
13
+ from typing import (
14
+ IO,
15
+ TYPE_CHECKING,
16
+ Any,
17
+ Callable,
18
+ Iterable,
19
+ Iterator,
20
+ Sequence,
21
+ TypeVar,
22
+ )
23
+ from urllib.parse import urlsplit
24
+
25
+ if TYPE_CHECKING:
26
+ import pathlib
27
+
28
+ from typing_extensions import TypeGuard
29
+
30
+ from fsspec.spec import AbstractFileSystem
31
+
32
+
33
+ DEFAULT_BLOCK_SIZE = 5 * 2**20
34
+
35
+ T = TypeVar("T")
36
+
37
+
38
def infer_storage_options(
    urlpath: str, inherit_storage_options: dict[str, Any] | None = None
) -> dict[str, Any]:
    """Infer storage options from URL path and merge it with existing storage
    options.

    Parameters
    ----------
    urlpath: str or unicode
        Either local absolute file path or URL (hdfs://namenode:8020/file.csv)
    inherit_storage_options: dict (optional)
        Its contents will get merged with the inferred information from the
        given path

    Returns
    -------
    Storage options dict.

    Examples
    --------
    >>> infer_storage_options('/mnt/datasets/test.csv')  # doctest: +SKIP
    {"protocol": "file", "path", "/mnt/datasets/test.csv"}
    >>> infer_storage_options(
    ...     'hdfs://username:pwd@node:123/mnt/datasets/test.csv?q=1',
    ...     inherit_storage_options={'extra': 'value'},
    ... )  # doctest: +SKIP
    {"protocol": "hdfs", "username": "username", "password": "pwd",
    "host": "node", "port": 123, "path": "/mnt/datasets/test.csv",
    "url_query": "q=1", "extra": "value"}
    """
    # Handle Windows paths including disk name in this special case
    # (e.g. "C:\\data\\file.csv"); anything without a scheme is local too.
    if (
        re.match(r"^[a-zA-Z]:[\\/]", urlpath)
        or re.match(r"^[a-zA-Z0-9]+://", urlpath) is None
    ):
        return {"protocol": "file", "path": urlpath}

    parsed_path = urlsplit(urlpath)
    protocol = parsed_path.scheme or "file"
    if parsed_path.fragment:
        # Keep the fragment as part of the path (urlsplit strips it).
        path = "#".join([parsed_path.path, parsed_path.fragment])
    else:
        path = parsed_path.path
    if protocol == "file":
        # Special case parsing file protocol URL on Windows according to:
        # https://msdn.microsoft.com/en-us/library/jj710207.aspx
        windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path)
        if windows_path:
            drive, path = windows_path.groups()
            path = f"{drive}:{path}"

    if protocol in ["http", "https"]:
        # for HTTP, we don't want to parse, as requests will anyway
        return {"protocol": protocol, "path": urlpath}

    options: dict[str, Any] = {"protocol": protocol, "path": path}

    if parsed_path.netloc:
        # Parse `hostname` from netloc manually because `parsed_path.hostname`
        # lowercases the hostname which is not always desirable (e.g. in S3):
        # https://github.com/dask/dask/issues/1417
        options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0]

        if protocol in ("s3", "s3a", "gcs", "gs"):
            # Bucket-style protocols keep the netloc as part of the path.
            # (The original code had a dead no-op ``options["host"] =
            # options["host"]`` else-branch here; removed.)
            options["path"] = options["host"] + options["path"]
        if parsed_path.port:
            options["port"] = parsed_path.port
        if parsed_path.username:
            options["username"] = parsed_path.username
        if parsed_path.password:
            options["password"] = parsed_path.password

    if parsed_path.query:
        options["url_query"] = parsed_path.query
    if parsed_path.fragment:
        options["url_fragment"] = parsed_path.fragment

    if inherit_storage_options:
        update_storage_options(options, inherit_storage_options)

    return options
121
+
122
+
123
+ def update_storage_options(
124
+ options: dict[str, Any], inherited: dict[str, Any] | None = None
125
+ ) -> None:
126
+ if not inherited:
127
+ inherited = {}
128
+ collisions = set(options) & set(inherited)
129
+ if collisions:
130
+ for collision in collisions:
131
+ if options.get(collision) != inherited.get(collision):
132
+ raise KeyError(
133
+ f"Collision between inferred and specified storage "
134
+ f"option:\n{collision}"
135
+ )
136
+ options.update(inherited)
137
+
138
+
139
+ # Compression extensions registered via fsspec.compression.register_compression
140
+ compressions: dict[str, str] = {}
141
+
142
+
143
def infer_compression(filename: str) -> str | None:
    """Infer compression, if available, from filename.

    Infer a named compression type, if registered and available, from filename
    extension. This includes builtin (gz, bz2, zip) compressions, as well as
    optional compressions. See fsspec.compression.register_compression.
    """
    suffix = os.path.splitext(filename)[-1].strip(".").lower()
    # Unregistered extensions map to None (no compression).
    return compressions.get(suffix)
154
+
155
+
156
def build_name_function(max_int: float) -> Callable[[int], str]:
    """Returns a function that receives a single integer
    and returns it as a string padded by enough zero characters
    to align with maximum possible integer

    >>> name_f = build_name_function(57)

    >>> name_f(7)
    '07'
    >>> name_f(31)
    '31'
    >>> build_name_function(1000)(42)
    '0042'
    >>> build_name_function(999)(42)
    '042'
    >>> build_name_function(0)(0)
    '0'
    """
    # Nudge above exact powers of 10 (and above 0) so log10 yields the
    # intended number of digits in the corner cases.
    width = int(math.ceil(math.log10(max_int + 1e-8)))

    def name_function(i: int) -> str:
        return str(i).zfill(width)

    return name_function
183
+
184
+
185
def seek_delimiter(file: IO[bytes], delimiter: bytes, blocksize: int) -> bool:
    r"""Seek current file to file start, file end, or byte after delimiter seq.

    Seeks file to next chunk delimiter, where chunks are defined on file start,
    a delimiting sequence, and file end. Use file.tell() to see location afterwards.
    Note that file start is a valid split, so must be at offset > 0 to seek for
    delimiter.

    Parameters
    ----------
    file: a file
    delimiter: bytes
        a delimiter like ``b'\n'`` or message sentinel, matching file .read() type
    blocksize: int
        Number of bytes to read from the file at once.


    Returns
    -------
    Returns True if a delimiter was found, False if at file start or end.

    """

    if file.tell() == 0:
        # beginning-of-file, return without seek
        return False

    # Interface is for binary IO, with delimiter as bytes, but initialize last
    # with result of file.read to preserve compatibility with text IO.
    last: bytes | None = None
    while True:
        current = file.read(blocksize)
        if not current:
            # end-of-file without delimiter
            return False
        # Prepend the tail of the previous chunk so a delimiter spanning two
        # reads is still found.
        full = last + current if last else current
        try:
            if delimiter in full:
                i = full.index(delimiter)
                # Rewind so the file position lands just past the delimiter.
                file.seek(file.tell() - (len(full) - i) + len(delimiter))
                return True
            elif len(current) < blocksize:
                # end-of-file without delimiter
                return False
        except (OSError, ValueError):
            # NOTE(review): best-effort guard — presumably covers exotic
            # file-likes where ``in``/``seek`` can fail; falls through to
            # keep scanning.
            pass
        # Keep just enough of this chunk to match a delimiter that straddles
        # the next read boundary.
        last = full[-len(delimiter) :]
232
+
233
+
234
def read_block(
    f: IO[bytes],
    offset: int,
    length: int | None,
    delimiter: bytes | None = None,
    split_before: bool = False,
) -> bytes:
    """Read a block of bytes from a file

    Parameters
    ----------
    f: File
        Open file
    offset: int
        Byte offset to start read
    length: int
        Number of bytes to read, read through end of file if None
    delimiter: bytes (optional)
        Ensure reading starts and stops at delimiter bytestring
    split_before: bool (optional)
        Start/stop read *before* delimiter bytestring.


    If using the ``delimiter=`` keyword argument we ensure that the read
    starts and stops at delimiter boundaries that follow the locations
    ``offset`` and ``offset + length``. If ``offset`` is zero then we
    start at zero, regardless of delimiter. The bytestring returned WILL
    include the terminating delimiter string.

    Examples
    --------

    >>> from io import BytesIO  # doctest: +SKIP
    >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300')  # doctest: +SKIP
    >>> read_block(f, 0, 13)  # doctest: +SKIP
    b'Alice, 100\\nBo'

    >>> read_block(f, 0, 13, delimiter=b'\\n')  # doctest: +SKIP
    b'Alice, 100\\nBob, 200\\n'

    >>> read_block(f, 10, 10, delimiter=b'\\n')  # doctest: +SKIP
    b'Bob, 200\\nCharlie, 300'
    """
    if delimiter:
        # Advance to the first delimiter at or after ``offset`` (no-op at
        # file start, since position 0 is always a valid split).
        f.seek(offset)
        found_start_delim = seek_delimiter(f, delimiter, 2**16)
        if length is None:
            return f.read()
        start = f.tell()
        # Shrink the requested length by however far the start moved.
        length -= start - offset

        f.seek(start + length)
        found_end_delim = seek_delimiter(f, delimiter, 2**16)
        end = f.tell()

        # Adjust split location to before delimiter if seek found the
        # delimiter sequence, not start or end of file.
        if found_start_delim and split_before:
            start -= len(delimiter)

        if found_end_delim and split_before:
            end -= len(delimiter)

        offset = start
        length = end - start

    f.seek(offset)

    # TODO: allow length to be None and read to the end of the file?
    assert length is not None
    b = f.read(length)
    return b
306
+
307
+
308
def tokenize(*args: Any, **kwargs: Any) -> str:
    """Deterministic token

    (modified from dask.base)

    >>> tokenize([1, 2, '3'])
    '9d71491b50023b06fc76928e6eddb952'

    >>> tokenize('Hello') == tokenize('Hello')
    True
    """
    if kwargs:
        # Fold keyword arguments into the positional tuple so they
        # participate in the hash.
        args += (kwargs,)
    payload = str(args).encode()
    try:
        hasher = md5(payload)
    except ValueError:
        # FIPS systems: https://github.com/fsspec/filesystem_spec/issues/380
        hasher = md5(payload, usedforsecurity=False)
    return hasher.hexdigest()
327
+
328
+
329
+ def stringify_path(filepath: str | os.PathLike[str] | pathlib.Path) -> str:
330
+ """Attempt to convert a path-like object to a string.
331
+
332
+ Parameters
333
+ ----------
334
+ filepath: object to be converted
335
+
336
+ Returns
337
+ -------
338
+ filepath_str: maybe a string version of the object
339
+
340
+ Notes
341
+ -----
342
+ Objects supporting the fspath protocol are coerced according to its
343
+ __fspath__ method.
344
+
345
+ For backwards compatibility with older Python version, pathlib.Path
346
+ objects are specially coerced.
347
+
348
+ Any other object is passed through unchanged, which includes bytes,
349
+ strings, buffers, or anything else that's not even path-like.
350
+ """
351
+ if isinstance(filepath, str):
352
+ return filepath
353
+ elif hasattr(filepath, "__fspath__"):
354
+ return filepath.__fspath__()
355
+ elif hasattr(filepath, "path"):
356
+ return filepath.path
357
+ else:
358
+ return filepath # type: ignore[return-value]
359
+
360
+
361
def make_instance(
    cls: Callable[..., T], args: Sequence[Any], kwargs: dict[str, Any]
) -> T:
    """Instantiate ``cls`` and let it determine its worker context.

    Used when re-creating filesystem instances from serialized state.
    """
    instance = cls(*args, **kwargs)
    # ``_determine_worker`` is defined on the target class, not here —
    # presumably re-binds the object to the current worker; confirm at caller.
    instance._determine_worker()  # type: ignore[attr-defined]
    return instance
367
+
368
+
369
def common_prefix(paths: Iterable[str]) -> str:
    """For a list of paths, find the shortest prefix common to all"""
    parts = [p.split("/") for p in paths]
    # Never compare beyond the shortest path's depth.
    depth = min(len(p) for p in parts)
    matched = 0
    while matched < depth and all(
        p[matched] == parts[0][matched] for p in parts
    ):
        matched += 1
    return "/".join(parts[0][:matched])
380
+
381
+
382
def other_paths(
    paths: list[str],
    path2: str | list[str],
    exists: bool = False,
    flatten: bool = False,
) -> list[str]:
    """In bulk file operations, construct a new file tree from a list of files

    Parameters
    ----------
    paths: list of str
        The input file tree
    path2: str or list of str
        Root to construct the new list in. If this is already a list of str, we just
        assert it has the right number of elements.
    exists: bool (optional)
        For a str destination, it is already exists (and is a dir), files should
        end up inside.
    flatten: bool (optional)
        Whether to flatten the input directory tree structure so that the output files
        are in the same directory.

    Returns
    -------
    list of str
    """

    if isinstance(path2, str):
        path2 = path2.rstrip("/")

        if flatten:
            # All outputs go directly under path2, keyed by basename only.
            path2 = ["/".join((path2, p.split("/")[-1])) for p in paths]
        else:
            cp = common_prefix(paths)
            if exists:
                # Destination dir already exists: keep the last level of the
                # common source prefix so files land inside it.
                cp = cp.rsplit("/", 1)[0]
            if not cp and all(not s.startswith("/") for s in paths):
                # Relative inputs with no shared prefix: join wholesale.
                path2 = ["/".join([path2, p]) for p in paths]
            else:
                # Swap the common source prefix for the destination root
                # (first occurrence only).
                path2 = [p.replace(cp, path2, 1) for p in paths]
    else:
        assert len(paths) == len(path2)
    return path2
425
+
426
+
427
def is_exception(obj: Any) -> bool:
    # True for any raisable instance, including BaseException subclasses
    # such as KeyboardInterrupt/SystemExit.
    return isinstance(obj, BaseException)
429
+
430
+
431
+ def isfilelike(f: Any) -> TypeGuard[IO[bytes]]:
432
+ return all(hasattr(f, attr) for attr in ["read", "close", "tell"])
433
+
434
+
435
def get_protocol(url: str) -> str:
    """Return the protocol prefix of ``url``, defaulting to ``file``."""
    url = stringify_path(url)
    # Split on the first "::" (chained filesystems) or "://" (plain URL);
    # the capture group keeps the separator, so a match yields 3 parts.
    head, *rest = re.split(r"(\:\:|\://)", url, maxsplit=1)
    return head if rest else "file"
441
+
442
+
443
def can_be_local(path: str) -> bool:
    """Can the given URL be used with open_local?"""
    from fsspec import get_filesystem_class

    try:
        fs_cls = get_filesystem_class(get_protocol(path))
    except (ValueError, ImportError):
        # not in registry or import failed
        return False
    # Filesystems advertising ``local_file`` expose real local paths.
    return getattr(fs_cls, "local_file", False)
452
+
453
+
454
+ def get_package_version_without_import(name: str) -> str | None:
455
+ """For given package name, try to find the version without importing it
456
+
457
+ Import and package.__version__ is still the backup here, so an import
458
+ *might* happen.
459
+
460
+ Returns either the version string, or None if the package
461
+ or the version was not readily found.
462
+ """
463
+ if name in sys.modules:
464
+ mod = sys.modules[name]
465
+ if hasattr(mod, "__version__"):
466
+ return mod.__version__
467
+ try:
468
+ return version(name)
469
+ except: # noqa: E722
470
+ pass
471
+ try:
472
+ import importlib
473
+
474
+ mod = importlib.import_module(name)
475
+ return mod.__version__
476
+ except (ImportError, AttributeError):
477
+ return None
478
+
479
+
480
+ def setup_logging(
481
+ logger: logging.Logger | None = None,
482
+ logger_name: str | None = None,
483
+ level: str = "DEBUG",
484
+ clear: bool = True,
485
+ ) -> logging.Logger:
486
+ if logger is None and logger_name is None:
487
+ raise ValueError("Provide either logger object or logger name")
488
+ logger = logger or logging.getLogger(logger_name)
489
+ handle = logging.StreamHandler()
490
+ formatter = logging.Formatter(
491
+ "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s -- %(message)s"
492
+ )
493
+ handle.setFormatter(formatter)
494
+ if clear:
495
+ logger.handlers.clear()
496
+ logger.addHandler(handle)
497
+ logger.setLevel(level)
498
+ return logger
499
+
500
+
501
def _unstrip_protocol(name: str, fs: AbstractFileSystem) -> str:
    # Module-level (picklable) wrapper around the filesystem's own method.
    return fs.unstrip_protocol(name)
503
+
504
+
505
def mirror_from(
    origin_name: str, methods: Iterable[str]
) -> Callable[[type[T]], type[T]]:
    """Mirror attributes and methods from the given
    origin_name attribute of the instance to the
    decorated class"""

    def _proxy(attr: str, self: Any) -> Any:
        # Look up the delegate object freshly on every access.
        delegate = getattr(self, origin_name)
        return getattr(delegate, attr)

    def wrapper(cls: type[T]) -> type[T]:
        # Each mirrored name becomes a read-only property forwarding to
        # the delegate.
        for attr in methods:
            setattr(cls, attr, property(partial(_proxy, attr)))
        return cls

    return wrapper
523
+
524
+
525
@contextlib.contextmanager
def nullcontext(obj: T) -> Iterator[T]:
    # No-op context manager: yields ``obj`` unchanged, no setup/teardown.
    yield obj
528
+
529
+
530
def merge_offset_ranges(
    paths: list[str],
    starts: list[int] | int,
    ends: list[int] | int,
    max_gap: int = 0,
    max_block: int | None = None,
    sort: bool = True,
) -> tuple[list[str], list[int], list[int]]:
    """Merge adjacent byte-offset ranges when the inter-range
    gap is <= `max_gap`, and when the merged byte range does not
    exceed `max_block` (if specified). By default, this function
    will re-order the input paths and byte ranges to ensure sorted
    order. If the user can guarantee that the inputs are already
    sorted, passing `sort=False` will skip the re-ordering.
    """
    # Check input
    if not isinstance(paths, list):
        raise TypeError
    # Scalar starts/ends are broadcast to every path.
    if not isinstance(starts, list):
        starts = [starts] * len(paths)
    if not isinstance(ends, list):
        ends = [ends] * len(paths)
    if len(starts) != len(paths) or len(ends) != len(paths):
        raise ValueError

    # Early Return
    if len(starts) <= 1:
        return paths, starts, ends

    # Normalize None starts to 0 so gap arithmetic below is well-defined.
    starts = [s or 0 for s in starts]
    # Sort by paths and then ranges if `sort=True`
    if sort:
        paths, starts, ends = (
            list(v)
            for v in zip(
                *sorted(
                    zip(paths, starts, ends),
                )
            )
        )

    if paths:
        # Loop through the coupled `paths`, `starts`, and
        # `ends`, and merge adjacent blocks when appropriate
        new_paths = paths[:1]
        new_starts = starts[:1]
        new_ends = ends[:1]
        for i in range(1, len(paths)):
            # A None end means "to end of file" — anything after it on the
            # same path is already covered.
            if paths[i] == paths[i - 1] and new_ends[-1] is None:
                continue
            elif (
                paths[i] != paths[i - 1]
                or ((starts[i] - new_ends[-1]) > max_gap)
                or (max_block is not None and (ends[i] - new_starts[-1]) > max_block)
            ):
                # Cannot merge with previous block.
                # Add new `paths`, `starts`, and `ends` elements
                new_paths.append(paths[i])
                new_starts.append(starts[i])
                new_ends.append(ends[i])
            else:
                # Merge with previous block by updating the
                # last element of `ends`
                new_ends[-1] = ends[i]
        return new_paths, new_starts, new_ends

    # `paths` is empty. Just return input lists
    return paths, starts, ends
598
+
599
+
600
def file_size(filelike: IO[bytes]) -> int:
    """Find length of any open read-mode file-like"""
    original_position = filelike.tell()
    try:
        # seek(0, 2) returns the new absolute offset, i.e. the total length.
        return filelike.seek(0, 2)
    finally:
        # Always restore the caller's position.
        filelike.seek(original_position)
607
+
608
+
609
@contextlib.contextmanager
def atomic_write(path: str, mode: str = "wb"):
    """
    A context manager that opens a temporary file next to `path` and, on exit,
    replaces `path` with the temporary file, thereby updating `path`
    atomically.
    """
    # The temp file must live in the same directory as ``path`` so the final
    # os.replace is a same-filesystem (atomic) rename.
    fd, tmp_name = tempfile.mkstemp(
        dir=os.path.dirname(path), prefix=os.path.basename(path) + "-"
    )
    try:
        with open(fd, mode) as fp:
            yield fp
    except BaseException:
        # Failed mid-write: drop the partial temp file, then propagate.
        with contextlib.suppress(FileNotFoundError):
            os.unlink(tmp_name)
        raise
    else:
        os.replace(tmp_name, path)
628
+
629
+
630
def _translate(pat, STAR, QUESTION_MARK):
    # Copied from: https://github.com/python/cpython/pull/106703.
    # Translate one glob path segment into a list of regex fragments, with
    # ``*`` and ``?`` replaced by the caller-supplied STAR / QUESTION_MARK
    # regex pieces (compared by identity for the `*`-run compression below).
    res: list[str] = []
    add = res.append
    i, n = 0, len(pat)
    while i < n:
        c = pat[i]
        i = i + 1
        if c == "*":
            # compress consecutive `*` into one
            if (not res) or res[-1] is not STAR:
                add(STAR)
        elif c == "?":
            add(QUESTION_MARK)
        elif c == "[":
            # Character class: find the closing "]" (an initial "!" or "]"
            # is part of the class, not a terminator).
            j = i
            if j < n and pat[j] == "!":
                j = j + 1
            if j < n and pat[j] == "]":
                j = j + 1
            while j < n and pat[j] != "]":
                j = j + 1
            if j >= n:
                # Unterminated class: treat the "[" literally.
                add("\\[")
            else:
                stuff = pat[i:j]
                if "-" not in stuff:
                    stuff = stuff.replace("\\", r"\\")
                else:
                    # Split the class into range chunks around "-".
                    chunks = []
                    k = i + 2 if pat[i] == "!" else i + 1
                    while True:
                        k = pat.find("-", k, j)
                        if k < 0:
                            break
                        chunks.append(pat[i:k])
                        i = k + 1
                        k = k + 3
                    chunk = pat[i:j]
                    if chunk:
                        chunks.append(chunk)
                    else:
                        chunks[-1] += "-"
                    # Remove empty ranges -- invalid in RE.
                    for k in range(len(chunks) - 1, 0, -1):
                        if chunks[k - 1][-1] > chunks[k][0]:
                            chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:]
                            del chunks[k]
                    # Escape backslashes and hyphens for set difference (--).
                    # Hyphens that create ranges shouldn't be escaped.
                    stuff = "-".join(
                        s.replace("\\", r"\\").replace("-", r"\-") for s in chunks
                    )
                # Escape set operations (&&, ~~ and ||).
                stuff = re.sub(r"([&~|])", r"\\\1", stuff)
                i = j + 1
                if not stuff:
                    # Empty range: never match.
                    add("(?!)")
                elif stuff == "!":
                    # Negated empty range: match any character.
                    add(".")
                else:
                    if stuff[0] == "!":
                        stuff = "^" + stuff[1:]
                    elif stuff[0] in ("^", "["):
                        stuff = "\\" + stuff
                    add(f"[{stuff}]")
        else:
            add(re.escape(c))
    assert i == n
    return res
702
+
703
+
704
def glob_translate(pat):
    # Copied from: https://github.com/python/cpython/pull/106703.
    # The keyword parameters' values are fixed to:
    # recursive=True, include_hidden=True, seps=None
    """Translate a pathname with shell wildcards to a regular expression."""
    # Build separator classes from the platform separators (both on Windows).
    if os.path.altsep:
        seps = os.path.sep + os.path.altsep
    else:
        seps = os.path.sep
    escaped_seps = "".join(map(re.escape, seps))
    any_sep = f"[{escaped_seps}]" if len(seps) > 1 else escaped_seps
    not_sep = f"[^{escaped_seps}]"
    # Regex building blocks for one segment / trailing segment / "**".
    one_last_segment = f"{not_sep}+"
    one_segment = f"{one_last_segment}{any_sep}"
    any_segments = f"(?:.+{any_sep})?"
    any_last_segments = ".*"
    results = []
    parts = re.split(any_sep, pat)
    last_part_idx = len(parts) - 1
    for idx, part in enumerate(parts):
        if part == "*":
            results.append(one_segment if idx < last_part_idx else one_last_segment)
            continue
        if part == "**":
            results.append(any_segments if idx < last_part_idx else any_last_segments)
            continue
        elif "**" in part:
            raise ValueError(
                "Invalid pattern: '**' can only be an entire path component"
            )
        if part:
            # Ordinary segment: per-character translation, with "*"
            # restricted to non-separator characters.
            results.extend(_translate(part, f"{not_sep}*", not_sep))
        if idx < last_part_idx:
            results.append(any_sep)
    res = "".join(results)
    # DOTALL group + \Z anchors the match to the whole string.
    return rf"(?s:{res})\Z"
.venv/lib/python3.11/site-packages/functorch/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (907 Bytes). View file
 
.venv/lib/python3.11/site-packages/functorch/_src/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/functorch/_src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (187 Bytes). View file
 
.venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # This file has moved to under torch/_functorch. It is not public API.
2
+ # If you are not a PyTorch developer and you are relying on the following
3
+ # imports, please file an issue.
4
+ from torch._functorch.aot_autograd import (
5
+ aot_autograd_decompositions,
6
+ KNOWN_TYPES,
7
+ PytreeThunk,
8
+ )
.venv/lib/python3.11/site-packages/functorch/_src/aot_autograd/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (375 Bytes). View file
 
.venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # This file has moved to under torch/_functorch. It is not public API.
2
+ # If you are not a PyTorch developer and you are relying on the following
3
+ # imports, please file an issue.
4
+ from torch._functorch.eager_transforms import (
5
+ _assert_wrapped_functional,
6
+ _unwrap_functional_tensor,
7
+ )
.venv/lib/python3.11/site-packages/functorch/_src/eager_transforms/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (364 Bytes). View file
 
.venv/lib/python3.11/site-packages/functorch/_src/make_functional/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # This file has moved to under torch/_functorch. It is not public API.
2
+ # If you are not a PyTorch developer and you are relying on the following
3
+ # imports, please file an issue.
4
+ from torch._functorch.make_functional import _swap_state
.venv/lib/python3.11/site-packages/functorch/_src/make_functional/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (287 Bytes). View file
 
.venv/lib/python3.11/site-packages/functorch/_src/vmap/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file has moved to under torch/_functorch. It is not public API.
2
+ # If you are not a PyTorch developer and you are relying on the following
3
+ # imports, please file an issue.
4
+ from torch._functorch.vmap import (
5
+ _add_batch_dim,
6
+ _broadcast_to_and_flatten,
7
+ _create_batched_inputs,
8
+ _get_name,
9
+ _process_batched_inputs,
10
+ _remove_batch_dim,
11
+ _unwrap_batched,
12
+ _validate_and_get_batch_size,
13
+ Tensor,
14
+ tree_flatten,
15
+ tree_unflatten,
16
+ )
.venv/lib/python3.11/site-packages/functorch/_src/vmap/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (663 Bytes). View file
 
.venv/lib/python3.11/site-packages/functorch/compile/__init__.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch._functorch import config
2
+ from torch._functorch.aot_autograd import (
3
+ aot_function,
4
+ aot_module,
5
+ aot_module_simplified,
6
+ compiled_function,
7
+ compiled_module,
8
+ get_aot_compilation_context,
9
+ get_aot_graph_name,
10
+ get_graph_being_compiled,
11
+ make_boxed_compiler,
12
+ make_boxed_func,
13
+ )
14
+ from torch._functorch.compilers import (
15
+ debug_compile,
16
+ default_decompositions,
17
+ draw_graph_compile,
18
+ memory_efficient_fusion,
19
+ nnc_jit,
20
+ nop,
21
+ print_compile,
22
+ ts_compile,
23
+ )
24
+ from torch._functorch.fx_minifier import minifier
25
+ from torch._functorch.partitioners import (
26
+ default_partition,
27
+ draw_graph,
28
+ min_cut_rematerialization_partition,
29
+ )
30
+ from torch._functorch.python_key import pythonkey_decompose
.venv/lib/python3.11/site-packages/functorch/compile/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.39 kB). View file
 
.venv/lib/python3.11/site-packages/functorch/dim/__init__.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dis
2
+ import inspect
3
+ from typing import Sequence, Union
4
+
5
+ import functorch._C
6
+ import torch
7
+ from functorch._C import dim as _C
8
+
9
+ from .tree_map import tree_flatten, tree_map
10
+ from .wrap_type import wrap_type
11
+
12
+
13
+ _C._patch_tensor_class()
14
+ dims, DimList, dimlists = _C.dims, _C.DimList, _C.dimlists
15
+
16
+
17
+ class DimensionMismatchError(Exception):
18
+ pass
19
+
20
+
21
+ class DimensionBindError(Exception):
22
+ pass
23
+
24
+
25
+ from . import op_properties
26
+
27
+
28
+ # use dict to avoid writing C++ bindings for set
29
+ pointwise = dict.fromkeys(op_properties.pointwise, True)
30
+
31
+ use_c = True
32
+ if not use_c:
33
+ from . import reference
34
+
35
+
36
+ class _Tensor:
37
+ # fast path around slow wrapping/unwrapping logic for simply queries used
38
+ # by the implementation...
39
+
40
+ @property
41
+ def dims(self):
42
+ return tuple(d for d in self._levels if isinstance(d, Dim))
43
+
44
+ def dim(self):
45
+ return self.ndim
46
+
47
+ if use_c:
48
+ __torch_function__ = classmethod(_C.__torch_function__)
49
+ expand = _C._instancemethod(_C.expand)
50
+ else:
51
+ __torch_function__ = reference.__torch_function__
52
+ expand = reference.expand
53
+
54
+ index = _C._instancemethod(_C.index)
55
+
56
+ def __repr__(self):
57
+ tensor, levels, ndim = self._tensor, self._levels, self.ndim
58
+ return f"{tensor}\nwith dims={tuple(l + ndim if isinstance(l, int) else l for l in levels)} sizes={tuple(tensor.size())}"
59
+
60
+
61
+ TensorLike = (_Tensor, torch.Tensor)
62
+
63
+
64
+ class Dim(_C.Dim, _Tensor):
65
+ # note that _C.Dim comes before tensor because we want the Dim API for things like size to take precendence.
66
+ # Tensor defines format, but we want to print Dims with special formatting
67
+ __format__ = object.__format__
68
+
69
+
70
+ class Tensor(_Tensor, _C.Tensor):
71
+ if not use_c:
72
+ from_batched = staticmethod(_C.Tensor_from_batched)
73
+ from_positional = staticmethod(_C.Tensor_from_positional)
74
+ sum = _C._instancemethod(_C.Tensor_sum)
75
+
76
+
77
+ def cat(tensors, dim, new_dim):
78
+ n = dims()
79
+ return stack(tensors, n, dim).index([n, dim], new_dim)
80
+
81
+
82
+ if use_c:
83
+ _wrap = _C._wrap
84
+
85
+ def _def(name, *args, **kwargs):
86
+ orig = getattr(torch.Tensor, name)
87
+ setattr(_Tensor, name, _C._instancemethod(_wrap(orig, *args, **kwargs)))
88
+
89
+ t__getitem__ = _C._instancemethod(_C.__getitem__)
90
+ stack = _C.stack
91
+ split = _C._instancemethod(_C.split)
92
+ else:
93
+ _wrap, _def = reference._wrap, reference._def
94
+ t__getitem__ = reference.t__getitem__
95
+ stack = reference.stack
96
+ split = reference.split
97
+
98
+ # note: there is no python reference
99
+ t__setitem__ = _C._instancemethod(_C.__setitem__)
100
+ # this is patched in the C API because otherwise torch.Tensor will
101
+ # no longer be considered a sequence and things will break
102
+ # torch.Tensor.__getitem__ = t__getitem__
103
+
104
+ _Tensor.__getitem__ = t__getitem__
105
+ # torch.Tensor.__setitem__ = t__setitem__
106
+ _Tensor.__setitem__ = t__setitem__
107
+
108
+ torch.Tensor.split = split
109
+ _Tensor.split = split
110
+ torch.Tensor.expand = _C._instancemethod(_C.expand)
111
+ torch.Tensor.index = _C._instancemethod(_C.index)
112
+ wrap_type(use_c, _Tensor, torch.Tensor, _Tensor.__torch_function__)
113
+ del _Tensor.ndim
114
+
115
+ if use_c:
116
+ _Tensor.order = _C._instancemethod(_C.order)
117
+ else:
118
+ _Tensor.order = reference.positional
119
+
120
+ _def("mean")
121
+ _def("sum")
122
+ _def("all")
123
+ _def("amax")
124
+ _def("amin")
125
+ _def("aminmax")
126
+ _def("any")
127
+ _def("count_nonzero")
128
+ _def("logsumexp")
129
+ _def("nanmean")
130
+ _def("nansum")
131
+ _def("prod")
132
+ _def("std", keepdim_offset=2)
133
+ _def("var", keepdim_offset=2)
134
+ _def("max", single_dim=True)
135
+ _def("min", single_dim=True)
136
+ _def("argmax", single_dim=True)
137
+ _def("argmin", single_dim=True)
138
+ _def("kthvalue", single_dim=True)
139
+ _def("median", single_dim=True)
140
+ _def("nanmedian", single_dim=True)
141
+ _def("mode", single_dim=True)
142
+ _def("sort", reduce=False)
143
+ _def("argsort", reduce=False)
144
+ _def("unbind", single_dim=True)
145
+ _def("chunk", dim_offset=1, reduce=False)
146
+ _def("cummax", single_dim=True, reduce=False)
147
+ _def("cummin", single_dim=True, reduce=False)
148
+ _def("cumprod", single_dim=True, reduce=False)
149
+ _def("cumprod_", single_dim=True, reduce=False)
150
+ _def("cumsum", single_dim=True, reduce=False)
151
+ _def("cumsum_", single_dim=True, reduce=False)
152
+ _def("logcumsumexp", single_dim=True, reduce=False)
153
+ _def("renorm", dim_offset=1, single_dim=True, reduce=False)
154
+ _def("softmax", single_dim=True, reduce=False)
155
+ softmax = _wrap(torch.nn.functional.softmax, single_dim=True, reduce=False)
156
+
157
+ # stuff to handle in the future, because they require special
158
+ # binding logic for dims
159
+ # cross
160
+ # diag_embed
161
+ # diagonal
162
+ # diagonal_scatter
163
+ # diff
164
+ # nanquantile
165
+ # quantile
166
+ # roll
167
+ # rot90
168
+ # topk (new dimes on output)
169
+ # should these all be subsumed by inplace indexing?
170
+ # index_add_
171
+ # index_add
172
+ # index_copy
173
+ # index_copy_
174
+ # index_fill
175
+ # index_fill_
176
+ # index_select
177
+ # scatter
178
+ # scatter_
179
+ # scatter_add
180
+ # scatter_add_
181
+ # scatter_reduce
.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (7.86 kB). View file
 
.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/batch_tensor.cpython-311.pyc ADDED
Binary file (1.25 kB). View file
 
.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/delayed_mul_tensor.cpython-311.pyc ADDED
Binary file (5.57 kB). View file
 
.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/dim.cpython-311.pyc ADDED
Binary file (7.06 kB). View file