Mingke977 commited on
Commit
f0c5df7
·
verified ·
1 Parent(s): ca4b037

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. venv/lib/python3.10/site-packages/fsspec/__pycache__/__init__.cpython-310.pyc +0 -0
  2. venv/lib/python3.10/site-packages/fsspec/__pycache__/_version.cpython-310.pyc +0 -0
  3. venv/lib/python3.10/site-packages/fsspec/__pycache__/archive.cpython-310.pyc +0 -0
  4. venv/lib/python3.10/site-packages/fsspec/__pycache__/asyn.cpython-310.pyc +0 -0
  5. venv/lib/python3.10/site-packages/fsspec/__pycache__/caching.cpython-310.pyc +0 -0
  6. venv/lib/python3.10/site-packages/fsspec/__pycache__/callbacks.cpython-310.pyc +0 -0
  7. venv/lib/python3.10/site-packages/fsspec/__pycache__/compression.cpython-310.pyc +0 -0
  8. venv/lib/python3.10/site-packages/fsspec/__pycache__/config.cpython-310.pyc +0 -0
  9. venv/lib/python3.10/site-packages/fsspec/__pycache__/conftest.cpython-310.pyc +0 -0
  10. venv/lib/python3.10/site-packages/fsspec/__pycache__/core.cpython-310.pyc +0 -0
  11. venv/lib/python3.10/site-packages/fsspec/__pycache__/dircache.cpython-310.pyc +0 -0
  12. venv/lib/python3.10/site-packages/fsspec/__pycache__/exceptions.cpython-310.pyc +0 -0
  13. venv/lib/python3.10/site-packages/fsspec/__pycache__/fuse.cpython-310.pyc +0 -0
  14. venv/lib/python3.10/site-packages/fsspec/__pycache__/generic.cpython-310.pyc +0 -0
  15. venv/lib/python3.10/site-packages/fsspec/__pycache__/gui.cpython-310.pyc +0 -0
  16. venv/lib/python3.10/site-packages/fsspec/__pycache__/json.cpython-310.pyc +0 -0
  17. venv/lib/python3.10/site-packages/fsspec/__pycache__/mapping.cpython-310.pyc +0 -0
  18. venv/lib/python3.10/site-packages/fsspec/__pycache__/parquet.cpython-310.pyc +0 -0
  19. venv/lib/python3.10/site-packages/fsspec/__pycache__/registry.cpython-310.pyc +0 -0
  20. venv/lib/python3.10/site-packages/fsspec/__pycache__/spec.cpython-310.pyc +0 -0
  21. venv/lib/python3.10/site-packages/fsspec/__pycache__/transaction.cpython-310.pyc +0 -0
  22. venv/lib/python3.10/site-packages/fsspec/__pycache__/utils.cpython-310.pyc +0 -0
  23. venv/lib/python3.10/site-packages/fsspec/implementations/__init__.py +0 -0
  24. venv/lib/python3.10/site-packages/fsspec/implementations/arrow.py +312 -0
  25. venv/lib/python3.10/site-packages/fsspec/implementations/asyn_wrapper.py +124 -0
  26. venv/lib/python3.10/site-packages/fsspec/implementations/cache_mapper.py +75 -0
  27. venv/lib/python3.10/site-packages/fsspec/implementations/cache_metadata.py +231 -0
  28. venv/lib/python3.10/site-packages/fsspec/implementations/cached.py +1021 -0
  29. venv/lib/python3.10/site-packages/fsspec/implementations/chained.py +23 -0
  30. venv/lib/python3.10/site-packages/fsspec/implementations/dask.py +152 -0
  31. venv/lib/python3.10/site-packages/fsspec/implementations/data.py +57 -0
  32. venv/lib/python3.10/site-packages/fsspec/implementations/dbfs.py +496 -0
  33. venv/lib/python3.10/site-packages/fsspec/implementations/dirfs.py +389 -0
  34. venv/lib/python3.10/site-packages/fsspec/implementations/ftp.py +437 -0
  35. venv/lib/python3.10/site-packages/fsspec/implementations/gist.py +241 -0
  36. venv/lib/python3.10/site-packages/fsspec/implementations/git.py +114 -0
  37. venv/lib/python3.10/site-packages/fsspec/implementations/github.py +333 -0
  38. venv/lib/python3.10/site-packages/fsspec/implementations/http.py +897 -0
  39. venv/lib/python3.10/site-packages/fsspec/implementations/http_sync.py +937 -0
  40. venv/lib/python3.10/site-packages/fsspec/implementations/libarchive.py +213 -0
  41. venv/lib/python3.10/site-packages/fsspec/parquet.py +572 -0
  42. venv/lib/python3.10/site-packages/fsspec/registry.py +333 -0
  43. venv/lib/python3.10/site-packages/fsspec/spec.py +2281 -0
  44. venv/lib/python3.10/site-packages/fsspec/transaction.py +90 -0
  45. venv/lib/python3.10/site-packages/fsspec/utils.py +748 -0
  46. venv/lib/python3.10/site-packages/httpcore-1.0.9.dist-info/INSTALLER +1 -0
  47. venv/lib/python3.10/site-packages/httpcore-1.0.9.dist-info/METADATA +625 -0
  48. venv/lib/python3.10/site-packages/httpcore-1.0.9.dist-info/RECORD +68 -0
  49. venv/lib/python3.10/site-packages/httpcore-1.0.9.dist-info/WHEEL +4 -0
  50. venv/lib/python3.10/site-packages/httpcore-1.0.9.dist-info/licenses/LICENSE.md +27 -0
venv/lib/python3.10/site-packages/fsspec/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.6 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/_version.cpython-310.pyc ADDED
Binary file (763 Bytes). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/archive.cpython-310.pyc ADDED
Binary file (2.99 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/asyn.cpython-310.pyc ADDED
Binary file (29.6 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/caching.cpython-310.pyc ADDED
Binary file (26.4 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/callbacks.cpython-310.pyc ADDED
Binary file (11 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/compression.cpython-310.pyc ADDED
Binary file (5.38 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/config.cpython-310.pyc ADDED
Binary file (3.93 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/conftest.cpython-310.pyc ADDED
Binary file (3.86 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/core.cpython-310.pyc ADDED
Binary file (22.6 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/dircache.cpython-310.pyc ADDED
Binary file (3.52 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/exceptions.cpython-310.pyc ADDED
Binary file (841 Bytes). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/fuse.cpython-310.pyc ADDED
Binary file (10.3 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/generic.cpython-310.pyc ADDED
Binary file (12.6 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/gui.cpython-310.pyc ADDED
Binary file (14.7 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/json.cpython-310.pyc ADDED
Binary file (4.65 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/mapping.cpython-310.pyc ADDED
Binary file (9.17 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/parquet.cpython-310.pyc ADDED
Binary file (14.5 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/registry.cpython-310.pyc ADDED
Binary file (9.65 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/spec.cpython-310.pyc ADDED
Binary file (67.7 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/transaction.cpython-310.pyc ADDED
Binary file (3.31 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/__pycache__/utils.cpython-310.pyc ADDED
Binary file (21 kB). View file
 
venv/lib/python3.10/site-packages/fsspec/implementations/__init__.py ADDED
File without changes
venv/lib/python3.10/site-packages/fsspec/implementations/arrow.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import errno
2
+ import io
3
+ import os
4
+ import secrets
5
+ import shutil
6
+ from contextlib import suppress
7
+ from functools import cached_property, wraps
8
+ from urllib.parse import parse_qs
9
+
10
+ from fsspec.spec import AbstractFileSystem
11
+ from fsspec.utils import (
12
+ get_package_version_without_import,
13
+ infer_storage_options,
14
+ mirror_from,
15
+ tokenize,
16
+ )
17
+
18
+
19
def wrap_exceptions(func):
    """Decorator translating pyarrow's generic OSError into FileNotFoundError.

    pyarrow raises a plain ``OSError`` whose message contains
    "does not exist" for missing paths; fsspec callers expect
    ``FileNotFoundError``, so that flavour is re-raised accordingly.
    Any other ``OSError`` propagates unchanged.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except OSError as exc:
            if exc.args:
                first = exc.args[0]
                # Only the "missing path" message is translated.
                if isinstance(first, str) and "does not exist" in first:
                    raise FileNotFoundError(errno.ENOENT, first) from exc
            raise

    return wrapper
35
+
36
+
37
# Installed pyarrow version string; stays None until an ArrowFSWrapper is
# constructed (its __init__ populates this via
# get_package_version_without_import).
PYARROW_VERSION = None
38
+
39
+
40
class ArrowFSWrapper(AbstractFileSystem):
    """FSSpec-compatible wrapper of pyarrow.fs.FileSystem.

    Parameters
    ----------
    fs : pyarrow.fs.FileSystem
        The pyarrow filesystem instance exposed through the fsspec API.
    """

    root_marker = "/"

    def __init__(self, fs, **kwargs):
        global PYARROW_VERSION
        PYARROW_VERSION = get_package_version_without_import("pyarrow")
        self.fs = fs
        super().__init__(**kwargs)

    @property
    def protocol(self):
        # Delegated to the wrapped filesystem, e.g. "hdfs".
        return self.fs.type_name

    @cached_property
    def fsid(self):
        return "hdfs_" + tokenize(self.fs.host, self.fs.port)

    @classmethod
    def _strip_protocol(cls, path):
        ops = infer_storage_options(path)
        path = ops["path"]
        if path.startswith("//"):
            # special case for "hdfs://path" (without the triple slash)
            path = path[1:]
        return path

    def ls(self, path, detail=False, **kwargs):
        """List entries under ``path``; dicts of metadata when ``detail``."""
        path = self._strip_protocol(path)
        from pyarrow.fs import FileSelector

        try:
            entries = [
                self._make_entry(entry)
                for entry in self.fs.get_file_info(FileSelector(path))
            ]
        except (FileNotFoundError, NotADirectoryError):
            # path may name a single file rather than a directory
            entries = [self.info(path, **kwargs)]
        if detail:
            return entries
        else:
            return [entry["name"] for entry in entries]

    def info(self, path, **kwargs):
        """Return the fsspec info dict for a single path.

        Raises FileNotFoundError if the path does not exist.
        """
        path = self._strip_protocol(path)
        [info] = self.fs.get_file_info([path])
        return self._make_entry(info)

    def exists(self, path):
        path = self._strip_protocol(path)
        try:
            self.info(path)
        except FileNotFoundError:
            return False
        else:
            return True

    def _make_entry(self, info):
        """Convert a pyarrow ``FileInfo`` into an fsspec info dict.

        Raises FileNotFoundError for ``FileType.NotFound`` entries.
        """
        from pyarrow.fs import FileType

        if info.type is FileType.Directory:
            kind = "directory"
        elif info.type is FileType.File:
            kind = "file"
        elif info.type is FileType.NotFound:
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
        else:
            kind = "other"

        return {
            "name": info.path,
            "size": info.size,
            "type": kind,
            "mtime": info.mtime,
        }

    @wrap_exceptions
    def cp_file(self, path1, path2, **kwargs):
        """Copy a single file within this filesystem.

        Writes to a temporary name and then moves it into place, so a
        partially-copied file is never visible at the destination.
        """
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")

        with self._open(path1, "rb") as lstream:
            tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
            try:
                with self.open(tmp_fname, "wb") as rstream:
                    shutil.copyfileobj(lstream, rstream)
                self.fs.move(tmp_fname, path2)
            except BaseException:
                # Best-effort cleanup of the temporary file on any failure.
                with suppress(FileNotFoundError):
                    self.fs.delete_file(tmp_fname)
                raise

    @wrap_exceptions
    def mv(self, path1, path2, **kwargs):
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")
        self.fs.move(path1, path2)

    @wrap_exceptions
    def rm_file(self, path):
        path = self._strip_protocol(path)
        self.fs.delete_file(path)

    @wrap_exceptions
    def rm(self, path, recursive=False, maxdepth=None):
        """Delete a file, or a directory tree when ``recursive=True``."""
        path = self._strip_protocol(path).rstrip("/")
        if self.isdir(path):
            if recursive:
                self.fs.delete_dir(path)
            else:
                # Fixed message: previously said "recursive=False", which
                # inverted the actual requirement.
                raise ValueError("Can't delete directories without recursive=True")
        else:
            self.fs.delete_file(path)

    @wrap_exceptions
    def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
        """Open a pyarrow stream for ``path`` and wrap it in an ArrowFile."""
        if mode == "rb":
            # Seekable reads need a random-access file; otherwise a plain
            # (cheaper) input stream is sufficient.
            if seekable:
                method = self.fs.open_input_file
            else:
                method = self.fs.open_input_stream
        elif mode == "wb":
            method = self.fs.open_output_stream
        elif mode == "ab":
            method = self.fs.open_append_stream
        else:
            raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")

        _kwargs = {}
        if mode != "rb" or not seekable:
            if int(PYARROW_VERSION.split(".")[0]) >= 4:
                # disable compression auto-detection
                _kwargs["compression"] = None
        stream = method(path, **_kwargs)

        return ArrowFile(self, stream, path, mode, block_size, **kwargs)

    @wrap_exceptions
    def mkdir(self, path, create_parents=True, **kwargs):
        path = self._strip_protocol(path)
        if create_parents:
            self.makedirs(path, exist_ok=True)
        else:
            self.fs.create_dir(path, recursive=False)

    @wrap_exceptions
    def makedirs(self, path, exist_ok=False):
        # NOTE(review): pyarrow's create_dir(recursive=True) does not raise
        # for pre-existing directories, so ``exist_ok`` is effectively
        # ignored here - confirm whether strict behaviour is desired.
        path = self._strip_protocol(path)
        self.fs.create_dir(path, recursive=True)

    @wrap_exceptions
    def rmdir(self, path):
        path = self._strip_protocol(path)
        self.fs.delete_dir(path)

    @wrap_exceptions
    def modified(self, path):
        path = self._strip_protocol(path)
        return self.fs.get_file_info(path).mtime

    def cat_file(self, path, start=None, end=None, **kwargs):
        # A non-zero start offset requires a seekable stream; otherwise the
        # cheaper non-seekable input stream is used.
        kwargs.setdefault("seekable", start not in [None, 0])
        # Forward the requested byte range; previously start/end were
        # replaced by None here, so ranged reads returned the whole file.
        return super().cat_file(path, start=start, end=end, **kwargs)

    def get_file(self, rpath, lpath, **kwargs):
        # Sequential download: a non-seekable stream is sufficient.
        kwargs.setdefault("seekable", False)
        super().get_file(rpath, lpath, **kwargs)
214
+
215
+
216
@mirror_from(
    "stream",
    [
        "read",
        "seek",
        "tell",
        "write",
        "readable",
        "writable",
        "close",
        "seekable",
    ],
)
class ArrowFile(io.IOBase):
    """File-like object over a pyarrow stream.

    The I/O methods listed in the ``@mirror_from`` decorator are forwarded
    directly to ``self.stream``; only the attributes and the context-manager
    protocol are defined here.
    """

    def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
        """
        Parameters
        ----------
        fs : ArrowFSWrapper
            The filesystem that created this file.
        stream
            Underlying pyarrow input/output stream being delegated to.
        path : str
            Path of the file within ``fs``.
        mode : str
            Mode the stream was opened with ("rb", "wb" or "ab").
        block_size : int, optional
            Recorded for caller inspection; not used internally here.
        """
        self.path = path
        self.mode = mode

        self.fs = fs
        self.stream = stream

        # Both spellings kept so code expecting either attribute name works.
        self.blocksize = self.block_size = block_size
        self.kwargs = kwargs

    def __enter__(self):
        return self

    @property
    def size(self):
        # Only seekable pyarrow streams expose a size; otherwise unknown.
        if self.stream.seekable():
            return self.stream.size()
        return None

    def __exit__(self, *args):
        return self.close()
251
+
252
+
253
class HadoopFileSystem(ArrowFSWrapper):
    """Expose ``pyarrow.fs.HadoopFileSystem`` through the fsspec interface."""

    protocol = "hdfs"

    def __init__(
        self,
        host="default",
        port=0,
        user=None,
        kerb_ticket=None,
        replication=3,
        extra_conf=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        replication: int
            set replication factor of file for write operations. default value is 3.
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        from pyarrow.fs import HadoopFileSystem

        hdfs = HadoopFileSystem(
            host=host,
            port=port,
            user=user,
            kerb_ticket=kerb_ticket,
            replication=replication,
            extra_conf=extra_conf,
        )
        super().__init__(fs=hdfs, **kwargs)

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Derive constructor kwargs (host/user/port/replication) from a URL."""
        opts = infer_storage_options(path)
        # Map URL components onto constructor keyword names, skipping any
        # that are absent or empty.
        out = {
            kwarg: opts[src]
            for kwarg, src in (("host", "host"), ("user", "username"), ("port", "port"))
            if opts.get(src)
        }
        query = opts.get("url_query")
        if query:
            replication = parse_qs(query).get("replication")
            if replication:
                out["replication"] = int(replication[0])
        return out
venv/lib/python3.10/site-packages/fsspec/implementations/asyn_wrapper.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import functools
3
+ import inspect
4
+
5
+ import fsspec
6
+ from fsspec.asyn import AsyncFileSystem, running_async
7
+
8
+ from .chained import ChainedFileSystem
9
+
10
+
11
def async_wrapper(func, obj=None, semaphore=None):
    """
    Wraps a synchronous function to make it awaitable.

    The call is delegated to a worker thread via ``asyncio.to_thread`` so the
    event loop is never blocked.

    Parameters
    ----------
    func : callable
        The synchronous function to wrap.
    obj : object, optional
        Accepted for API compatibility; not used by the wrapper itself.
    semaphore : asyncio.Semaphore, optional
        A semaphore to limit concurrent calls.

    Returns
    -------
    coroutine
        An awaitable version of the function.
    """

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        if semaphore is None:
            return await asyncio.to_thread(func, *args, **kwargs)
        async with semaphore:
            return await asyncio.to_thread(func, *args, **kwargs)

    return wrapper
38
+
39
+
40
class AsyncFileSystemWrapper(AsyncFileSystem, ChainedFileSystem):
    """
    A wrapper class to convert a synchronous filesystem into an asynchronous one.

    This class takes an existing synchronous filesystem implementation and wraps all
    its methods to provide an asynchronous interface.

    Parameters
    ----------
    fs : AbstractFileSystem, optional
        The synchronous filesystem instance to wrap. If omitted, one is
        created from ``target_protocol``/``target_options``.
    """

    protocol = "asyncwrapper", "async_wrapper"
    # Instances bind dynamically-wrapped methods, so they must not be shared
    # via the fsspec instance cache.
    cachable = False

    def __init__(
        self,
        fs=None,
        asynchronous=None,
        target_protocol=None,
        target_options=None,
        semaphore=None,
        max_concurrent_tasks=None,  # accepted but not used in this class
        **kwargs,
    ):
        # Default to async mode when already inside a running event loop.
        if asynchronous is None:
            asynchronous = running_async()
        super().__init__(asynchronous=asynchronous, **kwargs)
        if fs is not None:
            self.sync_fs = fs
        else:
            # NOTE(review): when fs is None, target_options must be a dict
            # (it is unpacked here) - confirm callers always supply it.
            self.sync_fs = fsspec.filesystem(target_protocol, **target_options)
        # Mirror the wrapped filesystem's protocol on the instance.
        self.protocol = self.sync_fs.protocol
        self.semaphore = semaphore
        self._wrap_all_sync_methods()

    @property
    def fsid(self):
        return f"async_{self.sync_fs.fsid}"

    def _wrap_all_sync_methods(self):
        """
        Wrap all synchronous methods of the underlying filesystem with asynchronous versions.
        """
        # "open" is deliberately excluded from wrapping.
        excluded_methods = {"open"}
        for method_name in dir(self.sync_fs):
            if method_name.startswith("_") or method_name in excluded_methods:
                continue

            # Inspect the raw class attribute so that properties are not
            # triggered by the lookup.
            attr = inspect.getattr_static(self.sync_fs, method_name)
            if isinstance(attr, property):
                continue

            method = getattr(self.sync_fs, method_name)
            if callable(method) and not inspect.iscoroutinefunction(method):
                # Installed under an underscore-prefixed name, the convention
                # AsyncFileSystem uses for its async implementations.
                async_method = async_wrapper(method, obj=self, semaphore=self.semaphore)
                setattr(self, f"_{method_name}", async_method)

    @classmethod
    def wrap_class(cls, sync_fs_class):
        """
        Create a new class that can be used to instantiate an AsyncFileSystemWrapper
        with lazy instantiation of the underlying synchronous filesystem.

        Parameters
        ----------
        sync_fs_class : type
            The class of the synchronous filesystem to wrap.

        Returns
        -------
        type
            A new class that wraps the provided synchronous filesystem class.
        """

        class GeneratedAsyncFileSystemWrapper(cls):
            # Constructor args are forwarded to the sync class; the wrapper
            # itself is built around the resulting instance.
            def __init__(self, *args, **kwargs):
                sync_fs = sync_fs_class(*args, **kwargs)
                super().__init__(sync_fs)

        GeneratedAsyncFileSystemWrapper.__name__ = (
            f"Async{sync_fs_class.__name__}Wrapper"
        )
        return GeneratedAsyncFileSystemWrapper
venv/lib/python3.10/site-packages/fsspec/implementations/cache_mapper.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import abc
4
+ import hashlib
5
+
6
+ from fsspec.implementations.local import make_path_posix
7
+
8
+
9
class AbstractCacheMapper(abc.ABC):
    """Base class for mappers that turn a remote URL into the basename used
    for its locally cached copy.
    """

    @abc.abstractmethod
    def __call__(self, path: str) -> str:
        """Return the cached-file basename for *path*."""

    def __eq__(self, other: object) -> bool:
        # Mappers are interchangeable whenever they are of the same class;
        # subclasses carrying configuration must extend this check.
        return isinstance(other, type(self))

    def __hash__(self) -> int:
        # Must stay consistent with __eq__: identity is the class alone.
        return hash(type(self))
26
+
27
+
28
class BasenameCacheMapper(AbstractCacheMapper):
    """Cache mapper built from the remote URL's basename plus a configurable
    number of parent directory levels.

    With the default of zero directory levels, any two paths that share a
    basename map to the same cached name.
    """

    def __init__(self, directory_levels: int = 0):
        if directory_levels < 0:
            raise ValueError(
                "BasenameCacheMapper requires zero or positive directory_levels"
            )
        self.directory_levels = directory_levels

        # Stand-in for "/" when directory components are folded into the name.
        self._separator = "_@_"

    def __call__(self, path: str) -> str:
        posix = make_path_posix(path)
        head, *tail = posix.rsplit("/", self.directory_levels + 1)
        # No "/" found: the path already is a bare name.
        return self._separator.join(tail) if tail else head

    def __eq__(self, other: object) -> bool:
        # Same class AND same number of retained directory levels.
        return super().__eq__(other) and self.directory_levels == other.directory_levels

    def __hash__(self) -> int:
        return super().__hash__() ^ hash(self.directory_levels)
59
+
60
+
61
class HashCacheMapper(AbstractCacheMapper):
    """Cache mapper naming cached files by the SHA-256 hex digest of the URL."""

    def __call__(self, path: str) -> str:
        digest = hashlib.sha256(path.encode())
        return digest.hexdigest()
66
+
67
+
68
def create_cache_mapper(same_names: bool) -> AbstractCacheMapper:
    """Factory method to create cache mapper for backward compatibility with
    ``CachingFileSystem`` constructor using ``same_names`` kwarg.
    """
    return BasenameCacheMapper() if same_names else HashCacheMapper()
venv/lib/python3.10/site-packages/fsspec/implementations/cache_metadata.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import pickle
5
+ import time
6
+ from typing import TYPE_CHECKING
7
+
8
+ from fsspec.utils import atomic_write
9
+
10
+ try:
11
+ import ujson as json
12
+ except ImportError:
13
+ if not TYPE_CHECKING:
14
+ import json
15
+
16
+ if TYPE_CHECKING:
17
+ from collections.abc import Iterator
18
+ from typing import Any, Literal, TypeAlias
19
+
20
+ from .cached import CachingFileSystem
21
+
22
+ Detail: TypeAlias = dict[str, Any]
23
+
24
+
25
class CacheMetadata:
    """Cache metadata.

    All reading and writing of cache metadata is performed by this class,
    accessing the cached files and blocks is not.

    Metadata is stored in a single file per storage directory in JSON format.
    For backward compatibility, also reads metadata stored in pickle format
    which is converted to JSON when next saved.
    """

    def __init__(self, storage: list[str]):
        """

        Parameters
        ----------
        storage: list[str]
            Directories containing cached files, must be at least one. Metadata
            is stored in the last of these directories by convention.
        """
        if not storage:
            raise ValueError("CacheMetadata expects at least one storage location")

        self._storage = storage
        # One metadata dict per storage location, same order as self._storage.
        self.cached_files: list[Detail] = [{}]

        # Private attribute to force saving of metadata in pickle format rather than
        # JSON for use in tests to confirm can read both pickle and JSON formats.
        self._force_save_pickle = False

    def _load(self, fn: str) -> Detail:
        """Low-level function to load metadata from specific file"""
        try:
            with open(fn, "r") as f:
                loaded = json.load(f)
        except ValueError:
            # Not valid JSON: fall back to the legacy pickle format.
            with open(fn, "rb") as f:
                loaded = pickle.load(f)
        for c in loaded.values():
            # JSON stores block indices as lists; restore the in-memory sets.
            if isinstance(c.get("blocks"), list):
                c["blocks"] = set(c["blocks"])
        return loaded

    def _save(self, metadata_to_save: Detail, fn: str) -> None:
        """Low-level function to save metadata to specific file"""
        if self._force_save_pickle:
            with atomic_write(fn) as f:
                pickle.dump(metadata_to_save, f)
        else:
            with atomic_write(fn, mode="w") as f:
                json.dump(metadata_to_save, f)

    def _scan_locations(
        self, writable_only: bool = False
    ) -> Iterator[tuple[str, str, bool]]:
        """Yield locations (filenames) where metadata is stored, and whether
        writable or not.

        Parameters
        ----------
        writable_only: bool
            Set to True to only yield writable locations.

        Returns
        -------
        Yields (str, str, bool)
        """
        n = len(self._storage)
        for i, storage in enumerate(self._storage):
            # Only the last storage location is writable, by convention.
            writable = i == n - 1
            if writable_only and not writable:
                continue
            yield os.path.join(storage, "cache"), storage, writable

    def check_file(
        self, path: str, cfs: CachingFileSystem | None
    ) -> Literal[False] | tuple[Detail, str]:
        """If path is in cache return its details, otherwise return ``False``.

        If the optional CachingFileSystem is specified then it is used to
        perform extra checks to reject possible matches, such as if they are
        too old.
        """
        for (fn, base, _), cache in zip(self._scan_locations(), self.cached_files):
            if path not in cache:
                continue
            detail = cache[path].copy()

            if cfs is not None:
                if cfs.check_files and detail["uid"] != cfs.fs.ukey(path):
                    # Wrong file as determined by hash of file properties
                    continue
                if cfs.expiry and time.time() - detail["time"] > cfs.expiry:
                    # Cached file has expired
                    continue

            fn = os.path.join(base, detail["fn"])
            # Metadata may outlive the data file; only return a live match.
            if os.path.exists(fn):
                return detail, fn
        return False

    def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]:
        """Remove expired metadata from the cache.

        Returns names of files corresponding to expired metadata and a boolean
        flag indicating whether the writable cache is empty. Caller is
        responsible for deleting the expired files.
        """
        expired_files = []
        # Only the writable (last) location is pruned; iterate a copy since
        # entries are popped during the loop.
        for path, detail in self.cached_files[-1].copy().items():
            if time.time() - detail["time"] > expiry_time:
                fn = detail.get("fn", "")
                if not fn:
                    raise RuntimeError(
                        f"Cache metadata does not contain 'fn' for {path}"
                    )
                fn = os.path.join(self._storage[-1], fn)
                expired_files.append(fn)
                self.cached_files[-1].pop(path)

        if self.cached_files[-1]:
            cache_path = os.path.join(self._storage[-1], "cache")
            self._save(self.cached_files[-1], cache_path)

        writable_cache_empty = not self.cached_files[-1]
        return expired_files, writable_cache_empty

    def load(self) -> None:
        """Load all metadata from disk and store in ``self.cached_files``"""
        cached_files = []
        for fn, _, _ in self._scan_locations():
            if os.path.exists(fn):
                # TODO: consolidate blocks here
                cached_files.append(self._load(fn))
            else:
                cached_files.append({})
        self.cached_files = cached_files or [{}]

    def on_close_cached_file(self, f: Any, path: str) -> None:
        """Perform side-effect actions on closing a cached file.

        The actual closing of the file is the responsibility of the caller.
        """
        # File must be writable, so in self.cached_files[-1]
        c = self.cached_files[-1][path]
        # If every block has been fetched, collapse the block set to True
        # ("whole file cached").
        if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size:
            c["blocks"] = True

    def pop_file(self, path: str) -> str | None:
        """Remove metadata of cached file.

        If path is in the cache, return the filename of the cached file,
        otherwise return ``None``. Caller is responsible for deleting the
        cached file.
        """
        details = self.check_file(path, None)
        if not details:
            return None
        _, fn = details
        if fn.startswith(self._storage[-1]):
            self.cached_files[-1].pop(path)
            self.save()
        else:
            raise PermissionError(
                "Can only delete cached file in last, writable cache location"
            )
        return fn

    def save(self) -> None:
        """Save metadata to disk"""
        for (fn, _, writable), cache in zip(self._scan_locations(), self.cached_files):
            if not writable:
                continue

            if os.path.exists(fn):
                # Merge with what is already on disk (another process may
                # have written since we loaded).
                cached_files = self._load(fn)
                for k, c in cached_files.items():
                    if k in cache:
                        if c["blocks"] is True or cache[k]["blocks"] is True:
                            c["blocks"] = True
                        else:
                            # self.cached_files[*][*]["blocks"] must continue to
                            # point to the same set object so that updates
                            # performed by MMapCache are propagated back to
                            # self.cached_files.
                            blocks = cache[k]["blocks"]
                            blocks.update(c["blocks"])
                            c["blocks"] = blocks
                        c["time"] = max(c["time"], cache[k]["time"])
                        c["uid"] = cache[k]["uid"]

                # Files can be added to cache after it was written once
                for k, c in cache.items():
                    if k not in cached_files:
                        cached_files[k] = c
            else:
                cached_files = cache
            # Serialize a copy: sets become lists for JSON, leaving the
            # in-memory structures untouched.
            cache = {k: v.copy() for k, v in cached_files.items()}
            for c in cache.values():
                if isinstance(c["blocks"], set):
                    c["blocks"] = list(c["blocks"])
            self._save(cache, fn)
            self.cached_files[-1] = cached_files

    def update_file(self, path: str, detail: Detail) -> None:
        """Update metadata for specific file in memory, do not save"""
        self.cached_files[-1][path] = detail
venv/lib/python3.10/site-packages/fsspec/implementations/cached.py ADDED
@@ -0,0 +1,1021 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import inspect
4
+ import logging
5
+ import os
6
+ import tempfile
7
+ import time
8
+ import weakref
9
+ from collections.abc import Callable
10
+ from shutil import rmtree
11
+ from typing import TYPE_CHECKING, Any, ClassVar
12
+
13
+ from fsspec import filesystem
14
+ from fsspec.callbacks import DEFAULT_CALLBACK
15
+ from fsspec.compression import compr
16
+ from fsspec.core import BaseCache, MMapCache
17
+ from fsspec.exceptions import BlocksizeMismatchError
18
+ from fsspec.implementations.cache_mapper import create_cache_mapper
19
+ from fsspec.implementations.cache_metadata import CacheMetadata
20
+ from fsspec.implementations.chained import ChainedFileSystem
21
+ from fsspec.implementations.local import LocalFileSystem
22
+ from fsspec.spec import AbstractBufferedFile
23
+ from fsspec.transaction import Transaction
24
+ from fsspec.utils import infer_compression
25
+
26
+ if TYPE_CHECKING:
27
+ from fsspec.implementations.cache_mapper import AbstractCacheMapper
28
+
29
+ logger = logging.getLogger("fsspec.cached")
30
+
31
+
32
+ class WriteCachedTransaction(Transaction):
33
+ def complete(self, commit=True):
34
+ rpaths = [f.path for f in self.files]
35
+ lpaths = [f.fn for f in self.files]
36
+ if commit:
37
+ self.fs.put(lpaths, rpaths)
38
+ self.files.clear()
39
+ self.fs._intrans = False
40
+ self.fs._transaction = None
41
+ self.fs = None # break cycle
42
+
43
+
44
+ class CachingFileSystem(ChainedFileSystem):
45
+ """Locally caching filesystem, layer over any other FS
46
+
47
+ This class implements chunk-wise local storage of remote files, for quick
48
+ access after the initial download. The files are stored in a given
49
+ directory with hashes of URLs for the filenames. If no directory is given,
50
+ a temporary one is used, which should be cleaned up by the OS after the
51
+ process ends. The files themselves are sparse (as implemented in
52
+ :class:`~fsspec.caching.MMapCache`), so only the data which is accessed
53
+ takes up space.
54
+
55
+ Restrictions:
56
+
57
+ - the block-size must be the same for each access of a given file, unless
58
+ all blocks of the file have already been read
59
+ - caching can only be applied to file-systems which produce files
60
+ derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also
61
+ allowed, for testing
62
+ """
63
+
64
+ protocol: ClassVar[str | tuple[str, ...]] = ("blockcache", "cached")
65
+ _strip_tokenize_options = ("fo",)
66
+
67
+ def __init__(
68
+ self,
69
+ target_protocol=None,
70
+ cache_storage="TMP",
71
+ cache_check=10,
72
+ check_files=False,
73
+ expiry_time=604800,
74
+ target_options=None,
75
+ fs=None,
76
+ same_names: bool | None = None,
77
+ compression=None,
78
+ cache_mapper: AbstractCacheMapper | None = None,
79
+ **kwargs,
80
+ ):
81
+ """
82
+
83
+ Parameters
84
+ ----------
85
+ target_protocol: str (optional)
86
+ Target filesystem protocol. Provide either this or ``fs``.
87
+ cache_storage: str or list(str)
88
+ Location to store files. If "TMP", this is a temporary directory,
89
+ and will be cleaned up by the OS when this process ends (or later).
90
+ If a list, each location will be tried in the order given, but
91
+ only the last will be considered writable.
92
+ cache_check: int
93
+ Number of seconds between reload of cache metadata
94
+ check_files: bool
95
+ Whether to explicitly see if the UID of the remote file matches
96
+ the stored one before using. Warning: some file systems such as
97
+ HTTP cannot reliably give a unique hash of the contents of some
98
+ path, so be sure to set this option to False.
99
+ expiry_time: int
100
+ The time in seconds after which a local copy is considered useless.
101
+ Set to falsy to prevent expiry. The default is equivalent to one
102
+ week.
103
+ target_options: dict or None
104
+ Passed to the instantiation of the FS, if fs is None.
105
+ fs: filesystem instance
106
+ The target filesystem to run against. Provide this or ``protocol``.
107
+ same_names: bool (optional)
108
+ By default, target URLs are hashed using a ``HashCacheMapper`` so
109
+ that files from different backends with the same basename do not
110
+ conflict. If this argument is ``true``, a ``BasenameCacheMapper``
111
+ is used instead. Other cache mapper options are available by using
112
+ the ``cache_mapper`` keyword argument. Only one of this and
113
+ ``cache_mapper`` should be specified.
114
+ compression: str (optional)
115
+ To decompress on download. Can be 'infer' (guess from the URL name),
116
+ one of the entries in ``fsspec.compression.compr``, or None for no
117
+ decompression.
118
+ cache_mapper: AbstractCacheMapper (optional)
119
+ The object use to map from original filenames to cached filenames.
120
+ Only one of this and ``same_names`` should be specified.
121
+ """
122
+ super().__init__(**kwargs)
123
+ if fs is None and target_protocol is None:
124
+ raise ValueError(
125
+ "Please provide filesystem instance(fs) or target_protocol"
126
+ )
127
+ if not (fs is None) ^ (target_protocol is None):
128
+ raise ValueError(
129
+ "Both filesystems (fs) and target_protocol may not be both given."
130
+ )
131
+ if cache_storage == "TMP":
132
+ tempdir = tempfile.mkdtemp()
133
+ storage = [tempdir]
134
+ weakref.finalize(self, self._remove_tempdir, tempdir)
135
+ else:
136
+ if isinstance(cache_storage, str):
137
+ storage = [cache_storage]
138
+ else:
139
+ storage = cache_storage
140
+ os.makedirs(storage[-1], exist_ok=True)
141
+ self.storage = storage
142
+ self.kwargs = target_options or {}
143
+ self.cache_check = cache_check
144
+ self.check_files = check_files
145
+ self.expiry = expiry_time
146
+ self.compression = compression
147
+
148
+ # Size of cache in bytes. If None then the size is unknown and will be
149
+ # recalculated the next time cache_size() is called. On writes to the
150
+ # cache this is reset to None.
151
+ self._cache_size = None
152
+
153
+ if same_names is not None and cache_mapper is not None:
154
+ raise ValueError(
155
+ "Cannot specify both same_names and cache_mapper in "
156
+ "CachingFileSystem.__init__"
157
+ )
158
+ if cache_mapper is not None:
159
+ self._mapper = cache_mapper
160
+ else:
161
+ self._mapper = create_cache_mapper(
162
+ same_names if same_names is not None else False
163
+ )
164
+
165
+ self.target_protocol = (
166
+ target_protocol
167
+ if isinstance(target_protocol, str)
168
+ else (fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0])
169
+ )
170
+ self._metadata = CacheMetadata(self.storage)
171
+ self.load_cache()
172
+ self.fs = fs if fs is not None else filesystem(target_protocol, **self.kwargs)
173
+
174
+ def _strip_protocol(path):
175
+ # acts as a method, since each instance has a difference target
176
+ return self.fs._strip_protocol(type(self)._strip_protocol(path))
177
+
178
+ self._strip_protocol: Callable = _strip_protocol
179
+
180
+ @staticmethod
181
+ def _remove_tempdir(tempdir):
182
+ try:
183
+ rmtree(tempdir)
184
+ except Exception:
185
+ pass
186
+
187
+ def _mkcache(self):
188
+ os.makedirs(self.storage[-1], exist_ok=True)
189
+
190
+ def cache_size(self):
191
+ """Return size of cache in bytes.
192
+
193
+ If more than one cache directory is in use, only the size of the last
194
+ one (the writable cache directory) is returned.
195
+ """
196
+ if self._cache_size is None:
197
+ cache_dir = self.storage[-1]
198
+ self._cache_size = filesystem("file").du(cache_dir, withdirs=True)
199
+ return self._cache_size
200
+
201
+ def load_cache(self):
202
+ """Read set of stored blocks from file"""
203
+ self._metadata.load()
204
+ self._mkcache()
205
+ self.last_cache = time.time()
206
+
207
+ def save_cache(self):
208
+ """Save set of stored blocks from file"""
209
+ self._mkcache()
210
+ self._metadata.save()
211
+ self.last_cache = time.time()
212
+ self._cache_size = None
213
+
214
+ def _check_cache(self):
215
+ """Reload caches if time elapsed or any disappeared"""
216
+ self._mkcache()
217
+ if not self.cache_check:
218
+ # explicitly told not to bother checking
219
+ return
220
+ timecond = time.time() - self.last_cache > self.cache_check
221
+ existcond = all(os.path.exists(storage) for storage in self.storage)
222
+ if timecond or not existcond:
223
+ self.load_cache()
224
+
225
+ def _check_file(self, path):
226
+ """Is path in cache and still valid"""
227
+ path = self._strip_protocol(path)
228
+ self._check_cache()
229
+ return self._metadata.check_file(path, self)
230
+
231
+ def clear_cache(self):
232
+ """Remove all files and metadata from the cache
233
+
234
+ In the case of multiple cache locations, this clears only the last one,
235
+ which is assumed to be the read/write one.
236
+ """
237
+ rmtree(self.storage[-1])
238
+ self.load_cache()
239
+ self._cache_size = None
240
+
241
+ def clear_expired_cache(self, expiry_time=None):
242
+ """Remove all expired files and metadata from the cache
243
+
244
+ In the case of multiple cache locations, this clears only the last one,
245
+ which is assumed to be the read/write one.
246
+
247
+ Parameters
248
+ ----------
249
+ expiry_time: int
250
+ The time in seconds after which a local copy is considered useless.
251
+ If not defined the default is equivalent to the attribute from the
252
+ file caching instantiation.
253
+ """
254
+
255
+ if not expiry_time:
256
+ expiry_time = self.expiry
257
+
258
+ self._check_cache()
259
+
260
+ expired_files, writable_cache_empty = self._metadata.clear_expired(expiry_time)
261
+ for fn in expired_files:
262
+ if os.path.exists(fn):
263
+ os.remove(fn)
264
+
265
+ if writable_cache_empty:
266
+ rmtree(self.storage[-1])
267
+ self.load_cache()
268
+
269
+ self._cache_size = None
270
+
271
+ def pop_from_cache(self, path):
272
+ """Remove cached version of given file
273
+
274
+ Deletes local copy of the given (remote) path. If it is found in a cache
275
+ location which is not the last, it is assumed to be read-only, and
276
+ raises PermissionError
277
+ """
278
+ path = self._strip_protocol(path)
279
+ fn = self._metadata.pop_file(path)
280
+ if fn is not None:
281
+ os.remove(fn)
282
+ self._cache_size = None
283
+
284
+ def _open(
285
+ self,
286
+ path,
287
+ mode="rb",
288
+ block_size=None,
289
+ autocommit=True,
290
+ cache_options=None,
291
+ **kwargs,
292
+ ):
293
+ """Wrap the target _open
294
+
295
+ If the whole file exists in the cache, just open it locally and
296
+ return that.
297
+
298
+ Otherwise, open the file on the target FS, and make it have a mmap
299
+ cache pointing to the location which we determine, in our cache.
300
+ The ``blocks`` instance is shared, so as the mmap cache instance
301
+ updates, so does the entry in our ``cached_files`` attribute.
302
+ We monkey-patch this file, so that when it closes, we call
303
+ ``close_and_update`` to save the state of the blocks.
304
+ """
305
+ path = self._strip_protocol(path)
306
+
307
+ path = self.fs._strip_protocol(path)
308
+ if "r" not in mode:
309
+ return self.fs._open(
310
+ path,
311
+ mode=mode,
312
+ block_size=block_size,
313
+ autocommit=autocommit,
314
+ cache_options=cache_options,
315
+ **kwargs,
316
+ )
317
+ detail = self._check_file(path)
318
+ if detail:
319
+ # file is in cache
320
+ detail, fn = detail
321
+ hash, blocks = detail["fn"], detail["blocks"]
322
+ if blocks is True:
323
+ # stored file is complete
324
+ logger.debug("Opening local copy of %s", path)
325
+ return open(fn, mode)
326
+ # TODO: action where partial file exists in read-only cache
327
+ logger.debug("Opening partially cached copy of %s", path)
328
+ else:
329
+ hash = self._mapper(path)
330
+ fn = os.path.join(self.storage[-1], hash)
331
+ blocks = set()
332
+ detail = {
333
+ "original": path,
334
+ "fn": hash,
335
+ "blocks": blocks,
336
+ "time": time.time(),
337
+ "uid": self.fs.ukey(path),
338
+ }
339
+ self._metadata.update_file(path, detail)
340
+ logger.debug("Creating local sparse file for %s", path)
341
+
342
+ # explicitly submitting the size to the open call will avoid extra
343
+ # operations when opening. This is particularly relevant
344
+ # for any file that is read over a network, e.g. S3.
345
+ size = detail.get("size")
346
+
347
+ # call target filesystems open
348
+ self._mkcache()
349
+ f = self.fs._open(
350
+ path,
351
+ mode=mode,
352
+ block_size=block_size,
353
+ autocommit=autocommit,
354
+ cache_options=cache_options,
355
+ cache_type="none",
356
+ size=size,
357
+ **kwargs,
358
+ )
359
+
360
+ # set size if not already set
361
+ if size is None:
362
+ detail["size"] = f.size
363
+ self._metadata.update_file(path, detail)
364
+
365
+ if self.compression:
366
+ comp = (
367
+ infer_compression(path)
368
+ if self.compression == "infer"
369
+ else self.compression
370
+ )
371
+ f = compr[comp](f, mode="rb")
372
+ if "blocksize" in detail:
373
+ if detail["blocksize"] != f.blocksize:
374
+ raise BlocksizeMismatchError(
375
+ f"Cached file must be reopened with same block"
376
+ f" size as original (old: {detail['blocksize']},"
377
+ f" new {f.blocksize})"
378
+ )
379
+ else:
380
+ detail["blocksize"] = f.blocksize
381
+
382
+ def _fetch_ranges(ranges):
383
+ return self.fs.cat_ranges(
384
+ [path] * len(ranges),
385
+ [r[0] for r in ranges],
386
+ [r[1] for r in ranges],
387
+ **kwargs,
388
+ )
389
+
390
+ multi_fetcher = None if self.compression else _fetch_ranges
391
+ f.cache = MMapCache(
392
+ f.blocksize, f._fetch_range, f.size, fn, blocks, multi_fetcher=multi_fetcher
393
+ )
394
+ close = f.close
395
+ f.close = lambda: self.close_and_update(f, close)
396
+ self.save_cache()
397
+ return f
398
+
399
+ def _parent(self, path):
400
+ return self.fs._parent(path)
401
+
402
+ def hash_name(self, path: str, *args: Any) -> str:
403
+ # Kept for backward compatibility with downstream libraries.
404
+ # Ignores extra arguments, previously same_name boolean.
405
+ return self._mapper(path)
406
+
407
+ def close_and_update(self, f, close):
408
+ """Called when a file is closing, so store the set of blocks"""
409
+ if f.closed:
410
+ return
411
+ path = self._strip_protocol(f.path)
412
+ self._metadata.on_close_cached_file(f, path)
413
+ try:
414
+ logger.debug("going to save")
415
+ self.save_cache()
416
+ logger.debug("saved")
417
+ except OSError:
418
+ logger.debug("Cache saving failed while closing file")
419
+ except NameError:
420
+ logger.debug("Cache save failed due to interpreter shutdown")
421
+ close()
422
+ f.closed = True
423
+
424
+ def ls(self, path, detail=True):
425
+ return self.fs.ls(path, detail)
426
+
427
+ def __getattribute__(self, item):
428
+ if item in {
429
+ "load_cache",
430
+ "_get_cached_file_before_open",
431
+ "_open",
432
+ "save_cache",
433
+ "close_and_update",
434
+ "__init__",
435
+ "__getattribute__",
436
+ "__reduce__",
437
+ "_make_local_details",
438
+ "open",
439
+ "cat",
440
+ "cat_file",
441
+ "_cat_file",
442
+ "cat_ranges",
443
+ "_cat_ranges",
444
+ "get",
445
+ "read_block",
446
+ "tail",
447
+ "head",
448
+ "info",
449
+ "ls",
450
+ "exists",
451
+ "isfile",
452
+ "isdir",
453
+ "_check_file",
454
+ "_check_cache",
455
+ "_mkcache",
456
+ "clear_cache",
457
+ "clear_expired_cache",
458
+ "pop_from_cache",
459
+ "local_file",
460
+ "_paths_from_path",
461
+ "get_mapper",
462
+ "open_many",
463
+ "commit_many",
464
+ "hash_name",
465
+ "__hash__",
466
+ "__eq__",
467
+ "to_json",
468
+ "to_dict",
469
+ "cache_size",
470
+ "pipe_file",
471
+ "pipe",
472
+ "start_transaction",
473
+ "end_transaction",
474
+ }:
475
+ # all the methods defined in this class. Note `open` here, since
476
+ # it calls `_open`, but is actually in superclass
477
+ return lambda *args, **kw: getattr(type(self), item).__get__(self)(
478
+ *args, **kw
479
+ )
480
+ if item in ["__reduce_ex__"]:
481
+ raise AttributeError
482
+ if item in ["transaction"]:
483
+ # property
484
+ return type(self).transaction.__get__(self)
485
+ if item in {"_cache", "transaction_type", "protocol"}:
486
+ # class attributes
487
+ return getattr(type(self), item)
488
+ if item == "__class__":
489
+ return type(self)
490
+ d = object.__getattribute__(self, "__dict__")
491
+ fs = d.get("fs", None) # fs is not immediately defined
492
+ if item in d:
493
+ return d[item]
494
+ elif fs is not None:
495
+ if item in fs.__dict__:
496
+ # attribute of instance
497
+ return fs.__dict__[item]
498
+ # attributed belonging to the target filesystem
499
+ cls = type(fs)
500
+ m = getattr(cls, item)
501
+ if (inspect.isfunction(m) or inspect.isdatadescriptor(m)) and (
502
+ not hasattr(m, "__self__") or m.__self__ is None
503
+ ):
504
+ # instance method
505
+ return m.__get__(fs, cls)
506
+ return m # class method or attribute
507
+ else:
508
+ # attributes of the superclass, while target is being set up
509
+ return super().__getattribute__(item)
510
+
511
+ def __eq__(self, other):
512
+ """Test for equality."""
513
+ if self is other:
514
+ return True
515
+ if not isinstance(other, type(self)):
516
+ return False
517
+ return (
518
+ self.storage == other.storage
519
+ and self.kwargs == other.kwargs
520
+ and self.cache_check == other.cache_check
521
+ and self.check_files == other.check_files
522
+ and self.expiry == other.expiry
523
+ and self.compression == other.compression
524
+ and self._mapper == other._mapper
525
+ and self.target_protocol == other.target_protocol
526
+ )
527
+
528
+ def __hash__(self):
529
+ """Calculate hash."""
530
+ return (
531
+ hash(tuple(self.storage))
532
+ ^ hash(str(self.kwargs))
533
+ ^ hash(self.cache_check)
534
+ ^ hash(self.check_files)
535
+ ^ hash(self.expiry)
536
+ ^ hash(self.compression)
537
+ ^ hash(self._mapper)
538
+ ^ hash(self.target_protocol)
539
+ )
540
+
541
+
542
+ class WholeFileCacheFileSystem(CachingFileSystem):
543
+ """Caches whole remote files on first access
544
+
545
+ This class is intended as a layer over any other file system, and
546
+ will make a local copy of each file accessed, so that all subsequent
547
+ reads are local. This is similar to ``CachingFileSystem``, but without
548
+ the block-wise functionality and so can work even when sparse files
549
+ are not allowed. See its docstring for definition of the init
550
+ arguments.
551
+
552
+ The class still needs access to the remote store for listing files,
553
+ and may refresh cached files.
554
+ """
555
+
556
+ protocol = "filecache"
557
+ local_file = True
558
+
559
+ def open_many(self, open_files, **kwargs):
560
+ paths = [of.path for of in open_files]
561
+ if "r" in open_files.mode:
562
+ self._mkcache()
563
+ else:
564
+ return [
565
+ LocalTempFile(
566
+ self.fs,
567
+ path,
568
+ mode=open_files.mode,
569
+ fn=os.path.join(self.storage[-1], self._mapper(path)),
570
+ **kwargs,
571
+ )
572
+ for path in paths
573
+ ]
574
+
575
+ if self.compression:
576
+ raise NotImplementedError
577
+ details = [self._check_file(sp) for sp in paths]
578
+ downpath = [p for p, d in zip(paths, details) if not d]
579
+ downfn0 = [
580
+ os.path.join(self.storage[-1], self._mapper(p))
581
+ for p, d in zip(paths, details)
582
+ ] # keep these path names for opening later
583
+ downfn = [fn for fn, d in zip(downfn0, details) if not d]
584
+ if downpath:
585
+ # skip if all files are already cached and up to date
586
+ self.fs.get(downpath, downfn)
587
+
588
+ # update metadata - only happens when downloads are successful
589
+ newdetail = [
590
+ {
591
+ "original": path,
592
+ "fn": self._mapper(path),
593
+ "blocks": True,
594
+ "time": time.time(),
595
+ "uid": self.fs.ukey(path),
596
+ }
597
+ for path in downpath
598
+ ]
599
+ for path, detail in zip(downpath, newdetail):
600
+ self._metadata.update_file(path, detail)
601
+ self.save_cache()
602
+
603
+ def firstpart(fn):
604
+ # helper to adapt both whole-file and simple-cache
605
+ return fn[1] if isinstance(fn, tuple) else fn
606
+
607
+ return [
608
+ open(firstpart(fn0) if fn0 else fn1, mode=open_files.mode)
609
+ for fn0, fn1 in zip(details, downfn0)
610
+ ]
611
+
612
+ def commit_many(self, open_files):
613
+ self.fs.put([f.fn for f in open_files], [f.path for f in open_files])
614
+ [f.close() for f in open_files]
615
+ for f in open_files:
616
+ # in case autocommit is off, and so close did not already delete
617
+ try:
618
+ os.remove(f.name)
619
+ except FileNotFoundError:
620
+ pass
621
+ self._cache_size = None
622
+
623
+ def _make_local_details(self, path):
624
+ hash = self._mapper(path)
625
+ fn = os.path.join(self.storage[-1], hash)
626
+ detail = {
627
+ "original": path,
628
+ "fn": hash,
629
+ "blocks": True,
630
+ "time": time.time(),
631
+ "uid": self.fs.ukey(path),
632
+ }
633
+ self._metadata.update_file(path, detail)
634
+ logger.debug("Copying %s to local cache", path)
635
+ return fn
636
+
637
+ def cat(
638
+ self,
639
+ path,
640
+ recursive=False,
641
+ on_error="raise",
642
+ callback=DEFAULT_CALLBACK,
643
+ **kwargs,
644
+ ):
645
+ paths = self.expand_path(
646
+ path, recursive=recursive, maxdepth=kwargs.get("maxdepth")
647
+ )
648
+ getpaths = []
649
+ storepaths = []
650
+ fns = []
651
+ out = {}
652
+ for p in paths.copy():
653
+ try:
654
+ detail = self._check_file(p)
655
+ if not detail:
656
+ fn = self._make_local_details(p)
657
+ getpaths.append(p)
658
+ storepaths.append(fn)
659
+ else:
660
+ detail, fn = detail if isinstance(detail, tuple) else (None, detail)
661
+ fns.append(fn)
662
+ except Exception as e:
663
+ if on_error == "raise":
664
+ raise
665
+ if on_error == "return":
666
+ out[p] = e
667
+ paths.remove(p)
668
+
669
+ if getpaths:
670
+ self.fs.get(getpaths, storepaths)
671
+ self.save_cache()
672
+
673
+ callback.set_size(len(paths))
674
+ for p, fn in zip(paths, fns):
675
+ with open(fn, "rb") as f:
676
+ out[p] = f.read()
677
+ callback.relative_update(1)
678
+ if isinstance(path, str) and len(paths) == 1 and recursive is False:
679
+ out = out[paths[0]]
680
+ return out
681
+
682
+ def _get_cached_file_before_open(self, path, **kwargs):
683
+ fn = self._make_local_details(path)
684
+ # call target filesystems open
685
+ self._mkcache()
686
+ if self.compression:
687
+ with self.fs._open(path, mode="rb", **kwargs) as f, open(fn, "wb") as f2:
688
+ if isinstance(f, AbstractBufferedFile):
689
+ # want no type of caching if just downloading whole thing
690
+ f.cache = BaseCache(0, f.cache.fetcher, f.size)
691
+ comp = (
692
+ infer_compression(path)
693
+ if self.compression == "infer"
694
+ else self.compression
695
+ )
696
+ f = compr[comp](f, mode="rb")
697
+ data = True
698
+ while data:
699
+ block = getattr(f, "blocksize", 5 * 2**20)
700
+ data = f.read(block)
701
+ f2.write(data)
702
+ else:
703
+ self.fs.get_file(path, fn)
704
+ self.save_cache()
705
+
706
+ def _open(self, path, mode="rb", **kwargs):
707
+ path = self._strip_protocol(path)
708
+ # For read (or append), (try) download from remote
709
+ if "r" in mode or "a" in mode:
710
+ if not self._check_file(path):
711
+ if self.fs.exists(path):
712
+ self._get_cached_file_before_open(path, **kwargs)
713
+ elif "r" in mode:
714
+ raise FileNotFoundError(path)
715
+
716
+ detail, fn = self._check_file(path)
717
+ _, blocks = detail["fn"], detail["blocks"]
718
+ if blocks is True:
719
+ logger.debug("Opening local copy of %s", path)
720
+ else:
721
+ raise ValueError(
722
+ f"Attempt to open partially cached file {path}"
723
+ f" as a wholly cached file"
724
+ )
725
+
726
+ # Just reading does not need special file handling
727
+ if "r" in mode and "+" not in mode:
728
+ # In order to support downstream filesystems to be able to
729
+ # infer the compression from the original filename, like
730
+ # the `TarFileSystem`, let's extend the `io.BufferedReader`
731
+ # fileobject protocol by adding a dedicated attribute
732
+ # `original`.
733
+ f = open(fn, mode)
734
+ f.original = detail.get("original")
735
+ return f
736
+
737
+ hash = self._mapper(path)
738
+ fn = os.path.join(self.storage[-1], hash)
739
+ user_specified_kwargs = {
740
+ k: v
741
+ for k, v in kwargs.items()
742
+ # those kwargs were added by open(), we don't want them
743
+ if k not in ["autocommit", "block_size", "cache_options"]
744
+ }
745
+ return LocalTempFile(self, path, mode=mode, fn=fn, **user_specified_kwargs)
746
+
747
+
748
+ class SimpleCacheFileSystem(WholeFileCacheFileSystem):
749
+ """Caches whole remote files on first access
750
+
751
+ This class is intended as a layer over any other file system, and
752
+ will make a local copy of each file accessed, so that all subsequent
753
+ reads are local. This implementation only copies whole files, and
754
+ does not keep any metadata about the download time or file details.
755
+ It is therefore safer to use in multi-threaded/concurrent situations.
756
+
757
+ This is the only of the caching filesystems that supports write: you will
758
+ be given a real local open file, and upon close and commit, it will be
759
+ uploaded to the target filesystem; the writability or the target URL is
760
+ not checked until that time.
761
+
762
+ """
763
+
764
+ protocol = "simplecache"
765
+ local_file = True
766
+ transaction_type = WriteCachedTransaction
767
+
768
+ def __init__(self, **kwargs):
769
+ kw = kwargs.copy()
770
+ for key in ["cache_check", "expiry_time", "check_files"]:
771
+ kw[key] = False
772
+ super().__init__(**kw)
773
+ for storage in self.storage:
774
+ if not os.path.exists(storage):
775
+ os.makedirs(storage, exist_ok=True)
776
+
777
+ def _check_file(self, path):
778
+ self._check_cache()
779
+ sha = self._mapper(path)
780
+ for storage in self.storage:
781
+ fn = os.path.join(storage, sha)
782
+ if os.path.exists(fn):
783
+ return fn
784
+
785
+ def save_cache(self):
786
+ pass
787
+
788
+ def load_cache(self):
789
+ pass
790
+
791
+ def pipe_file(self, path, value=None, **kwargs):
792
+ if self._intrans:
793
+ with self.open(path, "wb") as f:
794
+ f.write(value)
795
+ else:
796
+ super().pipe_file(path, value)
797
+
798
+ def ls(self, path, detail=True, **kwargs):
799
+ path = self._strip_protocol(path)
800
+ details = []
801
+ try:
802
+ details = self.fs.ls(
803
+ path, detail=True, **kwargs
804
+ ).copy() # don't edit original!
805
+ except FileNotFoundError as e:
806
+ ex = e
807
+ else:
808
+ ex = None
809
+ if self._intrans:
810
+ path1 = path.rstrip("/") + "/"
811
+ for f in self.transaction.files:
812
+ if f.path == path:
813
+ details.append(
814
+ {"name": path, "size": f.size or f.tell(), "type": "file"}
815
+ )
816
+ elif f.path.startswith(path1):
817
+ if f.path.count("/") == path1.count("/"):
818
+ details.append(
819
+ {"name": f.path, "size": f.size or f.tell(), "type": "file"}
820
+ )
821
+ else:
822
+ dname = "/".join(f.path.split("/")[: path1.count("/") + 1])
823
+ details.append({"name": dname, "size": 0, "type": "directory"})
824
+ if ex is not None and not details:
825
+ raise ex
826
+ if detail:
827
+ return details
828
+ return sorted(_["name"] for _ in details)
829
+
830
+ def info(self, path, **kwargs):
831
+ path = self._strip_protocol(path)
832
+ if self._intrans:
833
+ f = [_ for _ in self.transaction.files if _.path == path]
834
+ if f:
835
+ size = os.path.getsize(f[0].fn) if f[0].closed else f[0].tell()
836
+ return {"name": path, "size": size, "type": "file"}
837
+ f = any(_.path.startswith(path + "/") for _ in self.transaction.files)
838
+ if f:
839
+ return {"name": path, "size": 0, "type": "directory"}
840
+ return self.fs.info(path, **kwargs)
841
+
842
+ def pipe(self, path, value=None, **kwargs):
843
+ if isinstance(path, str):
844
+ self.pipe_file(self._strip_protocol(path), value, **kwargs)
845
+ elif isinstance(path, dict):
846
+ for k, v in path.items():
847
+ self.pipe_file(self._strip_protocol(k), v, **kwargs)
848
+ else:
849
+ raise ValueError("path must be str or dict")
850
+
851
+ async def _cat_file(self, path, start=None, end=None, **kwargs):
852
+ logger.debug("async cat_file %s", path)
853
+ path = self._strip_protocol(path)
854
+ sha = self._mapper(path)
855
+ fn = self._check_file(path)
856
+
857
+ if not fn:
858
+ fn = os.path.join(self.storage[-1], sha)
859
+ await self.fs._get_file(path, fn, **kwargs)
860
+
861
+ with open(fn, "rb") as f: # noqa ASYNC230
862
+ if start:
863
+ f.seek(start)
864
+ size = -1 if end is None else end - f.tell()
865
+ return f.read(size)
866
+
867
+ async def _cat_ranges(
868
+ self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
869
+ ):
870
+ logger.debug("async cat ranges %s", paths)
871
+ lpaths = []
872
+ rset = set()
873
+ download = []
874
+ rpaths = []
875
+ for p in paths:
876
+ fn = self._check_file(p)
877
+ if fn is None and p not in rset:
878
+ sha = self._mapper(p)
879
+ fn = os.path.join(self.storage[-1], sha)
880
+ download.append(fn)
881
+ rset.add(p)
882
+ rpaths.append(p)
883
+ lpaths.append(fn)
884
+ if download:
885
+ await self.fs._get(rpaths, download, on_error=on_error)
886
+
887
+ return LocalFileSystem().cat_ranges(
888
+ lpaths, starts, ends, max_gap=max_gap, on_error=on_error, **kwargs
889
+ )
890
+
891
+ def cat_ranges(
892
+ self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
893
+ ):
894
+ logger.debug("cat ranges %s", paths)
895
+ lpaths = [self._check_file(p) for p in paths]
896
+ rpaths = [p for l, p in zip(lpaths, paths) if l is False]
897
+ lpaths = [l for l, p in zip(lpaths, paths) if l is False]
898
+ self.fs.get(rpaths, lpaths)
899
+ paths = [self._check_file(p) for p in paths]
900
+ return LocalFileSystem().cat_ranges(
901
+ paths, starts, ends, max_gap=max_gap, on_error=on_error, **kwargs
902
+ )
903
+
904
+ def _get_cached_file_before_open(self, path, **kwargs):
905
+ sha = self._mapper(path)
906
+ fn = os.path.join(self.storage[-1], sha)
907
+ logger.debug("Copying %s to local cache", path)
908
+
909
+ self._mkcache()
910
+ self._cache_size = None
911
+
912
+ if self.compression:
913
+ with self.fs._open(path, mode="rb", **kwargs) as f, open(fn, "wb") as f2:
914
+ if isinstance(f, AbstractBufferedFile):
915
+ # want no type of caching if just downloading whole thing
916
+ f.cache = BaseCache(0, f.cache.fetcher, f.size)
917
+ comp = (
918
+ infer_compression(path)
919
+ if self.compression == "infer"
920
+ else self.compression
921
+ )
922
+ f = compr[comp](f, mode="rb")
923
+ data = True
924
+ while data:
925
+ block = getattr(f, "blocksize", 5 * 2**20)
926
+ data = f.read(block)
927
+ f2.write(data)
928
+ else:
929
+ self.fs.get_file(path, fn)
930
+
931
+ def _open(self, path, mode="rb", **kwargs):
932
+ path = self._strip_protocol(path)
933
+ sha = self._mapper(path)
934
+
935
+ # For read (or append), (try) download from remote
936
+ if "r" in mode or "a" in mode:
937
+ if not self._check_file(path):
938
+ # append does not require an existing file but read does
939
+ if self.fs.exists(path):
940
+ self._get_cached_file_before_open(path, **kwargs)
941
+ elif "r" in mode:
942
+ raise FileNotFoundError(path)
943
+
944
+ fn = self._check_file(path)
945
+ # Just reading does not need special file handling
946
+ if "r" in mode and "+" not in mode:
947
+ return open(fn, mode)
948
+
949
+ fn = os.path.join(self.storage[-1], sha)
950
+ user_specified_kwargs = {
951
+ k: v
952
+ for k, v in kwargs.items()
953
+ if k not in ["autocommit", "block_size", "cache_options"]
954
+ } # those were added by open()
955
+ return LocalTempFile(
956
+ self,
957
+ path,
958
+ mode=mode,
959
+ autocommit=not self._intrans,
960
+ fn=fn,
961
+ **user_specified_kwargs,
962
+ )
963
+
964
+
965
+ class LocalTempFile:
966
+ """A temporary local file, which will be uploaded on commit"""
967
+
968
+ def __init__(self, fs, path, fn, mode="wb", autocommit=True, seek=0, **kwargs):
969
+ self.fn = fn
970
+ self.fh = open(fn, mode)
971
+ self.mode = mode
972
+ if seek:
973
+ self.fh.seek(seek)
974
+ self.path = path
975
+ self.size = None
976
+ self.fs = fs
977
+ self.closed = False
978
+ self.autocommit = autocommit
979
+ self.kwargs = kwargs
980
+
981
+ def __reduce__(self):
982
+ # always open in r+b to allow continuing writing at a location
983
+ return (
984
+ LocalTempFile,
985
+ (self.fs, self.path, self.fn, "r+b", self.autocommit, self.tell()),
986
+ )
987
+
988
+ def __enter__(self):
989
+ return self.fh
990
+
991
+ def __exit__(self, exc_type, exc_val, exc_tb):
992
+ self.close()
993
+
994
+ def close(self):
995
+ # self.size = self.fh.tell()
996
+ if self.closed:
997
+ return
998
+ self.fh.close()
999
+ self.closed = True
1000
+ if self.autocommit:
1001
+ self.commit()
1002
+
1003
+ def discard(self):
1004
+ self.fh.close()
1005
+ os.remove(self.fn)
1006
+
1007
+ def commit(self):
1008
+ # calling put() with list arguments avoids path expansion and additional operations
1009
+ # like isdir()
1010
+ self.fs.put([self.fn], [self.path], **self.kwargs)
1011
+ # we do not delete the local copy, it's still in the cache.
1012
+
1013
+ @property
1014
+ def name(self):
1015
+ return self.fn
1016
+
1017
+ def __repr__(self) -> str:
1018
+ return f"LocalTempFile: {self.path}"
1019
+
1020
+ def __getattr__(self, item):
1021
+ return getattr(self.fh, item)
venv/lib/python3.10/site-packages/fsspec/implementations/chained.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import ClassVar
2
+
3
+ from fsspec import AbstractFileSystem
4
+
5
+ __all__ = ("ChainedFileSystem",)
6
+
7
+
8
class ChainedFileSystem(AbstractFileSystem):
    """Marker base class for filesystems that layer over another filesystem.

    A "chained" filesystem wraps an inner FS and adds behaviour on top of
    it (caching is the canonical example). The class itself implements
    almost nothing; its purpose is to signal that instances are designed
    for chaining.

    Currently only ``url_to_fs`` consults this marker, using it to supply
    the path argument (``fo``) of the chained filesystem from the
    underlying filesystem. More functionality may be added here later.
    """

    protocol: ClassVar[str] = "chained"
venv/lib/python3.10/site-packages/fsspec/implementations/dask.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dask
2
+ from distributed.client import Client, _get_global_client
3
+ from distributed.worker import Worker
4
+
5
+ from fsspec import filesystem
6
+ from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
7
+ from fsspec.utils import infer_storage_options
8
+
9
+
10
def _get_client(client):
    """Resolve *client* to a live distributed ``Client``.

    ``None`` falls back to the current global client; an existing
    ``Client`` instance is passed through unchanged; anything else
    (e.g. a scheduler connection string) is used to construct a new one.
    """
    if client is None:
        return _get_global_client()
    if isinstance(client, Client):
        return client
    # e.g., connection string
    return Client(client)
18
+
19
+
20
def _in_worker():
    """True when this process is running inside a dask worker."""
    return bool(Worker._instances)
22
+
23
+
24
class DaskWorkerFileSystem(AbstractFileSystem):
    """View files accessible to a worker as any other remote file-system

    When instances are run on the worker, uses the real filesystem. When
    run on the client, they call the worker to provide information or data.

    **Warning** this implementation is experimental, and read-only for now.
    """

    def __init__(
        self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
    ):
        super().__init__(**kwargs)
        # Exactly one of `fs` / `target_protocol` must be supplied.
        if not (fs is None) ^ (target_protocol is None):
            raise ValueError(
                "Please provide one of filesystem instance (fs) or"
                " target_protocol, not both"
            )
        self.target_protocol = target_protocol
        self.target_options = target_options
        self.worker = None
        self.client = client
        self.fs = fs
        self._determine_worker()

    @staticmethod
    def _get_kwargs_from_urls(path):
        # A host:port pair embedded in the URL becomes the client address.
        so = infer_storage_options(path)
        if "host" in so and "port" in so:
            return {"client": f"{so['host']}:{so['port']}"}
        return {}

    def _determine_worker(self):
        # On a worker we talk to the real filesystem directly; on the
        # client every call is proxied through dask.delayed to run remotely.
        self.worker = _in_worker()
        if self.worker:
            if self.fs is None:
                self.fs = filesystem(
                    self.target_protocol, **(self.target_options or {})
                )
        else:
            self.client = _get_client(self.client)
            self.rfs = dask.delayed(self)

    def _dispatch(self, method, *args, **kwargs):
        # Run locally when on a worker, otherwise round-trip via the cluster.
        if self.worker:
            return getattr(self.fs, method)(*args, **kwargs)
        return getattr(self.rfs, method)(*args, **kwargs).compute()

    def mkdir(self, *args, **kwargs):
        self._dispatch("mkdir", *args, **kwargs)

    def rm(self, *args, **kwargs):
        self._dispatch("rm", *args, **kwargs)

    def copy(self, *args, **kwargs):
        self._dispatch("copy", *args, **kwargs)

    def mv(self, *args, **kwargs):
        self._dispatch("mv", *args, **kwargs)

    def ls(self, *args, **kwargs):
        return self._dispatch("ls", *args, **kwargs)

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        common = dict(
            path=path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_options=cache_options,
            **kwargs,
        )
        if self.worker:
            return self.fs._open(**common)
        # On the client, hand back a lazy file that fetches ranges remotely.
        return DaskFile(fs=self, **common)

    def fetch_range(self, path, mode, start, end):
        if not self.worker:
            return self.rfs.fetch_range(path, mode, start, end).compute()
        with self._open(path, mode) as f:
            f.seek(start)
            return f.read(end - start)
135
+
136
+
137
class DaskFile(AbstractBufferedFile):
    """Buffered file whose byte ranges are fetched through a dask worker."""

    def __init__(self, mode="rb", **kwargs):
        # The dask-backed filesystem is read-only for now.
        if mode != "rb":
            raise ValueError('Remote dask files can only be opened in "rb" mode')
        super().__init__(**kwargs)

    def _initiate_upload(self):
        """Create remote file/upload"""
        # read-only: nothing to initiate
        pass

    def _upload_chunk(self, final=False):
        # read-only: nothing to upload
        pass

    def _fetch_range(self, start, end):
        """Get the specified set of bytes from remote"""
        return self.fs.fetch_range(self.path, self.mode, start, end)
venv/lib/python3.10/site-packages/fsspec/implementations/data.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ from urllib.parse import unquote
4
+
5
+ from fsspec import AbstractFileSystem
6
+
7
+
8
class DataFileSystem(AbstractFileSystem):
    """A handy decoder for data-URLs

    Example
    -------
    >>> with fsspec.open("data:,Hello%2C%20World%21") as f:
    ...     print(f.read())
    b"Hello, World!"

    See https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs
    """

    protocol = "data"

    def __init__(self, **kwargs):
        """No parameters for this filesystem"""
        super().__init__(**kwargs)

    def cat_file(self, path, start=None, end=None, **kwargs):
        # Everything after the first comma is the payload; the prefix
        # decides between base64 and percent-encoded text.
        header, payload = path.split(",", 1)
        if header.endswith("base64"):
            raw = base64.b64decode(payload)
        else:
            raw = unquote(payload).encode()
        return raw[start:end]

    def info(self, path, **kwargs):
        header, name = path.split(",", 1)
        data = self.cat_file(path)
        # The mediatype sits between "data:" and the first ";" (if any).
        mime = header.split(":", 1)[1].split(";", 1)[0]
        return {"name": name, "size": len(data), "type": "file", "mimetype": mime}

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        if "r" not in mode:
            raise ValueError("Read only filesystem")
        return io.BytesIO(self.cat_file(path))

    @staticmethod
    def encode(data: bytes, mime: str | None = None):
        """Format the given data into data-URL syntax

        This version always base64 encodes, even when the data is ascii/url-safe.
        """
        payload = base64.b64encode(data).decode()
        return f"data:{mime or ''};base64,{payload}"
venv/lib/python3.10/site-packages/fsspec/implementations/dbfs.py ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import urllib
5
+
6
+ import requests
7
+ from requests.adapters import HTTPAdapter, Retry
8
+ from typing_extensions import override
9
+
10
+ from fsspec import AbstractFileSystem
11
+ from fsspec.spec import AbstractBufferedFile
12
+
13
+
14
class DatabricksException(Exception):
    """
    Helper class for exceptions raised in this module.
    """

    def __init__(self, error_code, message, details=None):
        """Create a new DatabricksException"""
        super().__init__(message)
        # machine-readable DBFS error identifier, e.g. "RESOURCE_DOES_NOT_EXIST"
        self.error_code = error_code
        # human-readable description returned by the API
        self.message = message
        # optional extra payload from the API response
        self.details = details
26
+
27
+
28
class DatabricksFileSystem(AbstractFileSystem):
    """
    Get access to the Databricks filesystem implementation over HTTP.
    Can be used inside and outside of a databricks cluster.
    """

    def __init__(self, instance, token, **kwargs):
        """
        Create a new DatabricksFileSystem.

        Parameters
        ----------
        instance: str
            The instance URL of the databricks cluster.
            For example for an Azure databricks cluster, this
            has the form adb-<some-number>.<two digits>.azuredatabricks.net.
        token: str
            Your personal token. Find out more
            here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
        """
        self.instance = instance
        self.token = token
        self.session = requests.Session()
        # Retry transient server-side failures with a small backoff.
        self.retries = Retry(
            total=10,
            backoff_factor=0.05,
            status_forcelist=[408, 429, 500, 502, 503, 504],
        )

        self.session.mount("https://", HTTPAdapter(max_retries=self.retries))
        self.session.headers.update({"Authorization": f"Bearer {self.token}"})

        super().__init__(**kwargs)

    @override
    def _ls_from_cache(self, path) -> list[dict[str, str | int]] | None:
        """Check cache for listing

        Returns listing, if found (may be empty list for a directory that
        exists but contains nothing), None if not in cache.

        Raises FileNotFoundError if the parent is cached but does not
        contain ``path``.
        """
        # A direct cache entry for `path` is always dropped, so listings for
        # `path` itself are re-fetched; only the parent's entry is consulted.
        self.dircache.pop(path.rstrip("/"), None)

        parent = self._parent(path)
        if parent in self.dircache:
            for entry in self.dircache[parent]:
                if entry["name"] == path.rstrip("/"):
                    if entry["type"] != "directory":
                        return [entry]
                    # it is a directory: force a real listing of its contents
                    return []
            raise FileNotFoundError(path)

    def ls(self, path, detail=True, **kwargs):
        """
        List the contents of the given path.

        Parameters
        ----------
        path: str
            Absolute path
        detail: bool
            Return not only the list of filenames,
            but also additional information on file sizes
            and types.
        """
        try:
            out = self._ls_from_cache(path)
        except FileNotFoundError:
            # This happens if the `path`'s parent was cached, but `path` is not
            # there. This suggests that `path` is new since the parent was
            # cached. Attempt to invalidate parent's cache before continuing.
            self.dircache.pop(self._parent(path), None)
            out = None

        if not out:
            try:
                r = self._send_to_api(
                    method="get", endpoint="list", json={"path": path}
                )
            except DatabricksException as e:
                if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                    raise FileNotFoundError(e.message) from e

                raise
            files = r.get("files", [])
            out = [
                {
                    "name": o["path"],
                    "type": "directory" if o["is_dir"] else "file",
                    "size": o["file_size"],
                }
                for o in files
            ]
            self.dircache[path] = out

        if detail:
            return out
        return [o["name"] for o in out]

    def makedirs(self, path, exist_ok=True):
        """
        Create a given absolute path and all of its parents.

        Parameters
        ----------
        path: str
            Absolute path to create
        exist_ok: bool
            If false, checks if the folder
            exists before creating it (and raises an
            Exception if this is the case)
        """
        if not exist_ok:
            try:
                # If the following succeeds, the path is already present
                self._send_to_api(
                    method="get", endpoint="get-status", json={"path": path}
                )
                raise FileExistsError(f"Path {path} already exists")
            except DatabricksException as e:
                # RESOURCE_DOES_NOT_EXIST means the path is free, which is
                # what we want. NOTE: other API errors are deliberately
                # ignored here (best effort); the mkdirs call below will
                # surface any persistent problem.
                if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                    pass

        try:
            self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
        except DatabricksException as e:
            if e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
        self.invalidate_cache(self._parent(path))

    def mkdir(self, path, create_parents=True, **kwargs):
        """
        Create a given absolute path and all of its parents.

        Parameters
        ----------
        path: str
            Absolute path to create
        create_parents: bool
            Whether to create all parents or not.
            "False" is not implemented so far.
        """
        if not create_parents:
            raise NotImplementedError

        self.mkdirs(path, **kwargs)

    def rm(self, path, recursive=False, **kwargs):
        """
        Remove the file or folder at the given absolute path.

        Parameters
        ----------
        path: str
            Absolute path what to remove
        recursive: bool
            Recursively delete all files in a folder.
        """
        try:
            self._send_to_api(
                method="post",
                endpoint="delete",
                json={"path": path, "recursive": recursive},
            )
        except DatabricksException as e:
            if e.error_code == "PARTIAL_DELETE":
                # This is not really an exception, it just means the API
                # timed out before everything was deleted. Retry until the
                # delete completes; the recursive call also invalidates the
                # cache, so return here instead of falling through to the
                # unconditional re-raise below (which would wrongly raise
                # the already-handled PARTIAL_DELETE after a successful
                # retry).
                self.rm(path=path, recursive=recursive)
                return
            elif e.error_code == "IO_ERROR":
                # Using the same exception as the os module would use here
                raise OSError(e.message) from e

            raise
        self.invalidate_cache(self._parent(path))

    def mv(
        self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs
    ):
        """
        Move a source to a destination path.

        A note from the original [databricks API manual]
        (https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move).

        When moving a large number of files the API call will time out after
        approximately 60s, potentially resulting in partially moved data.
        Therefore, for operations that move more than 10k files, we strongly
        discourage using the DBFS REST API.

        Parameters
        ----------
        source_path: str
            From where to move (absolute path)
        destination_path: str
            To where to move (absolute path)
        recursive: bool
            Not implemented to far.
        maxdepth:
            Not implemented to far.
        """
        if recursive:
            raise NotImplementedError
        if maxdepth:
            raise NotImplementedError

        try:
            self._send_to_api(
                method="post",
                endpoint="move",
                json={"source_path": source_path, "destination_path": destination_path},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
        self.invalidate_cache(self._parent(source_path))
        self.invalidate_cache(self._parent(destination_path))

    def _open(self, path, mode="rb", block_size="default", **kwargs):
        """
        Overwrite the base class method to make sure to create a DBFile.
        All arguments are copied from the base method.

        Only the default blocksize is allowed.
        """
        return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)

    def _send_to_api(self, method, endpoint, json):
        """
        Send the given json to the DBFS API
        using a get or post request (specified by the argument `method`).

        Parameters
        ----------
        method: str
            Which http method to use for communication; "get" or "post".
        endpoint: str
            Where to send the request to (last part of the API URL)
        json: dict
            Dictionary of information to send

        Raises
        ------
        DatabricksException
            When the API answers with an error payload.
        """
        if method == "post":
            session_call = self.session.post
        elif method == "get":
            session_call = self.session.get
        else:
            raise ValueError(f"Do not understand method {method}")

        url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)

        r = session_call(url, json=json)

        # The DBFS API will return a json, also in case of an exception.
        # We want to preserve this information as good as possible.
        try:
            r.raise_for_status()
        except requests.HTTPError as e:
            # try to extract json error message
            # if that fails, fall back to the original exception
            try:
                exception_json = e.response.json()
            except Exception:
                raise e from None

            raise DatabricksException(**exception_json) from e

        return r.json()

    def _create_handle(self, path, overwrite=True):
        """
        Internal function to create a handle, which can be used to
        write blocks of a file to DBFS.
        A handle has a unique identifier which needs to be passed
        whenever written during this transaction.
        The handle is active for 10 minutes - after that a new
        write transaction needs to be created.
        Make sure to close the handle after you are finished.

        Parameters
        ----------
        path: str
            Absolute path for this file.
        overwrite: bool
            If a file already exist at this location, either overwrite
            it or raise an exception.
        """
        try:
            r = self._send_to_api(
                method="post",
                endpoint="create",
                json={"path": path, "overwrite": overwrite},
            )
            return r["handle"]
        except DatabricksException as e:
            if e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise

    def _close_handle(self, handle):
        """
        Close a handle, which was opened by :func:`_create_handle`.

        Parameters
        ----------
        handle: str
            Which handle to close.
        """
        try:
            self._send_to_api(method="post", endpoint="close", json={"handle": handle})
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e

            raise

    def _add_data(self, handle, data):
        """
        Upload data to an already opened file handle
        (opened by :func:`_create_handle`).
        The maximal allowed data size is 1MB after
        conversion to base64.
        Remember to close the handle when you are finished.

        Parameters
        ----------
        handle: str
            Which handle to upload data to.
        data: bytes
            Block of data to add to the handle.
        """
        data = base64.b64encode(data).decode()
        try:
            self._send_to_api(
                method="post",
                endpoint="add-block",
                json={"handle": handle, "data": data},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
                raise ValueError(e.message) from e

            raise

    def _get_data(self, path, start, end):
        """
        Download data in bytes from a given absolute path in a block
        from [start, start+length].
        The maximum number of allowed bytes to read is 1MB.

        Parameters
        ----------
        path: str
            Absolute path to download data from
        start: int
            Start position of the block
        end: int
            End position of the block
        """
        try:
            r = self._send_to_api(
                method="get",
                endpoint="read",
                json={"path": path, "offset": start, "length": end - start},
            )
            return base64.b64decode(r["data"])
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
                raise ValueError(e.message) from e

            raise

    def invalidate_cache(self, path=None):
        # Drop one entry, or everything when no path is given.
        if path is None:
            self.dircache.clear()
        else:
            self.dircache.pop(path, None)
        super().invalidate_cache(path)
+ super().invalidate_cache(path)
416
+
417
+
418
class DatabricksFile(AbstractBufferedFile):
    """
    Helper class for files referenced in the DatabricksFileSystem.
    """

    DEFAULT_BLOCK_SIZE = 1 * 2**20  # only allowed block size

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        **kwargs,
    ):
        """
        Create a new instance of the DatabricksFile.

        The blocksize needs to be the default one.
        """
        if block_size is None or block_size == "default":
            block_size = self.DEFAULT_BLOCK_SIZE

        assert block_size == self.DEFAULT_BLOCK_SIZE, (
            f"Only the default block size is allowed, not {block_size}"
        )

        super().__init__(
            fs,
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_type=cache_type,
            cache_options=cache_options or {},
            **kwargs,
        )

    def _initiate_upload(self):
        """Internal function to start a file upload"""
        self.handle = self.fs._create_handle(self.path)

    def _upload_chunk(self, final=False):
        """Internal function to add a chunk of data to a started upload"""
        self.buffer.seek(0)
        data = self.buffer.getvalue()

        # The API limits each add-block call, so slice into block-sized pieces.
        for chunk_start, chunk_end in self._to_sized_blocks(len(data)):
            self.fs._add_data(handle=self.handle, data=data[chunk_start:chunk_end])

        if final:
            self.fs._close_handle(handle=self.handle)
            return True

    def _fetch_range(self, start, end):
        """Internal function to download a block of data"""
        pieces = [
            self.fs._get_data(path=self.path, start=chunk_start, end=chunk_end)
            for chunk_start, chunk_end in self._to_sized_blocks(end - start, start)
        ]
        return b"".join(pieces)

    def _to_sized_blocks(self, length, start=0):
        """Helper function to split a range from 0 to total_length into blocksizes"""
        stop = start + length
        for block_start in range(start, stop, self.blocksize):
            yield block_start, min(stop, block_start + self.blocksize)
venv/lib/python3.10/site-packages/fsspec/implementations/dirfs.py ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .. import filesystem
2
+ from ..asyn import AsyncFileSystem
3
+ from .chained import ChainedFileSystem
4
+
5
+
6
class DirFileSystem(AsyncFileSystem, ChainedFileSystem):
    """Directory prefix filesystem

    The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
    is relative to the `path`. After performing the necessary paths operation it
    delegates everything to the wrapped filesystem.
    """

    protocol = "dir"

    def __init__(
        self,
        path=None,
        fs=None,
        fo=None,
        target_protocol=None,
        target_options=None,
        **storage_options,
    ):
        """
        Parameters
        ----------
        path: str
            Path to the directory.
        fs: AbstractFileSystem
            An instantiated filesystem to wrap.
        target_protocol, target_options:
            if fs is none, construct it from these
        fo: str
            Alternate for path; do not provide both
        """
        super().__init__(**storage_options)
        if fs is None:
            fs = filesystem(protocol=target_protocol, **(target_options or {}))
        path = path or fo

        # The wrapper and the wrapped FS must agree on (a)sync mode.
        if self.asynchronous and not fs.async_impl:
            raise ValueError("can't use asynchronous with non-async fs")
        if fs.async_impl and self.asynchronous != fs.asynchronous:
            raise ValueError("both dirfs and fs should be in the same sync/async mode")

        self.path = fs._strip_protocol(path)
        self.fs = fs

    def _join(self, path):
        """Prepend the directory prefix to a path (str, list or dict keys)."""
        if isinstance(path, str):
            if not self.path:
                return path
            if not path:
                return self.path
            return self.fs.sep.join((self.path, self._strip_protocol(path)))
        if isinstance(path, dict):
            return {self._join(key): value for key, value in path.items()}
        return [self._join(p) for p in path]

    def _relpath(self, path):
        """Strip the directory prefix from a path (str or list)."""
        if isinstance(path, str):
            if not self.path:
                return path
            # We need to account for S3FileSystem returning paths that do not
            # start with a '/'
            if path == self.path or (
                self.path.startswith(self.fs.sep) and path == self.path[1:]
            ):
                return ""
            prefix = self.path + self.fs.sep
            if self.path.startswith(self.fs.sep) and not path.startswith(self.fs.sep):
                prefix = prefix[1:]
            assert path.startswith(prefix)
            return path[len(prefix) :]
        return [self._relpath(p) for p in path]

    # Wrappers below

    @property
    def sep(self):
        return self.fs.sep

    async def set_session(self, *args, **kwargs):
        return await self.fs.set_session(*args, **kwargs)

    async def _rm_file(self, path, **kwargs):
        return await self.fs._rm_file(self._join(path), **kwargs)

    def rm_file(self, path, **kwargs):
        return self.fs.rm_file(self._join(path), **kwargs)

    async def _rm(self, path, *args, **kwargs):
        return await self.fs._rm(self._join(path), *args, **kwargs)

    def rm(self, path, *args, **kwargs):
        return self.fs.rm(self._join(path), *args, **kwargs)

    async def _cp_file(self, path1, path2, **kwargs):
        return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs)

    def cp_file(self, path1, path2, **kwargs):
        return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs)

    async def _copy(self, path1, path2, *args, **kwargs):
        return await self.fs._copy(
            self._join(path1), self._join(path2), *args, **kwargs
        )

    def copy(self, path1, path2, *args, **kwargs):
        return self.fs.copy(self._join(path1), self._join(path2), *args, **kwargs)

    async def _pipe(self, path, *args, **kwargs):
        return await self.fs._pipe(self._join(path), *args, **kwargs)

    def pipe(self, path, *args, **kwargs):
        return self.fs.pipe(self._join(path), *args, **kwargs)

    async def _pipe_file(self, path, *args, **kwargs):
        return await self.fs._pipe_file(self._join(path), *args, **kwargs)

    def pipe_file(self, path, *args, **kwargs):
        return self.fs.pipe_file(self._join(path), *args, **kwargs)

    async def _cat_file(self, path, *args, **kwargs):
        return await self.fs._cat_file(self._join(path), *args, **kwargs)

    def cat_file(self, path, *args, **kwargs):
        return self.fs.cat_file(self._join(path), *args, **kwargs)

    async def _cat(self, path, *args, **kwargs):
        ret = await self.fs._cat(self._join(path), *args, **kwargs)
        # Dict results are keyed by full path; translate back to relative.
        if isinstance(ret, dict):
            return {self._relpath(key): value for key, value in ret.items()}
        return ret

    def cat(self, path, *args, **kwargs):
        ret = self.fs.cat(self._join(path), *args, **kwargs)
        if isinstance(ret, dict):
            return {self._relpath(key): value for key, value in ret.items()}
        return ret

    async def _put_file(self, lpath, rpath, **kwargs):
        return await self.fs._put_file(lpath, self._join(rpath), **kwargs)

    def put_file(self, lpath, rpath, **kwargs):
        return self.fs.put_file(lpath, self._join(rpath), **kwargs)

    async def _put(self, lpath, rpath, *args, **kwargs):
        return await self.fs._put(lpath, self._join(rpath), *args, **kwargs)

    def put(self, lpath, rpath, *args, **kwargs):
        return self.fs.put(lpath, self._join(rpath), *args, **kwargs)

    async def _get_file(self, rpath, lpath, **kwargs):
        return await self.fs._get_file(self._join(rpath), lpath, **kwargs)

    def get_file(self, rpath, lpath, **kwargs):
        return self.fs.get_file(self._join(rpath), lpath, **kwargs)

    async def _get(self, rpath, *args, **kwargs):
        return await self.fs._get(self._join(rpath), *args, **kwargs)

    def get(self, rpath, *args, **kwargs):
        return self.fs.get(self._join(rpath), *args, **kwargs)

    async def _isfile(self, path):
        return await self.fs._isfile(self._join(path))

    def isfile(self, path):
        return self.fs.isfile(self._join(path))

    async def _isdir(self, path):
        return await self.fs._isdir(self._join(path))

    def isdir(self, path):
        return self.fs.isdir(self._join(path))

    async def _size(self, path):
        return await self.fs._size(self._join(path))

    def size(self, path):
        return self.fs.size(self._join(path))

    async def _exists(self, path):
        return await self.fs._exists(self._join(path))

    def exists(self, path):
        return self.fs.exists(self._join(path))

    async def _info(self, path, **kwargs):
        info = dict(await self.fs._info(self._join(path), **kwargs))
        info["name"] = self._relpath(info["name"])
        return info

    def info(self, path, **kwargs):
        info = dict(self.fs.info(self._join(path), **kwargs))
        info["name"] = self._relpath(info["name"])
        return info

    async def _ls(self, path, detail=True, **kwargs):
        ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
        if detail:
            # rewrite each entry's name to be relative to the prefix
            return [{**entry, "name": self._relpath(entry["name"])} for entry in ret]
        return self._relpath(ret)

    def ls(self, path, detail=True, **kwargs):
        ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
        if detail:
            return [{**entry, "name": self._relpath(entry["name"])} for entry in ret]
        return self._relpath(ret)

    async def _walk(self, path, *args, **kwargs):
        async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs):
            yield self._relpath(root), dirs, files

    def walk(self, path, *args, **kwargs):
        for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs):
            yield self._relpath(root), dirs, files

    async def _glob(self, path, **kwargs):
        detail = kwargs.get("detail", False)
        ret = await self.fs._glob(self._join(path), **kwargs)
        if detail:
            return {self._relpath(p): info for p, info in ret.items()}
        return self._relpath(ret)

    def glob(self, path, **kwargs):
        detail = kwargs.get("detail", False)
        ret = self.fs.glob(self._join(path), **kwargs)
        if detail:
            return {self._relpath(p): info for p, info in ret.items()}
        return self._relpath(ret)

    async def _du(self, path, *args, **kwargs):
        total = kwargs.get("total", True)
        ret = await self.fs._du(self._join(path), *args, **kwargs)
        if total:
            return ret
        return {self._relpath(p): size for p, size in ret.items()}

    def du(self, path, *args, **kwargs):
        total = kwargs.get("total", True)
        ret = self.fs.du(self._join(path), *args, **kwargs)
        if total:
            return ret
        return {self._relpath(p): size for p, size in ret.items()}

    async def _find(self, path, *args, **kwargs):
        detail = kwargs.get("detail", False)
        ret = await self.fs._find(self._join(path), *args, **kwargs)
        if detail:
            return {self._relpath(p): info for p, info in ret.items()}
        return self._relpath(ret)

    def find(self, path, *args, **kwargs):
        detail = kwargs.get("detail", False)
        ret = self.fs.find(self._join(path), *args, **kwargs)
        if detail:
            return {self._relpath(p): info for p, info in ret.items()}
        return self._relpath(ret)

    async def _expand_path(self, path, *args, **kwargs):
        expanded = await self.fs._expand_path(self._join(path), *args, **kwargs)
        return self._relpath(expanded)

    def expand_path(self, path, *args, **kwargs):
        return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs))

    async def _mkdir(self, path, *args, **kwargs):
        return await self.fs._mkdir(self._join(path), *args, **kwargs)

    def mkdir(self, path, *args, **kwargs):
        return self.fs.mkdir(self._join(path), *args, **kwargs)

    async def _makedirs(self, path, *args, **kwargs):
        return await self.fs._makedirs(self._join(path), *args, **kwargs)

    def makedirs(self, path, *args, **kwargs):
        return self.fs.makedirs(self._join(path), *args, **kwargs)

    def rmdir(self, path):
        return self.fs.rmdir(self._join(path))

    def mv(self, path1, path2, **kwargs):
        return self.fs.mv(self._join(path1), self._join(path2), **kwargs)

    def touch(self, path, **kwargs):
        return self.fs.touch(self._join(path), **kwargs)

    def created(self, path):
        return self.fs.created(self._join(path))

    def modified(self, path):
        return self.fs.modified(self._join(path))

    def sign(self, path, *args, **kwargs):
        return self.fs.sign(self._join(path), *args, **kwargs)

    def __repr__(self):
        return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"

    def open(self, path, *args, **kwargs):
        return self.fs.open(self._join(path), *args, **kwargs)

    async def open_async(self, path, *args, **kwargs):
        return await self.fs.open_async(self._join(path), *args, **kwargs)
venv/lib/python3.10/site-packages/fsspec/implementations/ftp.py ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import ssl
3
+ import uuid
4
+ from ftplib import FTP, FTP_TLS, Error, error_perm
5
+ from typing import Any
6
+
7
+ from ..spec import AbstractBufferedFile, AbstractFileSystem
8
+ from ..utils import infer_storage_options, isfilelike
9
+
10
# Mapping from the string values accepted by ``FTPFileSystem``'s ``tls``
# argument to the corresponding ``ssl`` protocol constants used when
# establishing an implicit FTPS connection.
SECURITY_PROTOCOL_MAP = {
    "tls": ssl.PROTOCOL_TLS,
    "tlsv1": ssl.PROTOCOL_TLSv1,
    "tlsv1_1": ssl.PROTOCOL_TLSv1_1,
    "tlsv1_2": ssl.PROTOCOL_TLSv1_2,
    "sslv23": ssl.PROTOCOL_SSLv23,
}
17
+
18
+
19
class ImplicitFTPTLS(FTP_TLS):
    """FTP_TLS subclass for *implicit* FTPS.

    Implicit FTPS expects the connection to be TLS-encrypted from the first
    byte, so every socket assigned to this object is wrapped in SSL as soon
    as it is set, instead of waiting for an explicit AUTH TLS handshake.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._sock = None

    @property
    def sock(self):
        """The underlying (always SSL-wrapped) socket."""
        return self._sock

    @sock.setter
    def sock(self, value):
        # Plain sockets are wrapped before being stored; ``None`` and
        # already-wrapped sockets pass through untouched.
        if value is None or isinstance(value, ssl.SSLSocket):
            self._sock = value
        else:
            self._sock = self.context.wrap_socket(value)
40
+
41
+
42
class FTPFileSystem(AbstractFileSystem):
    """A filesystem over classic FTP"""

    root_marker = "/"
    # Instances hold a live, stateful FTP connection, so they must not be
    # shared via the instance cache.
    cachable = False
    protocol = "ftp"

    def __init__(
        self,
        host,
        port=21,
        username=None,
        password=None,
        acct=None,
        block_size=None,
        tempdir=None,
        timeout=30,
        encoding="utf-8",
        tls=False,
        **kwargs,
    ):
        """
        You can use _get_kwargs_from_urls to get some kwargs from
        a reasonable FTP url.

        Authentication will be anonymous if username/password are not
        given.

        Parameters
        ----------
        host: str
            The remote server name/ip to connect to
        port: int
            Port to connect with
        username: str or None
            If authenticating, the user's identifier
        password: str or None
            User's password on the server, if using
        acct: str or None
            Some servers also need an "account" string for auth
        block_size: int or None
            If given, the read-ahead or write buffer size.
        tempdir: str
            Directory on remote to put temporary files when in a transaction
        timeout: int
            Timeout of the ftp connection in seconds
        encoding: str
            Encoding to use for directories and filenames in FTP connection
        tls: bool or str
            Enable FTP-TLS for secure connections:
            - False: Plain FTP (default)
            - True: Explicit TLS (FTPS with AUTH TLS command)
            - "tls": Auto-negotiate highest protocol
            - "tlsv1": TLS v1.0
            - "tlsv1_1": TLS v1.1
            - "tlsv1_2": TLS v1.2
        """
        super().__init__(**kwargs)
        self.host = host
        self.port = port
        self.tempdir = tempdir or "/tmp"
        # (user, passwd, acct) triple passed directly to ftplib login();
        # empty strings mean anonymous login
        self.cred = username or "", password or "", acct or ""
        self.timeout = timeout
        self.encoding = encoding
        if block_size is not None:
            self.blocksize = block_size
        else:
            self.blocksize = 2**16  # 64kB default transfer block
        self.tls = tls
        self._connect()
        # tls=True (bool) means explicit FTPS: after login, switch the data
        # channel to an encrypted connection
        if isinstance(self.tls, bool) and self.tls:
            self.ftp.prot_p()

    def _connect(self):
        """Open the ftplib connection according to the ``tls`` setting."""
        security = None
        if self.tls:
            if isinstance(self.tls, str):
                # string value selects implicit FTPS with a concrete protocol
                ftp_cls = ImplicitFTPTLS
                security = SECURITY_PROTOCOL_MAP.get(
                    self.tls,
                    f"Not supported {self.tls} protocol",
                )
                # .get() returned the error message when the key was unknown
                if isinstance(security, str):
                    raise ValueError(security)
            else:
                # boolean True selects explicit FTPS (AUTH TLS after connect)
                ftp_cls = FTP_TLS
        else:
            ftp_cls = FTP
        self.ftp = ftp_cls(timeout=self.timeout, encoding=self.encoding)
        if security:
            self.ftp.ssl_version = security
        self.ftp.connect(self.host, self.port)
        self.ftp.login(*self.cred)

    @classmethod
    def _strip_protocol(cls, path):
        # normalise to an absolute path without trailing slash
        return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")

    @staticmethod
    def _get_kwargs_from_urls(urlpath):
        # host/port/username/password come from the URL; path and protocol
        # are consumed elsewhere
        out = infer_storage_options(urlpath)
        out.pop("path", None)
        out.pop("protocol", None)
        return out

    def ls(self, path, detail=True, **kwargs):
        """List ``path``, using MLSD with a ``dir``-parsing fallback.

        Results are cached in ``self.dircache``; listing a plain file
        returns its single info entry.
        """
        path = self._strip_protocol(path)
        out = []
        if path not in self.dircache:
            try:
                try:
                    # drop self/parent entries from the MLSD listing
                    out = [
                        (fn, details)
                        for (fn, details) in self.ftp.mlsd(path)
                        if fn not in [".", ".."]
                        and details["type"] not in ["pdir", "cdir"]
                    ]
                except error_perm:
                    out = _mlsd2(self.ftp, path)  # Not platform independent
                for fn, details in out:
                    # convert to fsspec-style info dicts with full paths
                    details["name"] = "/".join(
                        ["" if path == "/" else path, fn.lstrip("/")]
                    )
                    if details["type"] == "file":
                        details["size"] = int(details["size"])
                    else:
                        details["size"] = 0
                    if details["type"] == "dir":
                        details["type"] = "directory"
                self.dircache[path] = out
            except Error:
                # listing failed entirely; path may be a plain file
                try:
                    info = self.info(path)
                    if info["type"] == "file":
                        out = [(path, info)]
                except (Error, IndexError) as exc:
                    raise FileNotFoundError(path) from exc
        files = self.dircache.get(path, out)
        if not detail:
            return sorted([fn for fn, details in files])
        return [details for fn, details in files]

    def info(self, path, **kwargs):
        # implement with direct method
        path = self._strip_protocol(path)
        if path == "/":
            # special case, since this dir has no real entry
            return {"name": "/", "size": 0, "type": "directory"}
        # look the entry up in its parent's listing
        files = self.ls(self._parent(path).lstrip("/"), True)
        try:
            out = next(f for f in files if f["name"] == path)
        except StopIteration as exc:
            raise FileNotFoundError(path) from exc
        return out

    def get_file(self, rpath, lpath, **kwargs):
        """Download ``rpath`` to local ``lpath`` (file path or file-like)."""
        if self.isdir(rpath):
            # directories just need a local counterpart, nothing to transfer
            if not os.path.exists(lpath):
                os.mkdir(lpath)
            return
        if isfilelike(lpath):
            outfile = lpath
        else:
            outfile = open(lpath, "wb")

        def cb(x):
            outfile.write(x)

        self.ftp.retrbinary(
            f"RETR {rpath}",
            blocksize=self.blocksize,
            callback=cb,
        )
        # only close the handle we opened ourselves
        if not isfilelike(lpath):
            outfile.close()

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Return file contents as bytes, optionally from byte ``start``."""
        if end is not None:
            # ranged reads with an end bound go through the buffered-file
            # machinery, which can abort the transfer mid-stream
            return super().cat_file(path, start, end, **kwargs)
        out = []

        def cb(x):
            out.append(x)

        try:
            # REST (rest=start) asks the server to begin at that offset
            self.ftp.retrbinary(
                f"RETR {path}",
                blocksize=self.blocksize,
                rest=start,
                callback=cb,
            )
        except (Error, error_perm) as orig_exc:
            raise FileNotFoundError(path) from orig_exc
        return b"".join(out)

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        cache_options=None,
        autocommit=True,
        **kwargs,
    ):
        """Create an :class:`FTPFile` for buffered read/write access."""
        path = self._strip_protocol(path)
        block_size = block_size or self.blocksize
        return FTPFile(
            self,
            path,
            mode=mode,
            block_size=block_size,
            tempdir=self.tempdir,
            autocommit=autocommit,
            cache_options=cache_options,
        )

    def _rm(self, path):
        # delete a single file and invalidate the parent listing
        path = self._strip_protocol(path)
        self.ftp.delete(path)
        self.invalidate_cache(self._parent(path))

    def rm(self, path, recursive=False, maxdepth=None):
        """Delete files/directories; children are removed before parents."""
        paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
        # reversed() ensures deepest entries go first so directories are
        # empty by the time rmdir is called on them
        for p in reversed(paths):
            if self.isfile(p):
                self.rm_file(p)
            else:
                self.rmdir(p)

    def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None:
        """Create a directory, optionally creating missing parents first."""
        path = self._strip_protocol(path)
        parent = self._parent(path)
        # recurse upwards until an existing ancestor (or root) is reached
        if parent != self.root_marker and not self.exists(parent) and create_parents:
            self.mkdir(parent, create_parents=create_parents)

        self.ftp.mkd(path)
        self.invalidate_cache(self._parent(path))

    def makedirs(self, path: str, exist_ok: bool = False) -> None:
        """Recursively create ``path``; optionally tolerate its existence."""
        path = self._strip_protocol(path)
        if self.exists(path):
            # NB: "/" does not "exist" as it has no directory entry
            if not exist_ok:
                raise FileExistsError(f"{path} exists without `exist_ok`")
            # exists_ok=True -> no-op
        else:
            self.mkdir(path, create_parents=True)

    def rmdir(self, path):
        """Remove an (empty) directory."""
        path = self._strip_protocol(path)
        self.ftp.rmd(path)
        self.invalidate_cache(self._parent(path))

    def mv(self, path1, path2, **kwargs):
        """Rename/move ``path1`` to ``path2`` on the server."""
        path1 = self._strip_protocol(path1)
        path2 = self._strip_protocol(path2)
        self.ftp.rename(path1, path2)
        # both source and destination directory listings are now stale
        self.invalidate_cache(self._parent(path1))
        self.invalidate_cache(self._parent(path2))

    def __del__(self):
        # best-effort close of the control connection on garbage collection
        self.ftp.close()

    def invalidate_cache(self, path=None):
        """Drop cached listings for ``path`` (or everything if ``None``)."""
        if path is None:
            self.dircache.clear()
        else:
            self.dircache.pop(path, None)
        super().invalidate_cache(path)
311
+
312
+
313
class TransferDone(Exception):
    """Internal exception raised from a retrieve callback to abort an FTP
    transfer once enough bytes have been received for the requested range."""
317
+
318
+
319
class FTPFile(AbstractBufferedFile):
    """Interact with a remote FTP file with read/write buffering"""

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        **kwargs,
    ):
        super().__init__(
            fs,
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_type=cache_type,
            cache_options=cache_options,
            **kwargs,
        )
        # Inside a transaction, write to a unique temporary path and only
        # move it to the real target on commit()
        if not autocommit:
            self.target = self.path
            self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())])

    def commit(self):
        """Publish the temporary upload by moving it to its final target."""
        self.fs.mv(self.path, self.target)

    def discard(self):
        """Drop the temporary upload without publishing it."""
        self.fs.rm(self.path)

    def _fetch_range(self, start, end):
        """Get bytes between given byte limits

        Implemented by raising an exception in the fetch callback when the
        number of bytes received reaches the requested amount.

        Will fail if the server does not respect the REST command on
        retrieve requests.
        """
        out = []
        total = [0]  # mutable running byte count shared with the closure

        def callback(x):
            total[0] += len(x)
            if total[0] > end - start:
                # received more than requested: the negative slice index
                # trims the surplus bytes from this chunk
                out.append(x[: (end - start) - total[0]])
                if end < self.size:
                    raise TransferDone
            else:
                out.append(x)

            if total[0] == end - start and end < self.size:
                raise TransferDone

        try:
            # rest=start asks the server to begin the transfer at ``start``
            self.fs.ftp.retrbinary(
                f"RETR {self.path}",
                blocksize=self.blocksize,
                rest=start,
                callback=callback,
            )
        except TransferDone:
            try:
                # stop transfer, we got enough bytes for this block
                self.fs.ftp.abort()
                self.fs.ftp.getmultiline()
            except Error:
                # the connection is in an undefined state after a failed
                # abort, so start a fresh one
                self.fs._connect()

        return b"".join(out)

    def _upload_chunk(self, final=False):
        # send the current write buffer; rest=offset resumes the upload at
        # the position already written by previous chunks
        self.buffer.seek(0)
        self.fs.ftp.storbinary(
            f"STOR {self.path}", self.buffer, blocksize=self.blocksize, rest=self.offset
        )
        return True
400
+
401
+
402
+ def _mlsd2(ftp, path="."):
403
+ """
404
+ Fall back to using `dir` instead of `mlsd` if not supported.
405
+
406
+ This parses a Linux style `ls -l` response to `dir`, but the response may
407
+ be platform dependent.
408
+
409
+ Parameters
410
+ ----------
411
+ ftp: ftplib.FTP
412
+ path: str
413
+ Expects to be given path, but defaults to ".".
414
+ """
415
+ lines = []
416
+ minfo = []
417
+ ftp.dir(path, lines.append)
418
+ for line in lines:
419
+ split_line = line.split()
420
+ if len(split_line) < 9:
421
+ continue
422
+ this = (
423
+ split_line[-1],
424
+ {
425
+ "modify": " ".join(split_line[5:8]),
426
+ "unix.owner": split_line[2],
427
+ "unix.group": split_line[3],
428
+ "unix.mode": split_line[0],
429
+ "size": split_line[4],
430
+ },
431
+ )
432
+ if this[1]["unix.mode"][0] == "d":
433
+ this[1]["type"] = "dir"
434
+ else:
435
+ this[1]["type"] = "file"
436
+ minfo.append(this)
437
+ return minfo
venv/lib/python3.10/site-packages/fsspec/implementations/gist.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+
3
+ from ..spec import AbstractFileSystem
4
+ from ..utils import infer_storage_options
5
+ from .memory import MemoryFile
6
+
7
+
8
class GistFileSystem(AbstractFileSystem):
    """
    Interface to files in a single GitHub Gist.

    Provides read-only access to a gist's files. Gists do not contain
    subdirectories, so file listing is straightforward.

    Parameters
    ----------
    gist_id: str
        The ID of the gist you want to access (the long hex value from the URL).
    filenames: list[str] (optional)
        If provided, only make a file system representing these files, and do not fetch
        the list of all files for this gist.
    sha: str (optional)
        If provided, fetch a particular revision of the gist. If omitted,
        the latest revision is used.
    username: str (optional)
        GitHub username for authentication.
    token: str (optional)
        GitHub personal access token (required if username is given).
    timeout: (float, float) or float, optional
        Connect and read timeouts for requests (default 60s each).
    kwargs: dict
        Stored on `self.request_kw` and passed to `requests.get` when fetching Gist
        metadata or reading ("opening") a file.
    """

    protocol = "gist"
    # URL templates for the latest revision and for a specific revision
    gist_url = "https://api.github.com/gists/{gist_id}"
    gist_rev_url = "https://api.github.com/gists/{gist_id}/{sha}"

    def __init__(
        self,
        gist_id,
        filenames=None,
        sha=None,
        username=None,
        token=None,
        timeout=None,
        **kwargs,
    ):
        super().__init__()
        self.gist_id = gist_id
        self.filenames = filenames
        self.sha = sha  # revision of the gist (optional)
        # a username without a token cannot authenticate
        if username is not None and token is None:
            raise ValueError("User auth requires a token")
        self.username = username
        self.token = token
        self.request_kw = kwargs
        # Default timeouts to 60s connect/read if none provided
        self.timeout = timeout if timeout is not None else (60, 60)

        # We use a single-level "directory" cache, because a gist is essentially flat
        self.dircache[""] = self._fetch_file_list()

    @property
    def kw(self):
        """Auth parameters passed to 'requests' if we have username/token."""
        kw = {
            "headers": {
                "Accept": "application/vnd.github+json",
                "X-GitHub-Api-Version": "2022-11-28",
            }
        }
        # user-supplied kwargs may override/extend the default headers
        kw.update(self.request_kw)
        if self.username and self.token:
            # basic auth when both credentials are given
            kw["auth"] = (self.username, self.token)
        elif self.token:
            # token-only auth uses a bearer header
            kw["headers"]["Authorization"] = f"Bearer {self.token}"
        return kw

    def _fetch_gist_metadata(self):
        """
        Fetch the JSON metadata for this gist (possibly for a specific revision).
        """
        if self.sha:
            url = self.gist_rev_url.format(gist_id=self.gist_id, sha=self.sha)
        else:
            url = self.gist_url.format(gist_id=self.gist_id)

        r = requests.get(url, timeout=self.timeout, **self.kw)
        if r.status_code == 404:
            raise FileNotFoundError(
                f"Gist not found: {self.gist_id}@{self.sha or 'latest'}"
            )
        r.raise_for_status()
        return r.json()

    def _fetch_file_list(self):
        """
        Returns a list of dicts describing each file in the gist. These get stored
        in self.dircache[""].
        """
        meta = self._fetch_gist_metadata()
        if self.filenames:
            # restrict the listing to the explicitly requested files
            available_files = meta.get("files", {})
            files = {}
            for fn in self.filenames:
                if fn not in available_files:
                    raise FileNotFoundError(fn)
                files[fn] = available_files[fn]
        else:
            files = meta.get("files", {})

        out = []
        for fname, finfo in files.items():
            if finfo is None:
                # Occasionally GitHub returns a file entry with null if it was deleted
                continue
            # Build a directory entry
            out.append(
                {
                    "name": fname,  # file's name
                    "type": "file",  # gists have no subdirectories
                    "size": finfo.get("size", 0),  # file size in bytes
                    "raw_url": finfo.get("raw_url"),
                }
            )
        return out

    @classmethod
    def _strip_protocol(cls, path):
        """
        Remove 'gist://' from the path, if present.
        """
        # The default infer_storage_options can handle gist://username:token@id/file
        # or gist://id/file, but let's ensure we handle a normal usage too.
        # We'll just strip the protocol prefix if it exists.
        path = infer_storage_options(path).get("path", path)
        return path.lstrip("/")

    @staticmethod
    def _get_kwargs_from_urls(path):
        """
        Parse 'gist://' style URLs into GistFileSystem constructor kwargs.
        For example:
          gist://:TOKEN@<gist_id>/file.txt
          gist://username:TOKEN@<gist_id>/file.txt
        """
        so = infer_storage_options(path)
        out = {}
        if "username" in so and so["username"]:
            out["username"] = so["username"]
        if "password" in so and so["password"]:
            out["token"] = so["password"]
        if "host" in so and so["host"]:
            # We interpret 'host' as the gist ID
            out["gist_id"] = so["host"]

        # Extract SHA and filename from path
        if "path" in so and so["path"]:
            # take at most the last two path segments: [sha, filename]
            path_parts = so["path"].rsplit("/", 2)[-2:]
            if len(path_parts) == 2:
                if path_parts[0]:  # SHA present
                    out["sha"] = path_parts[0]
                    if path_parts[1]:  # filename also present
                        out["filenames"] = [path_parts[1]]

        return out

    def ls(self, path="", detail=False, **kwargs):
        """
        List files in the gist. Gists are single-level, so any 'path' is basically
        the filename, or empty for all files.

        Parameters
        ----------
        path : str, optional
            The filename to list. If empty, returns all files in the gist.
        detail : bool, default False
            If True, return a list of dicts; if False, return a list of filenames.
        """
        path = self._strip_protocol(path or "")
        # If path is empty, return all
        if path == "":
            results = self.dircache[""]
        else:
            # We want just the single file with this name
            all_files = self.dircache[""]
            results = [f for f in all_files if f["name"] == path]
            if not results:
                raise FileNotFoundError(path)
        if detail:
            return results
        else:
            return sorted(f["name"] for f in results)

    def _open(self, path, mode="rb", block_size=None, **kwargs):
        """
        Read a single file from the gist.
        """
        if mode != "rb":
            raise NotImplementedError("GitHub Gist FS is read-only (no write).")

        path = self._strip_protocol(path)
        # Find the file entry in our dircache
        matches = [f for f in self.dircache[""] if f["name"] == path]
        if not matches:
            raise FileNotFoundError(path)
        finfo = matches[0]

        raw_url = finfo.get("raw_url")
        if not raw_url:
            raise FileNotFoundError(f"No raw_url for file: {path}")

        # download the whole file eagerly and serve it from memory
        r = requests.get(raw_url, timeout=self.timeout, **self.kw)
        if r.status_code == 404:
            raise FileNotFoundError(path)
        r.raise_for_status()
        return MemoryFile(path, None, r.content)

    def cat(self, path, recursive=False, on_error="raise", **kwargs):
        """
        Return {path: contents} for the given file or files. If 'recursive' is True,
        and path is empty, returns all files in the gist.
        """
        paths = self.expand_path(path, recursive=recursive)
        out = {}
        for p in paths:
            try:
                with self.open(p, "rb") as f:
                    out[p] = f.read()
            except FileNotFoundError as e:
                if on_error == "raise":
                    raise e
                elif on_error == "omit":
                    pass  # skip
                else:
                    # on_error="return": store the exception in place of data
                    out[p] = e
        # single, non-expanded path: return the bytes directly, not a dict
        if len(paths) == 1 and paths[0] == path:
            return out[path]
        return out
venv/lib/python3.10/site-packages/fsspec/implementations/git.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import pygit2
4
+
5
+ from fsspec.spec import AbstractFileSystem
6
+
7
+ from .memory import MemoryFile
8
+
9
+
10
class GitFileSystem(AbstractFileSystem):
    """Browse the files of a local git repo at any hash/tag/branch

    (experimental backend)
    """

    root_marker = ""
    cachable = True

    def __init__(self, path=None, fo=None, ref=None, **kwargs):
        """

        Parameters
        ----------
        path: str (optional)
            Local location of the repo (uses current directory if not given).
            May be deprecated in favour of ``fo``. When used with a higher
            level function such as fsspec.open(), may be of the form
            "git://[path-to-repo[:]][ref@]path/to/file" (but the actual
            file path should not contain "@" or ":").
        fo: str (optional)
            Same as ``path``, but passed as part of a chained URL. This one
            takes precedence if both are given.
        ref: str (optional)
            Reference to work with, could be a hash, tag or branch name. Defaults
            to current working tree. Note that ``ls`` and ``open`` also take hash,
            so this becomes the default for those operations
        kwargs
        """
        super().__init__(**kwargs)
        self.repo = pygit2.Repository(fo or path or os.getcwd())
        # NOTE(review): falls back to a literal "master" branch when no ref
        # is given; repos whose default branch is e.g. "main" must pass ref
        self.ref = ref or "master"

    @classmethod
    def _strip_protocol(cls, path):
        # drop the repo-path ("...:") and ref ("...@") prefixes, keeping
        # only the in-repo file path
        path = super()._strip_protocol(path).lstrip("/")
        if ":" in path:
            path = path.split(":", 1)[1]
        if "@" in path:
            path = path.split("@", 1)[1]
        return path.lstrip("/")

    def _path_to_object(self, path, ref):
        """Resolve ``path`` at ``ref`` to a pygit2 Tree or Blob object."""
        comm, ref = self.repo.resolve_refish(ref or self.ref)
        parts = path.split("/")
        tree = comm.tree
        # walk down the tree one path component at a time
        for part in parts:
            if part and isinstance(tree, pygit2.Tree):
                if part not in tree:
                    raise FileNotFoundError(path)
                tree = tree[part]
        return tree

    @staticmethod
    def _get_kwargs_from_urls(path):
        # split "git://repo-path:ref@file" into constructor kwargs
        path = path.removeprefix("git://")
        out = {}
        if ":" in path:
            out["path"], path = path.split(":", 1)
        if "@" in path:
            out["ref"], path = path.split("@", 1)
        return out

    @staticmethod
    def _object_to_info(obj, path=None):
        """Convert a pygit2 object into an fsspec-style info dict."""
        # obj.name and obj.filemode are None for the root tree!
        is_dir = isinstance(obj, pygit2.Tree)
        return {
            "type": "directory" if is_dir else "file",
            "name": (
                "/".join([path, obj.name or ""]).lstrip("/") if path else obj.name
            ),
            "hex": str(obj.id),
            "mode": "100644" if obj.filemode is None else f"{obj.filemode:o}",
            "size": 0 if is_dir else obj.size,
        }

    def ls(self, path, detail=True, ref=None, **kwargs):
        """List a directory (tree) or single file (blob) at ``ref``."""
        tree = self._path_to_object(self._strip_protocol(path), ref)
        # a blob is listed as a single entry; a tree is iterated
        return [
            GitFileSystem._object_to_info(obj, path)
            if detail
            else GitFileSystem._object_to_info(obj, path)["name"]
            for obj in (tree if isinstance(tree, pygit2.Tree) else [tree])
        ]

    def info(self, path, ref=None, **kwargs):
        """Info dict for the object at ``path`` (at ``ref`` or the default)."""
        tree = self._path_to_object(self._strip_protocol(path), ref)
        return GitFileSystem._object_to_info(tree, path)

    def ukey(self, path, ref=None):
        # the git object hash uniquely identifies the content
        return self.info(path, ref=ref)["hex"]

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        ref=None,
        **kwargs,
    ):
        # NOTE(review): mode/block_size/cache arguments are ignored here —
        # the object's bytes are served read-only from memory; ``path`` is
        # assumed to resolve to a blob
        obj = self._path_to_object(path, ref or self.ref)
        return MemoryFile(data=obj.data)
venv/lib/python3.10/site-packages/fsspec/implementations/github.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import re
3
+
4
+ import requests
5
+
6
+ from ..spec import AbstractFileSystem
7
+ from ..utils import infer_storage_options
8
+ from .memory import MemoryFile
9
+
10
+
11
+ class GithubFileSystem(AbstractFileSystem):
12
+ """Interface to files in github
13
+
14
+ An instance of this class provides the files residing within a remote github
15
+ repository. You may specify a point in the repos history, by SHA, branch
16
+ or tag (default is current master).
17
+
18
+ For files less than 1 MB in size, file content is returned directly in a
19
+ MemoryFile. For larger files, or for files tracked by git-lfs, file content
20
+ is returned as an HTTPFile wrapping the ``download_url`` provided by the
21
+ GitHub API.
22
+
23
+ When using fsspec.open, allows URIs of the form:
24
+
25
+ - "github://path/file", in which case you must specify org, repo and
26
+ may specify sha in the extra args
27
+ - 'github://org:repo@/precip/catalog.yml', where the org and repo are
28
+ part of the URI
29
+ - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included
30
+
31
+ ``sha`` can be the full or abbreviated hex of the commit you want to fetch
32
+ from, or a branch or tag name (so long as it doesn't contain special characters
33
+ like "/", "?", which would have to be HTTP-encoded).
34
+
35
+ For authorised access, you must provide username and token, which can be made
36
+ at https://github.com/settings/tokens
37
+ """
38
+
39
+ url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
40
+ content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
41
+ protocol = "github"
42
+ timeout = (60, 60) # connect, read timeouts
43
+
44
+ def __init__(
45
+ self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
46
+ ):
47
+ super().__init__(**kwargs)
48
+ self.org = org
49
+ self.repo = repo
50
+ if (username is None) ^ (token is None):
51
+ raise ValueError("Auth required both username and token")
52
+ self.username = username
53
+ self.token = token
54
+ if timeout is not None:
55
+ self.timeout = timeout
56
+ if sha is None:
57
+ # look up default branch (not necessarily "master")
58
+ u = "https://api.github.com/repos/{org}/{repo}"
59
+ r = requests.get(
60
+ u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
61
+ )
62
+ r.raise_for_status()
63
+ sha = r.json()["default_branch"]
64
+
65
+ self.root = sha
66
+ self.ls("")
67
+ try:
68
+ from .http import HTTPFileSystem
69
+
70
+ self.http_fs = HTTPFileSystem(**kwargs)
71
+ except ImportError:
72
+ self.http_fs = None
73
+
74
+ @property
75
+ def kw(self):
76
+ if self.username:
77
+ return {"auth": (self.username, self.token)}
78
+ return {}
79
+
80
+ @classmethod
81
+ def repos(cls, org_or_user, is_org=True):
82
+ """List repo names for given org or user
83
+
84
+ This may become the top level of the FS
85
+
86
+ Parameters
87
+ ----------
88
+ org_or_user: str
89
+ Name of the github org or user to query
90
+ is_org: bool (default True)
91
+ Whether the name is an organisation (True) or user (False)
92
+
93
+ Returns
94
+ -------
95
+ List of string
96
+ """
97
+ r = requests.get(
98
+ f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos",
99
+ timeout=cls.timeout,
100
+ )
101
+ r.raise_for_status()
102
+ return [repo["name"] for repo in r.json()]
103
+
104
+ @property
105
+ def tags(self):
106
+ """Names of tags in the repo"""
107
+ r = requests.get(
108
+ f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
109
+ timeout=self.timeout,
110
+ **self.kw,
111
+ )
112
+ r.raise_for_status()
113
+ return [t["name"] for t in r.json()]
114
+
115
+ @property
116
+ def branches(self):
117
+ """Names of branches in the repo"""
118
+ r = requests.get(
119
+ f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
120
+ timeout=self.timeout,
121
+ **self.kw,
122
+ )
123
+ r.raise_for_status()
124
+ return [t["name"] for t in r.json()]
125
+
126
+ @property
127
+ def refs(self):
128
+ """Named references, tags and branches"""
129
+ return {"tags": self.tags, "branches": self.branches}
130
+
131
+ def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
132
+ """List files at given path
133
+
134
+ Parameters
135
+ ----------
136
+ path: str
137
+ Location to list, relative to repo root
138
+ detail: bool
139
+ If True, returns list of dicts, one per file; if False, returns
140
+ list of full filenames only
141
+ sha: str (optional)
142
+ List at the given point in the repo history, branch or tag name or commit
143
+ SHA
144
+ _sha: str (optional)
145
+ List this specific tree object (used internally to descend into trees)
146
+ """
147
+ path = self._strip_protocol(path)
148
+ if path == "":
149
+ _sha = sha or self.root
150
+ if _sha is None:
151
+ parts = path.rstrip("/").split("/")
152
+ so_far = ""
153
+ _sha = sha or self.root
154
+ for part in parts:
155
+ out = self.ls(so_far, True, sha=sha, _sha=_sha)
156
+ so_far += "/" + part if so_far else part
157
+ out = [o for o in out if o["name"] == so_far]
158
+ if not out:
159
+ raise FileNotFoundError(path)
160
+ out = out[0]
161
+ if out["type"] == "file":
162
+ if detail:
163
+ return [out]
164
+ else:
165
+ return path
166
+ _sha = out["sha"]
167
+ if path not in self.dircache or sha not in [self.root, None]:
168
+ r = requests.get(
169
+ self.url.format(org=self.org, repo=self.repo, sha=_sha),
170
+ timeout=self.timeout,
171
+ **self.kw,
172
+ )
173
+ if r.status_code == 404:
174
+ raise FileNotFoundError(path)
175
+ r.raise_for_status()
176
+ types = {"blob": "file", "tree": "directory"}
177
+ out = [
178
+ {
179
+ "name": path + "/" + f["path"] if path else f["path"],
180
+ "mode": f["mode"],
181
+ "type": types[f["type"]],
182
+ "size": f.get("size", 0),
183
+ "sha": f["sha"],
184
+ }
185
+ for f in r.json()["tree"]
186
+ if f["type"] in types
187
+ ]
188
+ if sha in [self.root, None]:
189
+ self.dircache[path] = out
190
+ else:
191
+ out = self.dircache[path]
192
+ if detail:
193
+ return out
194
+ else:
195
+ return sorted([f["name"] for f in out])
196
+
197
+ def invalidate_cache(self, path=None):
198
+ self.dircache.clear()
199
+
200
+ @classmethod
201
+ def _strip_protocol(cls, path):
202
+ opts = infer_storage_options(path)
203
+ if "username" not in opts:
204
+ return super()._strip_protocol(path)
205
+ return opts["path"].lstrip("/")
206
+
207
+ @staticmethod
208
+ def _get_kwargs_from_urls(path):
209
+ opts = infer_storage_options(path)
210
+ if "username" not in opts:
211
+ return {}
212
+ out = {"org": opts["username"], "repo": opts["password"]}
213
+ if opts["host"]:
214
+ out["sha"] = opts["host"]
215
+ return out
216
+
217
+ def _open(
218
+ self,
219
+ path,
220
+ mode="rb",
221
+ block_size=None,
222
+ cache_options=None,
223
+ sha=None,
224
+ **kwargs,
225
+ ):
226
+ if mode != "rb":
227
+ raise NotImplementedError
228
+
229
+ # construct a url to hit the GitHub API's repo contents API
230
+ url = self.content_url.format(
231
+ org=self.org, repo=self.repo, path=path, sha=sha or self.root
232
+ )
233
+
234
+ # make a request to this API, and parse the response as JSON
235
+ r = requests.get(url, timeout=self.timeout, **self.kw)
236
+ if r.status_code == 404:
237
+ raise FileNotFoundError(path)
238
+ r.raise_for_status()
239
+ content_json = r.json()
240
+
241
+ # if the response's content key is not empty, try to parse it as base64
242
+ if content_json["content"]:
243
+ content = base64.b64decode(content_json["content"])
244
+
245
+ # as long as the content does not start with the string
246
+ # "version https://git-lfs.github.com/"
247
+ # then it is probably not a git-lfs pointer and we can just return
248
+ # the content directly
249
+ if not content.startswith(b"version https://git-lfs.github.com/"):
250
+ return MemoryFile(None, None, content)
251
+
252
+ # we land here if the content was not present in the first response
253
+ # (regular file over 1MB or git-lfs tracked file)
254
+ # in this case, we get let the HTTPFileSystem handle the download
255
+ if self.http_fs is None:
256
+ raise ImportError(
257
+ "Please install fsspec[http] to access github files >1 MB "
258
+ "or git-lfs tracked files."
259
+ )
260
+ return self.http_fs.open(
261
+ content_json["download_url"],
262
+ mode=mode,
263
+ block_size=block_size,
264
+ cache_options=cache_options,
265
+ **kwargs,
266
+ )
267
+
268
+ def rm(self, path, recursive=False, maxdepth=None, message=None):
269
+ path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
270
+ for p in reversed(path):
271
+ self.rm_file(p, message=message)
272
+
273
+ def rm_file(self, path, message=None, **kwargs):
274
+ """
275
+ Remove a file from a specified branch using a given commit message.
276
+
277
+ Since Github DELETE operation requires a branch name, and we can't reliably
278
+ determine whether the provided SHA refers to a branch, tag, or commit, we
279
+ assume it's a branch. If it's not, the user will encounter an error when
280
+ attempting to retrieve the file SHA or delete the file.
281
+
282
+ Parameters
283
+ ----------
284
+ path: str
285
+ The file's location relative to the repository root.
286
+ message: str, optional
287
+ The commit message for the deletion.
288
+ """
289
+
290
+ if not self.username:
291
+ raise ValueError("Authentication required")
292
+
293
+ path = self._strip_protocol(path)
294
+
295
+ # Attempt to get SHA from cache or Github API
296
+ sha = self._get_sha_from_cache(path)
297
+ if not sha:
298
+ url = self.content_url.format(
299
+ org=self.org, repo=self.repo, path=path.lstrip("/"), sha=self.root
300
+ )
301
+ r = requests.get(url, timeout=self.timeout, **self.kw)
302
+ if r.status_code == 404:
303
+ raise FileNotFoundError(path)
304
+ r.raise_for_status()
305
+ sha = r.json()["sha"]
306
+
307
+ # Delete the file
308
+ delete_url = self.content_url.format(
309
+ org=self.org, repo=self.repo, path=path, sha=self.root
310
+ )
311
+ branch = self.root
312
+ data = {
313
+ "message": message or f"Delete {path}",
314
+ "sha": sha,
315
+ **({"branch": branch} if branch else {}),
316
+ }
317
+
318
+ r = requests.delete(delete_url, json=data, timeout=self.timeout, **self.kw)
319
+ error_message = r.json().get("message", "")
320
+ if re.search(r"Branch .+ not found", error_message):
321
+ error = "Remove only works when the filesystem is initialised from a branch or default (None)"
322
+ raise ValueError(error)
323
+ r.raise_for_status()
324
+
325
+ self.invalidate_cache(path)
326
+
327
+ def _get_sha_from_cache(self, path):
328
+ for entries in self.dircache.values():
329
+ for entry in entries:
330
+ entry_path = entry.get("name")
331
+ if entry_path and entry_path == path and "sha" in entry:
332
+ return entry["sha"]
333
+ return None
venv/lib/python3.10/site-packages/fsspec/implementations/http.py ADDED
@@ -0,0 +1,897 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import io
3
+ import logging
4
+ import re
5
+ import weakref
6
+ from copy import copy
7
+ from urllib.parse import urlparse
8
+
9
+ import aiohttp
10
+ import yarl
11
+
12
+ from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper
13
+ from fsspec.callbacks import DEFAULT_CALLBACK
14
+ from fsspec.exceptions import FSTimeoutError
15
+ from fsspec.spec import AbstractBufferedFile
16
+ from fsspec.utils import (
17
+ DEFAULT_BLOCK_SIZE,
18
+ glob_translate,
19
+ isfilelike,
20
+ nullcontext,
21
+ tokenize,
22
+ )
23
+
24
+ from ..caching import AllBytes
25
+
26
+ # https://stackoverflow.com/a/15926317/3821154
27
+ ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
28
+ ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
29
+ logger = logging.getLogger("fsspec.http")
30
+
31
+
32
+ async def get_client(**kwargs):
33
+ return aiohttp.ClientSession(**kwargs)
34
+
35
+
36
+ class HTTPFileSystem(AsyncFileSystem):
37
+ """
38
+ Simple File-System for fetching data via HTTP(S)
39
+
40
+ ``ls()`` is implemented by loading the parent page and doing a regex
41
+ match on the result. If simple_link=True, anything of the form
42
+ "http(s)://server.com/stuff?thing=other"; otherwise only links within
43
+ HTML href tags will be used.
44
+ """
45
+
46
+ protocol = ("http", "https")
47
+ sep = "/"
48
+
49
+ def __init__(
50
+ self,
51
+ simple_links=True,
52
+ block_size=None,
53
+ same_scheme=True,
54
+ size_policy=None,
55
+ cache_type="bytes",
56
+ cache_options=None,
57
+ asynchronous=False,
58
+ loop=None,
59
+ client_kwargs=None,
60
+ get_client=get_client,
61
+ encoded=False,
62
+ **storage_options,
63
+ ):
64
+ """
65
+ NB: if this is called async, you must await set_client
66
+
67
+ Parameters
68
+ ----------
69
+ block_size: int
70
+ Blocks to read bytes; if 0, will default to raw requests file-like
71
+ objects instead of HTTPFile instances
72
+ simple_links: bool
73
+ If True, will consider both HTML <a> tags and anything that looks
74
+ like a URL; if False, will consider only the former.
75
+ same_scheme: True
76
+ When doing ls/glob, if this is True, only consider paths that have
77
+ http/https matching the input URLs.
78
+ size_policy: this argument is deprecated
79
+ client_kwargs: dict
80
+ Passed to aiohttp.ClientSession, see
81
+ https://docs.aiohttp.org/en/stable/client_reference.html
82
+ For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
83
+ get_client: Callable[..., aiohttp.ClientSession]
84
+ A callable, which takes keyword arguments and constructs
85
+ an aiohttp.ClientSession. Its state will be managed by
86
+ the HTTPFileSystem class.
87
+ storage_options: key-value
88
+ Any other parameters passed on to requests
89
+ cache_type, cache_options: defaults used in open()
90
+ """
91
+ super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
92
+ self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
93
+ self.simple_links = simple_links
94
+ self.same_schema = same_scheme
95
+ self.cache_type = cache_type
96
+ self.cache_options = cache_options
97
+ self.client_kwargs = client_kwargs or {}
98
+ self.get_client = get_client
99
+ self.encoded = encoded
100
+ self.kwargs = storage_options
101
+ self._session = None
102
+
103
+ # Clean caching-related parameters from `storage_options`
104
+ # before propagating them as `request_options` through `self.kwargs`.
105
+ # TODO: Maybe rename `self.kwargs` to `self.request_options` to make
106
+ # it clearer.
107
+ request_options = copy(storage_options)
108
+ self.use_listings_cache = request_options.pop("use_listings_cache", False)
109
+ request_options.pop("listings_expiry_time", None)
110
+ request_options.pop("max_paths", None)
111
+ request_options.pop("skip_instance_cache", None)
112
+ self.kwargs = request_options
113
+
114
+ @property
115
+ def fsid(self):
116
+ return "http"
117
+
118
+ def encode_url(self, url):
119
+ return yarl.URL(url, encoded=self.encoded)
120
+
121
+ @staticmethod
122
+ def close_session(loop, session):
123
+ if loop is not None and loop.is_running():
124
+ try:
125
+ sync(loop, session.close, timeout=0.1)
126
+ return
127
+ except (TimeoutError, FSTimeoutError, NotImplementedError):
128
+ pass
129
+ connector = getattr(session, "_connector", None)
130
+ if connector is not None:
131
+ # close after loop is dead
132
+ connector._close()
133
+
134
+ async def set_session(self):
135
+ if self._session is None:
136
+ self._session = await self.get_client(loop=self.loop, **self.client_kwargs)
137
+ if not self.asynchronous:
138
+ weakref.finalize(self, self.close_session, self.loop, self._session)
139
+ return self._session
140
+
141
+ @classmethod
142
+ def _strip_protocol(cls, path):
143
+ """For HTTP, we always want to keep the full URL"""
144
+ return path
145
+
146
+ @classmethod
147
+ def _parent(cls, path):
148
+ # override, since _strip_protocol is different for URLs
149
+ par = super()._parent(path)
150
+ if len(par) > 7: # "http://..."
151
+ return par
152
+ return ""
153
+
154
+ async def _ls_real(self, url, detail=True, **kwargs):
155
+ # ignoring URL-encoded arguments
156
+ kw = self.kwargs.copy()
157
+ kw.update(kwargs)
158
+ logger.debug(url)
159
+ session = await self.set_session()
160
+ async with session.get(self.encode_url(url), **self.kwargs) as r:
161
+ self._raise_not_found_for_status(r, url)
162
+
163
+ if "Content-Type" in r.headers:
164
+ mimetype = r.headers["Content-Type"].partition(";")[0]
165
+ else:
166
+ mimetype = None
167
+
168
+ if mimetype in ("text/html", None):
169
+ try:
170
+ text = await r.text(errors="ignore")
171
+ if self.simple_links:
172
+ links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
173
+ else:
174
+ links = [u[2] for u in ex.findall(text)]
175
+ except UnicodeDecodeError:
176
+ links = [] # binary, not HTML
177
+ else:
178
+ links = []
179
+
180
+ out = set()
181
+ parts = urlparse(url)
182
+ for l in links:
183
+ if isinstance(l, tuple):
184
+ l = l[1]
185
+ if l.startswith("/") and len(l) > 1:
186
+ # absolute URL on this server
187
+ l = f"{parts.scheme}://{parts.netloc}{l}"
188
+ if l.startswith("http"):
189
+ if self.same_schema and l.startswith(url.rstrip("/") + "/"):
190
+ out.add(l)
191
+ elif l.replace("https", "http").startswith(
192
+ url.replace("https", "http").rstrip("/") + "/"
193
+ ):
194
+ # allowed to cross http <-> https
195
+ out.add(l)
196
+ else:
197
+ if l not in ["..", "../"]:
198
+ # Ignore FTP-like "parent"
199
+ out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
200
+ if not out and url.endswith("/"):
201
+ out = await self._ls_real(url.rstrip("/"), detail=False)
202
+ if detail:
203
+ return [
204
+ {
205
+ "name": u,
206
+ "size": None,
207
+ "type": "directory" if u.endswith("/") else "file",
208
+ }
209
+ for u in out
210
+ ]
211
+ else:
212
+ return sorted(out)
213
+
214
+ async def _ls(self, url, detail=True, **kwargs):
215
+ if self.use_listings_cache and url in self.dircache:
216
+ out = self.dircache[url]
217
+ else:
218
+ out = await self._ls_real(url, detail=detail, **kwargs)
219
+ self.dircache[url] = out
220
+ return out
221
+
222
+ ls = sync_wrapper(_ls)
223
+
224
+ def _raise_not_found_for_status(self, response, url):
225
+ """
226
+ Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
227
+ """
228
+ if response.status == 404:
229
+ raise FileNotFoundError(url)
230
+ response.raise_for_status()
231
+
232
+ async def _cat_file(self, url, start=None, end=None, **kwargs):
233
+ kw = self.kwargs.copy()
234
+ kw.update(kwargs)
235
+ logger.debug(url)
236
+
237
+ if start is not None or end is not None:
238
+ if start == end:
239
+ return b""
240
+ headers = kw.pop("headers", {}).copy()
241
+
242
+ headers["Range"] = await self._process_limits(url, start, end)
243
+ kw["headers"] = headers
244
+ session = await self.set_session()
245
+ async with session.get(self.encode_url(url), **kw) as r:
246
+ out = await r.read()
247
+ self._raise_not_found_for_status(r, url)
248
+ return out
249
+
250
+ async def _get_file(
251
+ self, rpath, lpath, chunk_size=5 * 2**20, callback=DEFAULT_CALLBACK, **kwargs
252
+ ):
253
+ kw = self.kwargs.copy()
254
+ kw.update(kwargs)
255
+ logger.debug(rpath)
256
+ session = await self.set_session()
257
+ async with session.get(self.encode_url(rpath), **kw) as r:
258
+ try:
259
+ size = int(r.headers["content-length"])
260
+ except (ValueError, KeyError):
261
+ size = None
262
+
263
+ callback.set_size(size)
264
+ self._raise_not_found_for_status(r, rpath)
265
+ if isfilelike(lpath):
266
+ outfile = lpath
267
+ else:
268
+ outfile = open(lpath, "wb") # noqa: ASYNC230
269
+
270
+ try:
271
+ chunk = True
272
+ while chunk:
273
+ chunk = await r.content.read(chunk_size)
274
+ outfile.write(chunk)
275
+ callback.relative_update(len(chunk))
276
+ finally:
277
+ if not isfilelike(lpath):
278
+ outfile.close()
279
+
280
+ async def _put_file(
281
+ self,
282
+ lpath,
283
+ rpath,
284
+ chunk_size=5 * 2**20,
285
+ callback=DEFAULT_CALLBACK,
286
+ method="post",
287
+ mode="overwrite",
288
+ **kwargs,
289
+ ):
290
+ if mode != "overwrite":
291
+ raise NotImplementedError("Exclusive write")
292
+
293
+ async def gen_chunks():
294
+ # Support passing arbitrary file-like objects
295
+ # and use them instead of streams.
296
+ if isinstance(lpath, io.IOBase):
297
+ context = nullcontext(lpath)
298
+ use_seek = False # might not support seeking
299
+ else:
300
+ context = open(lpath, "rb") # noqa: ASYNC230
301
+ use_seek = True
302
+
303
+ with context as f:
304
+ if use_seek:
305
+ callback.set_size(f.seek(0, 2))
306
+ f.seek(0)
307
+ else:
308
+ callback.set_size(getattr(f, "size", None))
309
+
310
+ chunk = f.read(chunk_size)
311
+ while chunk:
312
+ yield chunk
313
+ callback.relative_update(len(chunk))
314
+ chunk = f.read(chunk_size)
315
+
316
+ kw = self.kwargs.copy()
317
+ kw.update(kwargs)
318
+ session = await self.set_session()
319
+
320
+ method = method.lower()
321
+ if method not in ("post", "put"):
322
+ raise ValueError(
323
+ f"method has to be either 'post' or 'put', not: {method!r}"
324
+ )
325
+
326
+ meth = getattr(session, method)
327
+ async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
328
+ self._raise_not_found_for_status(resp, rpath)
329
+
330
+ async def _exists(self, path, strict=False, **kwargs):
331
+ kw = self.kwargs.copy()
332
+ kw.update(kwargs)
333
+ try:
334
+ logger.debug(path)
335
+ session = await self.set_session()
336
+ r = await session.get(self.encode_url(path), **kw)
337
+ async with r:
338
+ if strict:
339
+ self._raise_not_found_for_status(r, path)
340
+ return r.status < 400
341
+ except FileNotFoundError:
342
+ return False
343
+ except aiohttp.ClientError:
344
+ if strict:
345
+ raise
346
+ return False
347
+
348
+ async def _isfile(self, path, **kwargs):
349
+ return await self._exists(path, **kwargs)
350
+
351
+ def _open(
352
+ self,
353
+ path,
354
+ mode="rb",
355
+ block_size=None,
356
+ autocommit=None, # XXX: This differs from the base class.
357
+ cache_type=None,
358
+ cache_options=None,
359
+ size=None,
360
+ **kwargs,
361
+ ):
362
+ """Make a file-like object
363
+
364
+ Parameters
365
+ ----------
366
+ path: str
367
+ Full URL with protocol
368
+ mode: string
369
+ must be "rb"
370
+ block_size: int or None
371
+ Bytes to download in one request; use instance value if None. If
372
+ zero, will return a streaming Requests file-like instance.
373
+ kwargs: key-value
374
+ Any other parameters, passed to requests calls
375
+ """
376
+ if mode != "rb":
377
+ raise NotImplementedError
378
+ block_size = block_size if block_size is not None else self.block_size
379
+ kw = self.kwargs.copy()
380
+ kw["asynchronous"] = self.asynchronous
381
+ kw.update(kwargs)
382
+ info = {}
383
+ size = size or info.update(self.info(path, **kwargs)) or info["size"]
384
+ session = sync(self.loop, self.set_session)
385
+ if block_size and size and info.get("partial", True):
386
+ return HTTPFile(
387
+ self,
388
+ path,
389
+ session=session,
390
+ block_size=block_size,
391
+ mode=mode,
392
+ size=size,
393
+ cache_type=cache_type or self.cache_type,
394
+ cache_options=cache_options or self.cache_options,
395
+ loop=self.loop,
396
+ **kw,
397
+ )
398
+ else:
399
+ return HTTPStreamFile(
400
+ self,
401
+ path,
402
+ mode=mode,
403
+ loop=self.loop,
404
+ session=session,
405
+ **kw,
406
+ )
407
+
408
+ async def open_async(self, path, mode="rb", size=None, **kwargs):
409
+ session = await self.set_session()
410
+ if size is None:
411
+ try:
412
+ size = (await self._info(path, **kwargs))["size"]
413
+ except FileNotFoundError:
414
+ pass
415
+ return AsyncStreamFile(
416
+ self,
417
+ path,
418
+ loop=self.loop,
419
+ session=session,
420
+ size=size,
421
+ **kwargs,
422
+ )
423
+
424
+ def ukey(self, url):
425
+ """Unique identifier; assume HTTP files are static, unchanging"""
426
+ return tokenize(url, self.kwargs, self.protocol)
427
+
428
+ async def _info(self, url, **kwargs):
429
+ """Get info of URL
430
+
431
+ Tries to access location via HEAD, and then GET methods, but does
432
+ not fetch the data.
433
+
434
+ It is possible that the server does not supply any size information, in
435
+ which case size will be given as None (and certain operations on the
436
+ corresponding file will not work).
437
+ """
438
+ info = {}
439
+ session = await self.set_session()
440
+
441
+ for policy in ["head", "get"]:
442
+ try:
443
+ info.update(
444
+ await _file_info(
445
+ self.encode_url(url),
446
+ size_policy=policy,
447
+ session=session,
448
+ **self.kwargs,
449
+ **kwargs,
450
+ )
451
+ )
452
+ if info.get("size") is not None:
453
+ break
454
+ except Exception as exc:
455
+ if policy == "get":
456
+ # If get failed, then raise a FileNotFoundError
457
+ raise FileNotFoundError(url) from exc
458
+ logger.debug("", exc_info=exc)
459
+
460
+ return {"name": url, "size": None, **info, "type": "file"}
461
+
462
+ async def _glob(self, path, maxdepth=None, **kwargs):
463
+ """
464
+ Find files by glob-matching.
465
+
466
+ This implementation is idntical to the one in AbstractFileSystem,
467
+ but "?" is not considered as a character for globbing, because it is
468
+ so common in URLs, often identifying the "query" part.
469
+ """
470
+ if maxdepth is not None and maxdepth < 1:
471
+ raise ValueError("maxdepth must be at least 1")
472
+ import re
473
+
474
+ ends_with_slash = path.endswith("/") # _strip_protocol strips trailing slash
475
+ path = self._strip_protocol(path)
476
+ append_slash_to_dirname = ends_with_slash or path.endswith(("/**", "/*"))
477
+ idx_star = path.find("*") if path.find("*") >= 0 else len(path)
478
+ idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
479
+
480
+ min_idx = min(idx_star, idx_brace)
481
+
482
+ detail = kwargs.pop("detail", False)
483
+
484
+ if not has_magic(path):
485
+ if await self._exists(path, **kwargs):
486
+ if not detail:
487
+ return [path]
488
+ else:
489
+ return {path: await self._info(path, **kwargs)}
490
+ else:
491
+ if not detail:
492
+ return [] # glob of non-existent returns empty
493
+ else:
494
+ return {}
495
+ elif "/" in path[:min_idx]:
496
+ min_idx = path[:min_idx].rindex("/")
497
+ root = path[: min_idx + 1]
498
+ depth = path[min_idx + 1 :].count("/") + 1
499
+ else:
500
+ root = ""
501
+ depth = path[min_idx + 1 :].count("/") + 1
502
+
503
+ if "**" in path:
504
+ if maxdepth is not None:
505
+ idx_double_stars = path.find("**")
506
+ depth_double_stars = path[idx_double_stars:].count("/") + 1
507
+ depth = depth - depth_double_stars + maxdepth
508
+ else:
509
+ depth = None
510
+
511
+ allpaths = await self._find(
512
+ root, maxdepth=depth, withdirs=True, detail=True, **kwargs
513
+ )
514
+
515
+ pattern = glob_translate(path + ("/" if ends_with_slash else ""))
516
+ pattern = re.compile(pattern)
517
+
518
+ out = {
519
+ (
520
+ p.rstrip("/")
521
+ if not append_slash_to_dirname
522
+ and info["type"] == "directory"
523
+ and p.endswith("/")
524
+ else p
525
+ ): info
526
+ for p, info in sorted(allpaths.items())
527
+ if pattern.match(p.rstrip("/"))
528
+ }
529
+
530
+ if detail:
531
+ return out
532
+ else:
533
+ return list(out)
534
+
535
+ async def _isdir(self, path):
536
+ # override, since all URLs are (also) files
537
+ try:
538
+ return bool(await self._ls(path))
539
+ except (FileNotFoundError, ValueError):
540
+ return False
541
+
542
+ async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
543
+ """
544
+ Write bytes to a remote file over HTTP.
545
+
546
+ Parameters
547
+ ----------
548
+ path : str
549
+ Target URL where the data should be written
550
+ value : bytes
551
+ Data to be written
552
+ mode : str
553
+ How to write to the file - 'overwrite' or 'append'
554
+ **kwargs : dict
555
+ Additional parameters to pass to the HTTP request
556
+ """
557
+ url = self._strip_protocol(path)
558
+ headers = kwargs.pop("headers", {})
559
+ headers["Content-Length"] = str(len(value))
560
+
561
+ session = await self.set_session()
562
+
563
+ async with session.put(url, data=value, headers=headers, **kwargs) as r:
564
+ r.raise_for_status()
565
+
566
+
567
+ class HTTPFile(AbstractBufferedFile):
568
+ """
569
+ A file-like object pointing to a remote HTTP(S) resource
570
+
571
+ Supports only reading, with read-ahead of a predetermined block-size.
572
+
573
+ In the case that the server does not supply the filesize, only reading of
574
+ the complete file in one go is supported.
575
+
576
+ Parameters
577
+ ----------
578
+ url: str
579
+ Full URL of the remote resource, including the protocol
580
+ session: aiohttp.ClientSession or None
581
+ All calls will be made within this session, to avoid restarting
582
+ connections where the server allows this
583
+ block_size: int or None
584
+ The amount of read-ahead to do, in bytes. Default is 5MB, or the value
585
+ configured for the FileSystem creating this file
586
+ size: None or int
587
+ If given, this is the size of the file in bytes, and we don't attempt
588
+ to call the server to find the value.
589
+ kwargs: all other key-values are passed to requests calls.
590
+ """
591
+
592
+ def __init__(
593
+ self,
594
+ fs,
595
+ url,
596
+ session=None,
597
+ block_size=None,
598
+ mode="rb",
599
+ cache_type="bytes",
600
+ cache_options=None,
601
+ size=None,
602
+ loop=None,
603
+ asynchronous=False,
604
+ **kwargs,
605
+ ):
606
+ if mode != "rb":
607
+ raise NotImplementedError("File mode not supported")
608
+ self.asynchronous = asynchronous
609
+ self.loop = loop
610
+ self.url = url
611
+ self.session = session
612
+ self.details = {"name": url, "size": size, "type": "file"}
613
+ super().__init__(
614
+ fs=fs,
615
+ path=url,
616
+ mode=mode,
617
+ block_size=block_size,
618
+ cache_type=cache_type,
619
+ cache_options=cache_options,
620
+ **kwargs,
621
+ )
622
+
623
+ def read(self, length=-1):
624
+ """Read bytes from file
625
+
626
+ Parameters
627
+ ----------
628
+ length: int
629
+ Read up to this many bytes. If negative, read all content to end of
630
+ file. If the server has not supplied the filesize, attempting to
631
+ read only part of the data will raise a ValueError.
632
+ """
633
+ if (
634
+ (length < 0 and self.loc == 0) # explicit read all
635
+ # but not when the size is known and fits into a block anyways
636
+ and not (self.size is not None and self.size <= self.blocksize)
637
+ ):
638
+ self._fetch_all()
639
+ if self.size is None:
640
+ if length < 0:
641
+ self._fetch_all()
642
+ else:
643
+ length = min(self.size - self.loc, length)
644
+ return super().read(length)
645
+
646
+ async def async_fetch_all(self):
647
+ """Read whole file in one shot, without caching
648
+
649
+ This is only called when position is still at zero,
650
+ and read() is called without a byte-count.
651
+ """
652
+ logger.debug(f"Fetch all for {self}")
653
+ if not isinstance(self.cache, AllBytes):
654
+ r = await self.session.get(self.fs.encode_url(self.url), **self.kwargs)
655
+ async with r:
656
+ r.raise_for_status()
657
+ out = await r.read()
658
+ self.cache = AllBytes(
659
+ size=len(out), fetcher=None, blocksize=None, data=out
660
+ )
661
+ self.size = len(out)
662
+
663
+ _fetch_all = sync_wrapper(async_fetch_all)
664
+
665
+ def _parse_content_range(self, headers):
666
+ """Parse the Content-Range header"""
667
+ s = headers.get("Content-Range", "")
668
+ m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
669
+ if not m:
670
+ return None, None, None
671
+
672
+ if m[1] == "*":
673
+ start = end = None
674
+ else:
675
+ start, end = [int(x) for x in m[1].split("-")]
676
+ total = None if m[2] == "*" else int(m[2])
677
+ return start, end, total
678
+
679
+ async def async_fetch_range(self, start, end):
680
+ """Download a block of data
681
+
682
+ The expectation is that the server returns only the requested bytes,
683
+ with HTTP code 206. If this is not the case, we first check the headers,
684
+ and then stream the output - if the data size is bigger than we
685
+ requested, an exception is raised.
686
+ """
687
+ logger.debug(f"Fetch range for {self}: {start}-{end}")
688
+ kwargs = self.kwargs.copy()
689
+ headers = kwargs.pop("headers", {}).copy()
690
+ headers["Range"] = f"bytes={start}-{end - 1}"
691
+ logger.debug(f"{self.url} : {headers['Range']}")
692
+ r = await self.session.get(
693
+ self.fs.encode_url(self.url), headers=headers, **kwargs
694
+ )
695
+ async with r:
696
+ if r.status == 416:
697
+ # range request outside file
698
+ return b""
699
+ r.raise_for_status()
700
+
701
+ # If the server has handled the range request, it should reply
702
+ # with status 206 (partial content). But we'll guess that a suitable
703
+ # Content-Range header or a Content-Length no more than the
704
+ # requested range also mean we have got the desired range.
705
+ response_is_range = (
706
+ r.status == 206
707
+ or self._parse_content_range(r.headers)[0] == start
708
+ or int(r.headers.get("Content-Length", end + 1)) <= end - start
709
+ )
710
+
711
+ if response_is_range:
712
+ # partial content, as expected
713
+ out = await r.read()
714
+ elif start > 0:
715
+ raise ValueError(
716
+ "The HTTP server doesn't appear to support range requests. "
717
+ "Only reading this file from the beginning is supported. "
718
+ "Open with block_size=0 for a streaming file interface."
719
+ )
720
+ else:
721
+ # Response is not a range, but we want the start of the file,
722
+ # so we can read the required amount anyway.
723
+ cl = 0
724
+ out = []
725
+ while True:
726
+ chunk = await r.content.read(2**20)
727
+ # data size unknown, let's read until we have enough
728
+ if chunk:
729
+ out.append(chunk)
730
+ cl += len(chunk)
731
+ if cl > end - start:
732
+ break
733
+ else:
734
+ break
735
+ out = b"".join(out)[: end - start]
736
+ return out
737
+
738
+ _fetch_range = sync_wrapper(async_fetch_range)
739
+
740
+
741
+ magic_check = re.compile("([*[])")
742
+
743
+
744
+ def has_magic(s):
745
+ match = magic_check.search(s)
746
+ return match is not None
747
+
748
+
749
+ class HTTPStreamFile(AbstractBufferedFile):
750
+ def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs):
751
+ self.asynchronous = kwargs.pop("asynchronous", False)
752
+ self.url = url
753
+ self.loop = loop
754
+ self.session = session
755
+ if mode != "rb":
756
+ raise ValueError
757
+ self.details = {"name": url, "size": None}
758
+ super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs)
759
+
760
+ async def cor():
761
+ r = await self.session.get(self.fs.encode_url(url), **kwargs).__aenter__()
762
+ self.fs._raise_not_found_for_status(r, url)
763
+ return r
764
+
765
+ self.r = sync(self.loop, cor)
766
+ self.loop = fs.loop
767
+
768
+ def seek(self, loc, whence=0):
769
+ if loc == 0 and whence == 1:
770
+ return
771
+ if loc == self.loc and whence == 0:
772
+ return
773
+ raise ValueError("Cannot seek streaming HTTP file")
774
+
775
+ async def _read(self, num=-1):
776
+ out = await self.r.content.read(num)
777
+ self.loc += len(out)
778
+ return out
779
+
780
+ read = sync_wrapper(_read)
781
+
782
+ async def _close(self):
783
+ self.r.close()
784
+
785
+ def close(self):
786
+ asyncio.run_coroutine_threadsafe(self._close(), self.loop)
787
+ super().close()
788
+
789
+
790
+ class AsyncStreamFile(AbstractAsyncStreamedFile):
791
+ def __init__(
792
+ self, fs, url, mode="rb", loop=None, session=None, size=None, **kwargs
793
+ ):
794
+ self.url = url
795
+ self.session = session
796
+ self.r = None
797
+ if mode != "rb":
798
+ raise ValueError
799
+ self.details = {"name": url, "size": None}
800
+ self.kwargs = kwargs
801
+ super().__init__(fs=fs, path=url, mode=mode, cache_type="none")
802
+ self.size = size
803
+
804
+ async def read(self, num=-1):
805
+ if self.r is None:
806
+ r = await self.session.get(
807
+ self.fs.encode_url(self.url), **self.kwargs
808
+ ).__aenter__()
809
+ self.fs._raise_not_found_for_status(r, self.url)
810
+ self.r = r
811
+ out = await self.r.content.read(num)
812
+ self.loc += len(out)
813
+ return out
814
+
815
+ async def close(self):
816
+ if self.r is not None:
817
+ self.r.close()
818
+ self.r = None
819
+ await super().close()
820
+
821
+
822
+ async def get_range(session, url, start, end, file=None, **kwargs):
823
+ # explicit get a range when we know it must be safe
824
+ kwargs = kwargs.copy()
825
+ headers = kwargs.pop("headers", {}).copy()
826
+ headers["Range"] = f"bytes={start}-{end - 1}"
827
+ r = await session.get(url, headers=headers, **kwargs)
828
+ r.raise_for_status()
829
+ async with r:
830
+ out = await r.read()
831
+ if file:
832
+ with open(file, "r+b") as f: # noqa: ASYNC230
833
+ f.seek(start)
834
+ f.write(out)
835
+ else:
836
+ return out
837
+
838
+
839
+ async def _file_info(url, session, size_policy="head", **kwargs):
840
+ """Call HEAD on the server to get details about the file (size/checksum etc.)
841
+
842
+ Default operation is to explicitly allow redirects and use encoding
843
+ 'identity' (no compression) to get the true size of the target.
844
+ """
845
+ logger.debug("Retrieve file size for %s", url)
846
+ kwargs = kwargs.copy()
847
+ ar = kwargs.pop("allow_redirects", True)
848
+ head = kwargs.get("headers", {}).copy()
849
+ head["Accept-Encoding"] = "identity"
850
+ kwargs["headers"] = head
851
+
852
+ info = {}
853
+ if size_policy == "head":
854
+ r = await session.head(url, allow_redirects=ar, **kwargs)
855
+ elif size_policy == "get":
856
+ r = await session.get(url, allow_redirects=ar, **kwargs)
857
+ else:
858
+ raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
859
+ async with r:
860
+ r.raise_for_status()
861
+
862
+ if "Content-Length" in r.headers:
863
+ # Some servers may choose to ignore Accept-Encoding and return
864
+ # compressed content, in which case the returned size is unreliable.
865
+ if "Content-Encoding" not in r.headers or r.headers["Content-Encoding"] in [
866
+ "identity",
867
+ "",
868
+ ]:
869
+ info["size"] = int(r.headers["Content-Length"])
870
+ elif "Content-Range" in r.headers:
871
+ info["size"] = int(r.headers["Content-Range"].split("/")[1])
872
+
873
+ if "Content-Type" in r.headers:
874
+ info["mimetype"] = r.headers["Content-Type"].partition(";")[0]
875
+
876
+ if r.headers.get("Accept-Ranges") == "none":
877
+ # Some servers may explicitly discourage partial content requests, but
878
+ # the lack of "Accept-Ranges" does not always indicate they would fail
879
+ info["partial"] = False
880
+
881
+ info["url"] = str(r.url)
882
+
883
+ for checksum_field in ["ETag", "Content-MD5", "Digest", "Last-Modified"]:
884
+ if r.headers.get(checksum_field):
885
+ info[checksum_field] = r.headers[checksum_field]
886
+
887
+ return info
888
+
889
+
890
+ async def _file_size(url, session=None, *args, **kwargs):
891
+ if session is None:
892
+ session = await get_client()
893
+ info = await _file_info(url, session=session, *args, **kwargs)
894
+ return info.get("size")
895
+
896
+
897
+ file_size = sync_wrapper(_file_size)
venv/lib/python3.10/site-packages/fsspec/implementations/http_sync.py ADDED
@@ -0,0 +1,937 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """This file is largely copied from http.py"""
2
+
3
+ import io
4
+ import logging
5
+ import re
6
+ import urllib.error
7
+ import urllib.parse
8
+ from copy import copy
9
+ from json import dumps, loads
10
+ from urllib.parse import urlparse
11
+
12
+ try:
13
+ import yarl
14
+ except (ImportError, ModuleNotFoundError, OSError):
15
+ yarl = False
16
+
17
+ from fsspec.callbacks import _DEFAULT_CALLBACK
18
+ from fsspec.registry import register_implementation
19
+ from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
20
+ from fsspec.utils import DEFAULT_BLOCK_SIZE, isfilelike, nullcontext, tokenize
21
+
22
+ from ..caching import AllBytes
23
+
24
+ # https://stackoverflow.com/a/15926317/3821154
25
+ ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
26
+ ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
27
+ logger = logging.getLogger("fsspec.http")
28
+
29
+
30
class JsHttpException(urllib.error.HTTPError):
    """HTTP error raised by the JS (XMLHttpRequest) backend, mirroring urllib's."""

    ...
31
+
32
+
33
class StreamIO(io.BytesIO):
    """In-memory stand-in for a streaming response body.

    Fake class, so you can set attributes on it;
    will eventually actually stream.
    """

    ...
37
+
38
+
39
class ResponseProxy:
    """Looks like a requests response, wrapping a completed XMLHttpRequest.

    The body and headers are pulled lazily from the underlying JS request
    object on first access and cached.
    """

    def __init__(self, req, stream=False):
        self.request = req
        self.stream = stream
        self._data = None  # lazily-fetched body (bytes, or StreamIO if streaming)
        self._headers = None  # lazily-parsed header dict

    @property
    def raw(self):
        # Materialise the response body from the XHR on first access.
        if self._data is None:
            b = self.request.response.to_bytes()
            if self.stream:
                self._data = StreamIO(b)
            else:
                self._data = b
        return self._data

    def close(self):
        # BUG FIX: the original did ``del self._data``, which left the
        # instance broken (AttributeError on any later .raw/.content access).
        # Resetting to None simply drops the cached body.
        self._data = None

    @property
    def headers(self):
        if self._headers is None:
            # BUG FIX: split each "Name: value" line on the FIRST ": " only;
            # header values containing ": " (dates, URLs) previously made the
            # dict() construction raise ValueError.
            self._headers = dict(
                line.split(": ", 1)
                for line in self.request.getAllResponseHeaders().strip().split("\r\n")
            )
        return self._headers

    @property
    def status_code(self):
        return int(self.request.status)

    def raise_for_status(self):
        # Mirror requests: raise only for 4xx/5xx responses.
        if not self.ok:
            raise JsHttpException(
                self.url, self.status_code, self.reason, self.headers, None
            )

    def iter_content(self, chunksize, *_, **__):
        """Yield the (already-buffered) body in ``chunksize`` pieces."""
        while True:
            out = self.raw.read(chunksize)
            if out:
                yield out
            else:
                break

    @property
    def reason(self):
        return self.request.statusText

    @property
    def ok(self):
        return self.status_code < 400

    @property
    def url(self):
        # Final URL after any redirects the browser followed.
        return self.request.response.responseURL

    @property
    def text(self):
        # TODO: encoding from headers
        return self.content.decode()

    @property
    def content(self):
        # Force non-streaming so .raw returns plain bytes.
        self.stream = False
        return self.raw

    def json(self):
        return loads(self.text)
115
+
116
+
117
class RequestsSessionShim:
    """Minimal stand-in for ``requests.Session`` backed by the browser.

    Issues *synchronous* XMLHttpRequests through the pyodide ``js`` bridge
    and wraps the result in a :class:`ResponseProxy`. Only the subset of the
    requests API used by this module is implemented.
    """

    def __init__(self):
        # Kept for requests API compatibility; per-request headers are
        # supplied through the individual calls.
        self.headers = {}

    def request(
        self,
        method,
        url,
        params=None,
        data=None,
        headers=None,
        cookies=None,
        files=None,
        auth=None,
        timeout=None,
        allow_redirects=None,
        proxies=None,
        hooks=None,
        stream=None,
        verify=None,
        cert=None,
        json=None,
    ):
        """Perform a blocking HTTP request via XMLHttpRequest.

        Raises NotImplementedError for features the XHR bridge cannot
        express, and ValueError if both ``data`` and ``json`` are given.
        """
        from js import Blob, XMLHttpRequest

        logger.debug("JS request: %s %s", method, url)

        if cert or verify or proxies or files or cookies or hooks:
            raise NotImplementedError
        if data and json:
            raise ValueError("Use json= or data=, not both")
        req = XMLHttpRequest.new()
        extra = auth if auth else ()
        if params:
            url = f"{url}?{urllib.parse.urlencode(params)}"
        # Third argument False => synchronous request; ``extra`` carries the
        # optional (user, password) pair.
        req.open(method, url, False, *extra)
        if timeout:
            req.timeout = timeout
        if headers:
            for k, v in headers.items():
                req.setRequestHeader(k, v)

        req.setRequestHeader("Accept", "application/octet-stream")
        req.responseType = "arraybuffer"
        if json:
            # BUG FIX: serialize the ``json`` payload; the original sent
            # ``dumps(data)``, i.e. the string "null", whenever json= was used.
            # NOTE(review): ``{type: ...}`` deliberately uses the builtin
            # ``type`` as key, mimicking a JS object literal for the bridge —
            # confirm against the pyodide conversion rules.
            blob = Blob.new([dumps(json)], {type: "application/json"})
            req.send(blob)
        elif data:
            if isinstance(data, io.IOBase):
                data = data.read()
            blob = Blob.new([data], {type: "application/octet-stream"})
            req.send(blob)
        else:
            req.send(None)
        return ResponseProxy(req, stream=stream)

    def get(self, url, **kwargs):
        return self.request("GET", url, **kwargs)

    def head(self, url, **kwargs):
        return self.request("HEAD", url, **kwargs)

    def post(self, url, **kwargs):
        # BUG FIX: the method string was "POST}" (stray brace), so every POST
        # went out with an invalid verb.
        return self.request("POST", url, **kwargs)

    def put(self, url, **kwargs):
        return self.request("PUT", url, **kwargs)

    def patch(self, url, **kwargs):
        return self.request("PATCH", url, **kwargs)

    def delete(self, url, **kwargs):
        return self.request("DELETE", url, **kwargs)
190
+
191
+
192
class HTTPFileSystem(AbstractFileSystem):
    """
    Simple File-System for fetching data via HTTP(S)

    This is the BLOCKING version of the normal HTTPFileSystem. It uses
    requests in normal python and the JS runtime in pyodide.

    ***This implementation is extremely experimental, do not use unless
    you are testing pyodide/pyscript integration***
    """

    protocol = ("http", "https", "sync-http", "sync-https")
    sep = "/"

    def __init__(
        self,
        simple_links=True,
        block_size=None,
        same_scheme=True,
        cache_type="readahead",
        cache_options=None,
        client_kwargs=None,
        encoded=False,
        **storage_options,
    ):
        """

        Parameters
        ----------
        block_size: int
            Blocks to read bytes; if 0, will default to raw requests file-like
            objects instead of HTTPFile instances
        simple_links: bool
            If True, will consider both HTML <a> tags and anything that looks
            like a URL; if False, will consider only the former.
        same_scheme: True
            When doing ls/glob, if this is True, only consider paths that have
            http/https matching the input URLs.
        size_policy: this argument is deprecated
        client_kwargs: dict
            Passed to aiohttp.ClientSession, see
            https://docs.aiohttp.org/en/stable/client_reference.html
            For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
        storage_options: key-value
            Any other parameters passed on to requests
        cache_type, cache_options: defaults used in open
        """
        # NOTE(review): the extra positional ``self`` mirrors the async
        # implementation's call — presumably tolerated by the base class;
        # confirm before changing.
        super().__init__(self, **storage_options)
        self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
        self.simple_links = simple_links
        self.same_schema = same_scheme
        self.cache_type = cache_type
        self.cache_options = cache_options
        self.client_kwargs = client_kwargs or {}
        self.encoded = encoded
        self.kwargs = storage_options

        try:
            import js  # noqa: F401

            logger.debug("Starting JS session")
            self.session = RequestsSessionShim()
            self.js = True
        except Exception as e:
            import requests

            logger.debug("Starting cpython session because of: %s", e)
            # BUG FIX: requests.Session() accepts no constructor arguments;
            # the original ``requests.Session(**(client_kwargs or {}))``
            # raised TypeError whenever any client_kwargs were supplied.
            # They remain available on ``self.client_kwargs``.
            self.session = requests.Session()
            self.js = False

        request_options = copy(storage_options)
        # Listings-cache knobs are consumed here, not forwarded to requests.
        self.use_listings_cache = request_options.pop("use_listings_cache", False)
        request_options.pop("listings_expiry_time", None)
        request_options.pop("max_paths", None)
        request_options.pop("skip_instance_cache", None)
        self.kwargs = request_options

    @property
    def fsid(self):
        # Stable identifier for this (blocking) HTTP filesystem flavour.
        return "sync-http"

    def encode_url(self, url):
        # yarl keeps percent-encoding consistent with the async backend;
        # fall back to the raw string when yarl is unavailable.
        if yarl:
            return yarl.URL(url, encoded=self.encoded)
        return url

    @classmethod
    def _strip_protocol(cls, path: str) -> str:
        """For HTTP, we always want to keep the full URL"""
        path = path.replace("sync-http://", "http://").replace(
            "sync-https://", "https://"
        )
        return path

    @classmethod
    def _parent(cls, path):
        # override, since _strip_protocol is different for URLs
        par = super()._parent(path)
        if len(par) > 7:  # "http://..."
            return par
        return ""

    def _ls_real(self, url, detail=True, **kwargs):
        """Scrape links out of the HTML page at ``url`` (directory listing)."""
        # ignoring URL-encoded arguments
        kw = self.kwargs.copy()
        kw.update(kwargs)
        logger.debug(url)
        # BUG FIX: the merged ``kw`` (instance defaults + per-call kwargs)
        # was built and then ignored in favour of ``self.kwargs`` alone.
        r = self.session.get(self.encode_url(url), **kw)
        self._raise_not_found_for_status(r, url)
        text = r.text
        if self.simple_links:
            links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
        else:
            links = [u[2] for u in ex.findall(text)]
        out = set()
        parts = urlparse(url)
        for link in links:
            if isinstance(link, tuple):
                link = link[1]
            if link.startswith("/") and len(link) > 1:
                # absolute URL on this server
                link = parts.scheme + "://" + parts.netloc + link
            if link.startswith("http"):
                if self.same_schema and link.startswith(url.rstrip("/") + "/"):
                    out.add(link)
                elif link.replace("https", "http").startswith(
                    url.replace("https", "http").rstrip("/") + "/"
                ):
                    # allowed to cross http <-> https
                    out.add(link)
            else:
                if link not in ["..", "../"]:
                    # Ignore FTP-like "parent"
                    out.add("/".join([url.rstrip("/"), link.lstrip("/")]))
        if not out and url.endswith("/"):
            out = self._ls_real(url.rstrip("/"), detail=False)
        if detail:
            return [
                {
                    "name": u,
                    "size": None,
                    "type": "directory" if u.endswith("/") else "file",
                }
                for u in out
            ]
        else:
            return sorted(out)

    def ls(self, url, detail=True, **kwargs):
        """List links found at ``url``, optionally via the listings cache."""
        if self.use_listings_cache and url in self.dircache:
            out = self.dircache[url]
        else:
            out = self._ls_real(url, detail=detail, **kwargs)
            self.dircache[url] = out
        return out

    def _raise_not_found_for_status(self, response, url):
        """
        Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
        """
        if response.status_code == 404:
            raise FileNotFoundError(url)
        response.raise_for_status()

    def cat_file(self, url, start=None, end=None, **kwargs):
        """Fetch the body of ``url``, optionally a byte range of it."""
        kw = self.kwargs.copy()
        kw.update(kwargs)
        logger.debug(url)

        if start is not None or end is not None:
            if start == end:
                return b""
            headers = kw.pop("headers", {}).copy()

            headers["Range"] = self._process_limits(url, start, end)
            kw["headers"] = headers
        r = self.session.get(self.encode_url(url), **kw)
        self._raise_not_found_for_status(r, url)
        return r.content

    def get_file(
        self, rpath, lpath, chunk_size=5 * 2**20, callback=_DEFAULT_CALLBACK, **kwargs
    ):
        """Stream the remote file ``rpath`` into local path/file-like ``lpath``."""
        kw = self.kwargs.copy()
        kw.update(kwargs)
        logger.debug(rpath)
        r = self.session.get(self.encode_url(rpath), **kw)
        try:
            size = int(
                r.headers.get("content-length", None)
                or r.headers.get("Content-Length", None)
            )
        except (ValueError, KeyError, TypeError):
            size = None

        callback.set_size(size)
        self._raise_not_found_for_status(r, rpath)
        # BUG FIX: a file opened here was never closed (descriptor leak).
        # Use nullcontext for caller-provided file-likes, which we must not
        # close on their behalf.
        ctx = nullcontext(lpath) if isfilelike(lpath) else open(lpath, "wb")
        with ctx as outfile:
            for chunk in r.iter_content(chunk_size, decode_unicode=False):
                outfile.write(chunk)
                callback.relative_update(len(chunk))

    def put_file(
        self,
        lpath,
        rpath,
        chunk_size=5 * 2**20,
        callback=_DEFAULT_CALLBACK,
        method="post",
        **kwargs,
    ):
        """Upload local path/file-like ``lpath`` to ``rpath`` via POST or PUT."""

        def gen_chunks():
            # Support passing arbitrary file-like objects
            # and use them instead of streams.
            if isinstance(lpath, io.IOBase):
                context = nullcontext(lpath)
                use_seek = False  # might not support seeking
            else:
                context = open(lpath, "rb")
                use_seek = True

            with context as f:
                if use_seek:
                    callback.set_size(f.seek(0, 2))
                    f.seek(0)
                else:
                    callback.set_size(getattr(f, "size", None))

                chunk = f.read(chunk_size)
                while chunk:
                    yield chunk
                    callback.relative_update(len(chunk))
                    chunk = f.read(chunk_size)

        kw = self.kwargs.copy()
        kw.update(kwargs)

        method = method.lower()
        if method not in ("post", "put"):
            raise ValueError(
                f"method has to be either 'post' or 'put', not: {method!r}"
            )

        meth = getattr(self.session, method)
        resp = meth(rpath, data=gen_chunks(), **kw)
        self._raise_not_found_for_status(resp, rpath)

    def _process_limits(self, url, start, end):
        """Helper for "Range"-based _cat_file"""
        size = None
        suff = False
        if start is not None and start < 0:
            # if start is negative and end None, end is the "suffix length"
            if end is None:
                end = -start
                start = ""
                suff = True
            else:
                size = size or self.info(url)["size"]
                start = size + start
        elif start is None:
            start = 0
        if not suff:
            if end is not None and end < 0:
                if start is not None:
                    size = size or self.info(url)["size"]
                    end = size + end
            elif end is None:
                end = ""
            if isinstance(end, int):
                end -= 1  # bytes range is inclusive
        return f"bytes={start}-{end}"

    def exists(self, path, strict=False, **kwargs):
        """Whether ``path`` responds with a non-error status to a GET."""
        kw = self.kwargs.copy()
        kw.update(kwargs)
        try:
            logger.debug(path)
            r = self.session.get(self.encode_url(path), **kw)
            if strict:
                self._raise_not_found_for_status(r, path)
            return r.status_code < 400
        except FileNotFoundError:
            return False
        except Exception:
            # best-effort: any transport error counts as "does not exist"
            # unless the caller asked for strict semantics
            if strict:
                raise
            return False

    def isfile(self, path, **kwargs):
        # Every reachable URL is considered a file.
        return self.exists(path, **kwargs)

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=None,  # XXX: This differs from the base class.
        cache_type=None,
        cache_options=None,
        size=None,
        **kwargs,
    ):
        """Make a file-like object

        Parameters
        ----------
        path: str
            Full URL with protocol
        mode: string
            must be "rb"
        block_size: int or None
            Bytes to download in one request; use instance value if None. If
            zero, will return a streaming Requests file-like instance.
        kwargs: key-value
            Any other parameters, passed to requests calls
        """
        if mode != "rb":
            raise NotImplementedError
        block_size = block_size if block_size is not None else self.block_size
        kw = self.kwargs.copy()
        kw.update(kwargs)
        size = size or self.info(path, **kwargs)["size"]
        # Random access needs a known size; otherwise fall back to streaming.
        if block_size and size:
            return HTTPFile(
                self,
                path,
                session=self.session,
                block_size=block_size,
                mode=mode,
                size=size,
                cache_type=cache_type or self.cache_type,
                cache_options=cache_options or self.cache_options,
                **kw,
            )
        else:
            return HTTPStreamFile(
                self,
                path,
                mode=mode,
                session=self.session,
                **kw,
            )

    def ukey(self, url):
        """Unique identifier; assume HTTP files are static, unchanging"""
        return tokenize(url, self.kwargs, self.protocol)

    def info(self, url, **kwargs):
        """Get info of URL

        Tries to access location via HEAD, and then GET methods, but does
        not fetch the data.

        It is possible that the server does not supply any size information, in
        which case size will be given as None (and certain operations on the
        corresponding file will not work).
        """
        info = {}
        for policy in ["head", "get"]:
            try:
                info.update(
                    _file_info(
                        self.encode_url(url),
                        size_policy=policy,
                        session=self.session,
                        **self.kwargs,
                        **kwargs,
                    )
                )
                if info.get("size") is not None:
                    break
            except Exception as exc:
                if policy == "get":
                    # If get failed, then raise a FileNotFoundError
                    raise FileNotFoundError(url) from exc
                logger.debug(str(exc))

        return {"name": url, "size": None, **info, "type": "file"}

    def glob(self, path, maxdepth=None, **kwargs):
        """
        Find files by glob-matching.

        This implementation is identical to the one in AbstractFileSystem,
        but "?" is not considered as a character for globbing, because it is
        so common in URLs, often identifying the "query" part.
        """
        ends = path.endswith("/")
        path = self._strip_protocol(path)
        indstar = path.find("*") if path.find("*") >= 0 else len(path)
        indbrace = path.find("[") if path.find("[") >= 0 else len(path)

        ind = min(indstar, indbrace)

        detail = kwargs.pop("detail", False)

        if not has_magic(path):
            root = path
            depth = 1
            if ends:
                path += "/*"
            elif self.exists(path):
                if not detail:
                    return [path]
                else:
                    return {path: self.info(path)}
            else:
                if not detail:
                    return []  # glob of non-existent returns empty
                else:
                    return {}
        elif "/" in path[:ind]:
            ind2 = path[:ind].rindex("/")
            root = path[: ind2 + 1]
            depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
        else:
            root = ""
            depth = None if "**" in path else path[ind + 1 :].count("/") + 1

        allpaths = self.find(
            root, maxdepth=maxdepth or depth, withdirs=True, detail=True, **kwargs
        )
        # Escape characters special to python regex, leaving our supported
        # special characters in place.
        # See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html
        # for shell globbing details.
        pattern = (
            "^"
            + (
                path.replace("\\", r"\\")
                .replace(".", r"\.")
                .replace("+", r"\+")
                .replace("//", "/")
                .replace("(", r"\(")
                .replace(")", r"\)")
                .replace("|", r"\|")
                .replace("^", r"\^")
                .replace("$", r"\$")
                .replace("{", r"\{")
                .replace("}", r"\}")
                .rstrip("/")
            )
            + "$"
        )
        # "**" matches across path separators, "*" within one component.
        pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
        pattern = re.sub("[*]", "[^/]*", pattern)
        pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
        out = {
            p: allpaths[p]
            for p in sorted(allpaths)
            if pattern.match(p.replace("//", "/").rstrip("/"))
        }
        if detail:
            return out
        else:
            return list(out)

    def isdir(self, path):
        # override, since all URLs are (also) files
        try:
            return bool(self.ls(path))
        except (FileNotFoundError, ValueError):
            return False
659
+
660
+
661
class HTTPFile(AbstractBufferedFile):
    """
    A file-like object pointing to a remote HTTP(S) resource

    Supports only reading, with read-ahead of a predetermined block-size.

    In the case that the server does not supply the filesize, only reading of
    the complete file in one go is supported.

    Parameters
    ----------
    url: str
        Full URL of the remote resource, including the protocol
    session: requests.Session or None
        All calls will be made within this session, to avoid restarting
        connections where the server allows this
    block_size: int or None
        The amount of read-ahead to do, in bytes. Default is 5MB, or the value
        configured for the FileSystem creating this file
    size: None or int
        If given, this is the size of the file in bytes, and we don't attempt
        to call the server to find the value.
    kwargs: all other key-values are passed to requests calls.
    """

    def __init__(
        self,
        fs,
        url,
        session=None,
        block_size=None,
        mode="rb",
        cache_type="bytes",
        cache_options=None,
        size=None,
        **kwargs,
    ):
        # Only read-only access is implemented.
        if mode != "rb":
            raise NotImplementedError("File mode not supported")
        self.url = url
        self.session = session
        # Pre-seed details so the base class does not have to look them up.
        self.details = {"name": url, "size": size, "type": "file"}
        super().__init__(
            fs=fs,
            path=url,
            mode=mode,
            block_size=block_size,
            cache_type=cache_type,
            cache_options=cache_options,
            **kwargs,
        )

    def read(self, length=-1):
        """Read bytes from file

        Parameters
        ----------
        length: int
            Read up to this many bytes. If negative, read all content to end of
            file. If the server has not supplied the filesize, attempting to
            read only part of the data will raise a ValueError.
        """
        if (
            (length < 0 and self.loc == 0)  # explicit read all
            # but not when the size is known and fits into a block anyways
            and not (self.size is not None and self.size <= self.blocksize)
        ):
            self._fetch_all()
        if self.size is None:
            if length < 0:
                self._fetch_all()
        else:
            # Clamp to the known remaining length.
            length = min(self.size - self.loc, length)
        return super().read(length)

    def _fetch_all(self):
        """Read whole file in one shot, without caching

        This is only called when position is still at zero,
        and read() is called without a byte-count.
        """
        logger.debug(f"Fetch all for {self}")
        if not isinstance(self.cache, AllBytes):
            r = self.session.get(self.fs.encode_url(self.url), **self.kwargs)
            r.raise_for_status()
            out = r.content
            # Replace the block cache with one that already holds everything.
            self.cache = AllBytes(size=len(out), fetcher=None, blocksize=None, data=out)
            self.size = len(out)

    def _parse_content_range(self, headers):
        """Parse the Content-Range header into (start, end, total).

        Any element may be None when the header is absent or uses "*".
        """
        s = headers.get("Content-Range", "")
        m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
        if not m:
            return None, None, None

        if m[1] == "*":
            start = end = None
        else:
            start, end = [int(x) for x in m[1].split("-")]
        total = None if m[2] == "*" else int(m[2])
        return start, end, total

    def _fetch_range(self, start, end):
        """Download a block of data

        The expectation is that the server returns only the requested bytes,
        with HTTP code 206. If this is not the case, we first check the headers,
        and then stream the output - if the data size is bigger than we
        requested, an exception is raised.
        """
        logger.debug(f"Fetch range for {self}: {start}-{end}")
        kwargs = self.kwargs.copy()
        headers = kwargs.pop("headers", {}).copy()
        # HTTP Range is inclusive at both ends, hence end - 1.
        headers["Range"] = f"bytes={start}-{end - 1}"
        logger.debug("%s : %s", self.url, headers["Range"])
        r = self.session.get(self.fs.encode_url(self.url), headers=headers, **kwargs)
        if r.status_code == 416:
            # range request outside file
            return b""
        r.raise_for_status()

        # If the server has handled the range request, it should reply
        # with status 206 (partial content). But we'll guess that a suitable
        # Content-Range header or a Content-Length no more than the
        # requested range also mean we have got the desired range.
        cl = r.headers.get("Content-Length", r.headers.get("content-length", end + 1))
        response_is_range = (
            r.status_code == 206
            or self._parse_content_range(r.headers)[0] == start
            or int(cl) <= end - start
        )

        if response_is_range:
            # partial content, as expected
            out = r.content
        elif start > 0:
            raise ValueError(
                "The HTTP server doesn't appear to support range requests. "
                "Only reading this file from the beginning is supported. "
                "Open with block_size=0 for a streaming file interface."
            )
        else:
            # Response is not a range, but we want the start of the file,
            # so we can read the required amount anyway.
            cl = 0
            out = []
            for chunk in r.iter_content(2**20, False):
                out.append(chunk)
                cl += len(chunk)
            out = b"".join(out)[: end - start]
        return out
813
+
814
+
815
# Characters that trigger glob expansion. "?" is deliberately excluded,
# since it commonly introduces the query part of a URL.
magic_check = re.compile("([*[])")


def has_magic(s):
    """Return True if *s* contains any glob special character."""
    return magic_check.search(s) is not None
821
+
822
+
823
class HTTPStreamFile(AbstractBufferedFile):
    """Streaming (non-seekable) file-like object over HTTP.

    Used when the server supplies no size, or when block_size=0 was
    requested: the body is consumed strictly front-to-back from one GET.
    """

    def __init__(self, fs, url, mode="rb", session=None, **kwargs):
        self.url = url
        self.session = session
        if mode != "rb":
            raise ValueError
        # Size is unknown for a pure stream.
        self.details = {"name": url, "size": None}
        super().__init__(fs=fs, path=url, mode=mode, cache_type="readahead", **kwargs)

        # Start the streaming GET immediately; bytes are pulled lazily
        # through the chunk iterator below.
        r = self.session.get(self.fs.encode_url(url), stream=True, **kwargs)
        self.fs._raise_not_found_for_status(r, url)
        self.it = r.iter_content(1024, False)
        # Bytes fetched from the wire but not yet handed to the caller.
        self.leftover = b""

        self.r = r

    def seek(self, *args, **kwargs):
        # A single-pass HTTP stream cannot rewind.
        raise ValueError("Cannot seek streaming HTTP file")

    def read(self, num=-1):
        """Read up to *num* bytes (all remaining if negative)."""
        # Accumulate chunks until we have num bytes or the stream is drained.
        bufs = [self.leftover]
        leng = len(self.leftover)
        while leng < num or num < 0:
            try:
                out = self.it.__next__()
            except StopIteration:
                break
            if out:
                bufs.append(out)
            else:
                break
            leng += len(out)
        out = b"".join(bufs)
        if num >= 0:
            # Keep any over-read bytes for the next call.
            self.leftover = out[num:]
            out = out[:num]
        else:
            self.leftover = b""
        self.loc += len(out)
        return out

    def close(self):
        # Release the underlying HTTP response and mark the file closed.
        self.r.close()
        self.closed = True
867
+
868
+
869
def get_range(session, url, start, end, **kwargs):
    """Fetch bytes [start, end) of *url* with an explicit Range request.

    Only used when we already know the server honours range requests;
    the response is not second-guessed, just raised on HTTP error.
    """
    opts = kwargs.copy()
    range_headers = opts.pop("headers", {}).copy()
    # HTTP ranges are inclusive at both ends, hence end - 1.
    range_headers["Range"] = f"bytes={start}-{end - 1}"
    resp = session.get(url, headers=range_headers, **opts)
    resp.raise_for_status()
    return resp.content
877
+
878
+
879
def _file_info(url, session, size_policy="head", **kwargs):
    """Call HEAD on the server to get details about the file (size/checksum etc.)

    Default operation is to explicitly allow redirects; unlike the async
    version, the Accept-Encoding header cannot be forced in JS.
    """
    logger.debug("Retrieve file size for %s", url)
    opts = kwargs.copy()
    redirects = opts.pop("allow_redirects", True)
    hdrs = opts.get("headers", {}).copy()
    # TODO: not allowed in JS
    # hdrs["Accept-Encoding"] = "identity"
    opts["headers"] = hdrs

    if size_policy == "head":
        r = session.head(url, allow_redirects=redirects, **opts)
    elif size_policy == "get":
        r = session.get(url, allow_redirects=redirects, **opts)
    else:
        raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
    r.raise_for_status()

    # TODO:
    # recognise lack of 'Accept-Ranges',
    # or 'Accept-Ranges': 'none' (not 'bytes')
    # to mean streaming only, no random access => return None
    info = {}
    headers = r.headers
    if "Content-Length" in headers:
        info["size"] = int(headers["Content-Length"])
    elif "Content-Range" in headers:
        info["size"] = int(headers["Content-Range"].split("/")[1])
    elif "content-length" in headers:
        info["size"] = int(headers["content-length"])
    elif "content-range" in headers:
        info["size"] = int(headers["content-range"].split("/")[1])

    for checksum_field in ("ETag", "Content-MD5", "Digest"):
        value = headers.get(checksum_field)
        if value:
            info[checksum_field] = value

    return info
920
+
921
+
922
# importing this is enough to register it
def register():
    """Register this blocking implementation for the HTTP protocols.

    Clobbers the default (async) registrations for "http"/"https" and adds
    the explicit "sync-http"/"sync-https" aliases.
    """
    register_implementation("http", HTTPFileSystem, clobber=True)
    register_implementation("https", HTTPFileSystem, clobber=True)
    register_implementation("sync-http", HTTPFileSystem, clobber=True)
    register_implementation("sync-https", HTTPFileSystem, clobber=True)


# registration happens as a side effect of importing this module
register()
931
+
932
+
933
def unregister():
    """Restore the default async HTTP filesystem for "http"/"https".

    NOTE(review): the "sync-http"/"sync-https" registrations are left in
    place — presumably intentional, since only this module provides them;
    confirm if full rollback is expected.
    """
    from fsspec.implementations.http import HTTPFileSystem

    register_implementation("http", HTTPFileSystem, clobber=True)
    register_implementation("https", HTTPFileSystem, clobber=True)
venv/lib/python3.10/site-packages/fsspec/implementations/libarchive.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import contextmanager
2
+ from ctypes import (
3
+ CFUNCTYPE,
4
+ POINTER,
5
+ c_int,
6
+ c_longlong,
7
+ c_void_p,
8
+ cast,
9
+ create_string_buffer,
10
+ )
11
+
12
+ import libarchive
13
+ import libarchive.ffi as ffi
14
+
15
+ from fsspec import open_files
16
+ from fsspec.archive import AbstractArchiveFileSystem
17
+ from fsspec.implementations.memory import MemoryFile
18
+ from fsspec.utils import DEFAULT_BLOCK_SIZE
19
+
20
# Libarchive requires seekable files or memory only for certain archive
# types. However, since we read the directory first to cache the contents
# and also allow random access to any file, the file-like object needs
# to be seekable no matter what.

# Seek call-backs (not provided in the libarchive python wrapper).
# Signature mirrors libarchive's archive_seek_callback:
# (archive*, client_data, offset, whence) -> resulting absolute offset.
SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int)
# Bind archive_read_set_seek_callback via the wrapper's ffi helper
# (name, argtypes, restype, error-checker).
read_set_seek_callback = ffi.ffi(
    "read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int
)
# Newer python-libarchive-c exposes NO_OPEN_CB/NO_CLOSE_CB sentinels instead
# of requiring explicit no-op callbacks; detect which API we are on.
new_api = hasattr(ffi, "NO_OPEN_CB")
31
+
32
+
33
@contextmanager
def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size):
    """Read an archive from a seekable file-like object.

    The `file` object must support the standard `readinto` and 'seek' methods.

    Yields a ``libarchive.read.ArchiveRead`` wired to read from ``file``
    through ctypes callbacks; the single shared buffer below is reused for
    every read, so blocks handed to libarchive are only valid until the
    next read call.
    """
    # One reusable buffer shared by all read callbacks (not thread-safe)
    buf = create_string_buffer(block_size)
    buf_p = cast(buf, c_void_p)

    def read_func(archive_p, context, ptrptr):
        # readinto the buffer, returns number of bytes read
        length = file.readinto(buf)
        # write the address of the buffer into the pointer
        ptrptr = cast(ptrptr, POINTER(c_void_p))
        ptrptr[0] = buf_p
        # tell libarchive how much data was written into the buffer
        return length

    def seek_func(archive_p, context, offset, whence):
        file.seek(offset, whence)
        # tell libarchive the current position
        return file.tell()

    # Keep references to the callback objects for the lifetime of the
    # context so ctypes does not garbage-collect them while in use.
    read_cb = ffi.READ_CALLBACK(read_func)
    seek_cb = SEEK_CALLBACK(seek_func)

    if new_api:
        open_cb = ffi.NO_OPEN_CB
        close_cb = ffi.NO_CLOSE_CB
    else:
        open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB)
        close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB)

    with libarchive.read.new_archive_read(format_name, filter_name) as archive_p:
        read_set_seek_callback(archive_p, seek_cb)
        ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
        yield libarchive.read.ArchiveRead(archive_p)
70
+
71
+
72
class LibArchiveFileSystem(AbstractArchiveFileSystem):
    """Compressed archives as a file-system (read-only)

    Supports the following formats:
    tar, pax , cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar
    Microsoft CAB, 7-Zip, WARC

    See the libarchive documentation for further restrictions.
    https://www.libarchive.org/

    Keeps file object open while instance lives. It only works in seekable
    file-like objects. In case the filesystem does not support this kind of
    file object, it is recommended to cache locally.

    This class is pickleable, but not necessarily thread-safe (depends on the
    platform). See libarchive documentation for details.
    """

    root_marker = ""
    protocol = "libarchive"
    # archives are stateful (open file handle), so never cache instances
    cachable = False

    def __init__(
        self,
        fo="",
        mode="r",
        target_protocol=None,
        target_options=None,
        block_size=DEFAULT_BLOCK_SIZE,
        **kwargs,
    ):
        """
        Parameters
        ----------
        fo: str or file-like
            Contains ZIP, and must exist. If a str, will fetch file using
            :meth:`~fsspec.open_files`, which must return one file exactly.
        mode: str
            Currently, only 'r' accepted
        target_protocol: str (optional)
            If ``fo`` is a string, this value can be used to override the
            FS protocol inferred from a URL
        target_options: dict (optional)
            Kwargs passed when instantiating the target FS, if ``fo`` is
            a string.
        """
        super().__init__(self, **kwargs)
        if mode != "r":
            raise ValueError("Only read from archive files accepted")
        if isinstance(fo, str):
            files = open_files(fo, protocol=target_protocol, **(target_options or {}))
            if len(files) != 1:
                raise ValueError(
                    f'Path "{fo}" did not resolve to exactly one file: "{files}"'
                )
            fo = files[0]
        # keep the OpenFile so the underlying handle stays alive / can be closed
        self.of = fo
        self.fo = fo.__enter__()  # the whole instance is a context
        self.block_size = block_size
        # lazy listing cache, populated by _get_dirs()
        self.dir_cache = None

    @contextmanager
    def _open_archive(self):
        # Rewind and build a fresh libarchive reader over the same handle;
        # libarchive readers are forward-only, so each operation re-opens.
        self.fo.seek(0)
        with custom_reader(self.fo, block_size=self.block_size) as arc:
            yield arc

    @classmethod
    def _strip_protocol(cls, path):
        # file paths are always relative to the archive root
        return super()._strip_protocol(path).lstrip("/")

    def _get_dirs(self):
        """Populate ``self.dir_cache`` with one entry per file/directory."""
        # mapping of listing keys -> libarchive entry attribute names
        fields = {
            "name": "pathname",
            "size": "size",
            "created": "ctime",
            "mode": "mode",
            "uid": "uid",
            "gid": "gid",
            "mtime": "mtime",
        }

        if self.dir_cache is not None:
            return

        self.dir_cache = {}
        list_names = []
        with self._open_archive() as arc:
            for entry in arc:
                if not entry.isdir and not entry.isfile:
                    # Skip symbolic links, fifo entries, etc.
                    continue
                # NOTE(review): ``set(entry.name)`` splits the path string
                # into individual characters, so this per-entry pass likely
                # adds no useful directory entries; the post-loop update over
                # ``list_names`` below appears to do the real work — confirm.
                self.dir_cache.update(
                    {
                        dirname: {"name": dirname, "size": 0, "type": "directory"}
                        for dirname in self._all_dirnames(set(entry.name))
                    }
                )
                f = {key: getattr(entry, fields[key]) for key in fields}
                f["type"] = "directory" if entry.isdir else "file"
                list_names.append(entry.name)

                self.dir_cache[f["name"]] = f
        # libarchive does not seem to return an entry for the directories (at least
        # not in all formats), so get the directories names from the files names
        self.dir_cache.update(
            {
                dirname: {"name": dirname, "size": 0, "type": "directory"}
                for dirname in self._all_dirnames(list_names)
            }
        )

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Read one archive member fully into memory and return a MemoryFile.

        The whole archive is scanned sequentially until ``path`` is found;
        random access inside the archive is not possible.
        """
        path = self._strip_protocol(path)
        if mode != "rb":
            raise NotImplementedError

        data = b""
        with self._open_archive() as arc:
            for entry in arc:
                if entry.pathname != path:
                    continue

                if entry.size == 0:
                    # empty file, so there are no blocks
                    break

                # NOTE(review): only the first block is kept; the block size
                # requested equals entry.size, presumably yielding the whole
                # member in one block — confirm against libarchive behavior.
                for block in entry.get_blocks(entry.size):
                    data = block
                    break
                else:
                    raise ValueError
        return MemoryFile(fs=self, path=path, data=data)
venv/lib/python3.10/site-packages/fsspec/parquet.py ADDED
@@ -0,0 +1,572 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import json
3
+ import warnings
4
+
5
+ import fsspec
6
+
7
+ from .core import url_to_fs
8
+ from .spec import AbstractBufferedFile
9
+ from .utils import merge_offset_ranges
10
+
11
+ # Parquet-Specific Utilities for fsspec
12
+ #
13
+ # Most of the functions defined in this module are NOT
14
+ # intended for public consumption. The only exception
15
+ # to this is `open_parquet_file`, which should be used
16
+ # place of `fs.open()` to open parquet-formatted files
17
+ # on remote file systems.
18
+
19
+
20
class AlreadyBufferedFile(AbstractBufferedFile):
    """A buffered file whose data blocks are supplied up-front.

    Used with the "parts" (``KnownPartsOfAFile``) cache: every byte range a
    reader may request is already in the cache, so fetching from a backend
    must never happen.
    """

    def _fetch_range(self, start, end):
        # any call here means a byte range was requested that was not
        # pre-loaded into the "parts" cache — that is a programming error
        raise NotImplementedError
23
+
24
+
25
def open_parquet_files(
    path: str | list[str],
    fs: None | fsspec.AbstractFileSystem = None,
    metadata=None,
    columns: None | list[str] = None,
    row_groups: None | list[int] = None,
    storage_options: None | dict = None,
    engine: str = "auto",
    max_gap: int = 64_000,
    max_block: int = 256_000_000,
    footer_sample_size: int = 1_000_000,
    filters: None | list[list[list[str]]] = None,
    **kwargs,
):
    """
    Return a list of file-like objects, one per matching Parquet file.

    The specified parquet `engine` will be used to parse the
    footer metadata, and determine the required byte ranges
    from the file. The target path will then be opened with
    the "parts" (`KnownPartsOfAFile`) caching strategy.

    Note that this method is intended for usage with remote
    file systems, and is unlikely to improve parquet-read
    performance on local file systems.

    Parameters
    ----------
    path: str or list of str
        Target file path(s). A single string may also be a glob pattern
        (containing ``*``) or a directory path ending in ``/``.
    metadata: Any, optional
        Parquet metadata object. Object type must be supported
        by the backend parquet engine. For now, only the "fastparquet"
        engine supports an explicit `ParquetFile` metadata object.
        If a metadata object is supplied, the remote footer metadata
        will not need to be transferred into local memory.
    fs: AbstractFileSystem, optional
        Filesystem object to use for opening the file. If nothing is
        specified, an `AbstractFileSystem` object will be inferred.
    engine : str, default "auto"
        Parquet engine to use for metadata parsing. Allowed options
        include "fastparquet", "pyarrow", and "auto". The specified
        engine must be installed in the current environment. If
        "auto" is specified, and both engines are installed,
        "fastparquet" will take precedence over "pyarrow".
    columns: list, optional
        List of all column names that may be read from the file.
    row_groups : list, optional
        List of all row-groups that may be read from the file. This
        may be a list of row-group indices (integers), or it may be
        a list of `RowGroup` metadata objects (if the "fastparquet"
        engine is used).
    storage_options : dict, optional
        Used to generate an `AbstractFileSystem` object if `fs` was
        not specified.
    max_gap : int, optional
        Neighboring byte ranges will only be merged when their
        inter-range gap is <= `max_gap`. Default is 64KB.
    max_block : int, optional
        Neighboring byte ranges will only be merged when the size of
        the aggregated range is <= `max_block`. Default is 256MB.
    footer_sample_size : int, optional
        Number of bytes to read from the end of the path to look
        for the footer metadata. If the sampled bytes do not contain
        the footer, a second read request will be required, and
        performance will suffer. Default is 1MB.
    filters : list[list], optional
        List of filters to apply to prevent reading row groups, of the
        same format as accepted by the loading engines. Ignored if
        ``row_groups`` is specified.
    **kwargs :
        Optional key-word arguments to pass to `fs.open`
    """

    # Make sure we have an `AbstractFileSystem` object
    # to work with; `path0` keeps the caller's original value so we can
    # tell a list input apart from a single (possibly glob/dir) path.
    if fs is None:
        path0 = path
        if isinstance(path, (list, tuple)):
            path = path[0]
        fs, path = url_to_fs(path, **(storage_options or {}))
    else:
        path0 = path

    # For now, `columns == []` not supported, is the same
    # as all columns
    if columns is not None and len(columns) == 0:
        columns = None

    # Set the engine
    engine = _set_engine(engine)

    # Expand the input to a concrete list of parquet file paths
    if isinstance(path0, (list, tuple)):
        paths = path0
    elif "*" in path:
        paths = fs.glob(path)
    elif path0.endswith("/"):  # or fs.isdir(path):
        paths = [
            _
            for _ in fs.find(path, withdirs=False, detail=False)
            if _.endswith((".parquet", ".parq"))
        ]
    else:
        paths = [path]

    # Mapping of {path: {(start, stop): bytes}} for every byte range needed
    data = _get_parquet_byte_ranges(
        paths,
        fs,
        metadata=metadata,
        columns=columns,
        row_groups=row_groups,
        engine=engine,
        max_gap=max_gap,
        max_block=max_block,
        footer_sample_size=footer_sample_size,
        filters=filters,
    )

    # Build one pre-loaded file per path, with "parts" caching.
    # NOTE(review): `size` is taken as the largest known range end, which
    # presumes the footer range reaches the true end of the file — confirm.
    options = kwargs.pop("cache_options", {}).copy()
    return [
        AlreadyBufferedFile(
            fs=None,
            path=fn,
            mode="rb",
            cache_type="parts",
            cache_options={
                **options,
                "data": ranges,
            },
            size=max(_[1] for _ in ranges),
            **kwargs,
        )
        for fn, ranges in data.items()
    ]
160
+
161
+
162
def open_parquet_file(*args, **kwargs):
    """Create a file tailored to reading specific parts of a parquet file.

    Please see ``open_parquet_files`` for details of the arguments. The
    difference is, this function always returns a single
    ``AlreadyBufferedFile``, whereas ``open_parquet_files`` always returns a
    list of files, even if there are one or zero matching parquet files.

    Raises ``IndexError`` if no parquet file matches the given path.
    """
    return open_parquet_files(*args, **kwargs)[0]
171
+
172
+
173
def _get_parquet_byte_ranges(
    paths,
    fs,
    metadata=None,
    columns=None,
    row_groups=None,
    max_gap=64_000,
    max_block=256_000_000,
    footer_sample_size=1_000_000,
    engine="auto",
    filters=None,
):
    """Get a dictionary of the known byte ranges needed
    to read a specific column/row-group selection from a
    Parquet dataset. Each value in the output dictionary
    is intended for use as the `data` argument for the
    `KnownPartsOfAFile` caching strategy of a single path.

    Parameters mirror ``open_parquet_files``; ``engine`` may be either an
    engine name or an already-initialized engine object.
    """

    # Set engine if necessary
    if isinstance(engine, str):
        engine = _set_engine(engine)

    # Pass to a specialized function if metadata is defined
    if metadata is not None:
        # Use the provided parquet metadata object
        # to avoid transferring/parsing footer metadata
        return _get_parquet_byte_ranges_from_metadata(
            metadata,
            fs,
            engine,
            columns=columns,
            row_groups=row_groups,
            max_gap=max_gap,
            max_block=max_block,
            filters=filters,
        )

    # Populate global paths, starts, & ends
    if columns is None and row_groups is None and filters is None:
        # We are NOT selecting specific columns or row-groups.
        #
        # We can avoid sampling the footers, and just transfer
        # all file data with cat
        result = {path: {(0, len(data)): data} for path, data in fs.cat(paths).items()}
    else:
        # We ARE selecting specific columns or row-groups.
        #
        # Get file sizes asynchronously
        file_sizes = fs.sizes(paths)
        data_paths = []
        data_starts = []
        data_ends = []
        # Gather file footers.
        # We just take the last `footer_sample_size` bytes of each
        # file (or the entire file if it is smaller than that)
        footer_starts = [
            max(0, file_size - footer_sample_size) for file_size in file_sizes
        ]
        footer_samples = fs.cat_ranges(paths, footer_starts, file_sizes)

        # Check our footer samples and re-sample if necessary.
        # The last 8 bytes of a parquet file are the little-endian
        # footer-metadata length followed by the b"PAR1" magic.
        large_footer = []
        for i, _path in enumerate(paths):
            footer_size = int.from_bytes(footer_samples[i][-8:-4], "little")
            real_footer_start = file_sizes[i] - (footer_size + 8)
            if real_footer_start < footer_starts[i]:
                # sample was too small: remember (original index, true start)
                large_footer.append((i, real_footer_start))
        if large_footer:
            warnings.warn(
                f"Not enough data was used to sample the parquet footer. "
                f"Try setting footer_sample_size >= "
                f"{max(file_sizes[i] - start for i, start in large_footer)}."
            )
            path0 = [paths[i] for i, _ in large_footer]
            starts = [start for _, start in large_footer]
            # Read up to where the existing sample began, so the two pieces
            # concatenate into one contiguous footer block
            ends = [footer_starts[i] for i, _ in large_footer]
            data = fs.cat_ranges(path0, starts, ends)
            # BUG FIX: update entries at their ORIGINAL index in
            # footer_samples/footer_starts (the previous code indexed with
            # the position inside the `large_footer` subset, which corrupted
            # the wrong paths whenever the first footer fit in the sample)
            for (orig_i, start), block in zip(large_footer, data):
                footer_samples[orig_i] = block + footer_samples[orig_i]
                footer_starts[orig_i] = start
        result = {
            path: {(start, size): data}
            for path, start, size, data in zip(
                paths, footer_starts, file_sizes, footer_samples
            )
        }

        # Calculate required byte ranges for each path
        for i, path in enumerate(paths):
            # Use "engine" to collect data byte ranges
            path_data_starts, path_data_ends = engine._parquet_byte_ranges(
                columns,
                row_groups=row_groups,
                footer=footer_samples[i],
                footer_start=footer_starts[i],
                filters=filters,
            )

            data_paths += [path] * len(path_data_starts)
            data_starts += path_data_starts
            data_ends += path_data_ends

        # Merge adjacent offset ranges
        data_paths, data_starts, data_ends = merge_offset_ranges(
            data_paths,
            data_starts,
            data_ends,
            max_gap=max_gap,
            max_block=max_block,
            sort=True,
        )

        # Transfer the data byte-ranges into local memory
        _transfer_ranges(fs, result, data_paths, data_starts, data_ends)

    # Add b"PAR1" to headers
    _add_header_magic(result)

    return result
292
+
293
+
294
def _get_parquet_byte_ranges_from_metadata(
    metadata,
    fs,
    engine,
    columns=None,
    row_groups=None,
    max_gap=64_000,
    max_block=256_000_000,
    filters=None,
):
    """Simplified version of `_get_parquet_byte_ranges` for
    the case that an engine-specific `metadata` object is
    provided, and the remote footer metadata does not need to
    be transferred before calculating the required byte ranges.
    """

    # The engine works out the raw byte ranges from the metadata object;
    # since metadata may span multiple files, paths are returned too.
    raw = engine._parquet_byte_ranges(
        columns, row_groups=row_groups, metadata=metadata, filters=filters
    )

    # Coalesce neighboring ranges (already sorted by the engine)
    paths, starts, ends = merge_offset_ranges(
        *raw,
        max_gap=max_gap,
        max_block=max_block,
        sort=False,  # Should be sorted
    )

    # Fetch the ranges into an empty per-path mapping
    result = {}
    for fn in paths:
        result.setdefault(fn, {})
    _transfer_ranges(fs, result, paths, starts, ends)

    # Every file also needs the leading b"PAR1" magic
    _add_header_magic(result)

    return result
333
+
334
+
335
+ def _transfer_ranges(fs, blocks, paths, starts, ends):
336
+ # Use cat_ranges to gather the data byte_ranges
337
+ ranges = (paths, starts, ends)
338
+ for path, start, stop, data in zip(*ranges, fs.cat_ranges(*ranges)):
339
+ blocks[path][(start, stop)] = data
340
+
341
+
342
+ def _add_header_magic(data):
343
+ # Add b"PAR1" to file headers
344
+ for path in list(data):
345
+ add_magic = True
346
+ for k in data[path]:
347
+ if k[0] == 0 and k[1] >= 4:
348
+ add_magic = False
349
+ break
350
+ if add_magic:
351
+ data[path][(0, 4)] = b"PAR1"
352
+
353
+
354
+ def _set_engine(engine_str):
355
+ # Define a list of parquet engines to try
356
+ if engine_str == "auto":
357
+ try_engines = ("fastparquet", "pyarrow")
358
+ elif not isinstance(engine_str, str):
359
+ raise ValueError(
360
+ "Failed to set parquet engine! "
361
+ "Please pass 'fastparquet', 'pyarrow', or 'auto'"
362
+ )
363
+ elif engine_str not in ("fastparquet", "pyarrow"):
364
+ raise ValueError(f"{engine_str} engine not supported by `fsspec.parquet`")
365
+ else:
366
+ try_engines = [engine_str]
367
+
368
+ # Try importing the engines in `try_engines`,
369
+ # and choose the first one that succeeds
370
+ for engine in try_engines:
371
+ try:
372
+ if engine == "fastparquet":
373
+ return FastparquetEngine()
374
+ elif engine == "pyarrow":
375
+ return PyarrowEngine()
376
+ except ImportError:
377
+ pass
378
+
379
+ # Raise an error if a supported parquet engine
380
+ # was not found
381
+ raise ImportError(
382
+ f"The following parquet engines are not installed "
383
+ f"in your python environment: {try_engines}."
384
+ f"Please install 'fastparquert' or 'pyarrow' to "
385
+ f"utilize the `fsspec.parquet` module."
386
+ )
387
+
388
+
389
class FastparquetEngine:
    # The purpose of the FastparquetEngine class is
    # to check if fastparquet can be imported (on initialization)
    # and to define a `_parquet_byte_ranges` method. In the
    # future, this class may also be used to define other
    # methods/logic that are specific to fastparquet.

    def __init__(self):
        # raises ImportError when fastparquet is unavailable,
        # which _set_engine catches to fall through to the next engine
        import fastparquet as fp

        self.fp = fp

    def _parquet_byte_ranges(
        self,
        columns,
        row_groups=None,
        metadata=None,
        footer=None,
        footer_start=None,
        filters=None,
    ):
        """Compute the byte ranges required for ``columns``/``row_groups``.

        Returns ``(starts, ends)`` when parsing a raw ``footer``, or
        ``(paths, starts, ends)`` when an explicit ``metadata``
        (ParquetFile) object is given, since its row-groups may live in
        multiple files.
        """
        # Initialize offset ranges and define ParquetFile metadata
        pf = metadata
        data_paths, data_starts, data_ends = [], [], []
        if filters and row_groups:
            raise ValueError("filters and row_groups cannot be used together")
        if pf is None:
            pf = self.fp.ParquetFile(io.BytesIO(footer))

        # Convert columns to a set and add any index columns
        # specified in the pandas metadata (just in case)
        column_set = None if columns is None else {c.split(".", 1)[0] for c in columns}
        if column_set is not None and hasattr(pf, "pandas_metadata"):
            md_index = [
                ind
                for ind in pf.pandas_metadata.get("index_columns", [])
                # Ignore RangeIndex information
                if not isinstance(ind, dict)
            ]
            column_set |= set(md_index)

        # Check if row_groups is a list of integers
        # or a list of row-group metadata
        if filters:
            from fastparquet.api import filter_row_groups

            row_group_indices = None
            row_groups = filter_row_groups(pf, filters)
        elif row_groups and not isinstance(row_groups[0], int):
            # Input row_groups contains row-group metadata
            row_group_indices = None
        else:
            # Input row_groups contains row-group indices
            row_group_indices = row_groups
            row_groups = pf.row_groups
        # normalize every requested column to its path components
        # (e.g. "a.b" -> ["a", "b"]) for prefix matching in _cmp
        if column_set is not None:
            column_set = [
                _ if isinstance(_, list) else _.split(".") for _ in column_set
            ]

        # Loop through column chunks to add required byte ranges
        for r, row_group in enumerate(row_groups):
            # Skip this row-group if we are targeting
            # specific row-groups
            if row_group_indices is None or r in row_group_indices:
                # Find the target parquet-file path for `row_group`
                fn = pf.row_group_filename(row_group)

                for column in row_group.columns:
                    name = column.meta_data.path_in_schema
                    # Skip this column if we are targeting specific columns
                    if column_set is None or _cmp(name, column_set):
                        # dictionary page (when present) precedes data pages
                        file_offset0 = column.meta_data.dictionary_page_offset
                        if file_offset0 is None:
                            file_offset0 = column.meta_data.data_page_offset
                        num_bytes = column.meta_data.total_compressed_size
                        # ranges at/after footer_start are already in memory
                        if footer_start is None or file_offset0 < footer_start:
                            data_paths.append(fn)
                            data_starts.append(file_offset0)
                            data_ends.append(
                                min(
                                    file_offset0 + num_bytes,
                                    footer_start or (file_offset0 + num_bytes),
                                )
                            )

        if metadata:
            # The metadata in this call may map to multiple
            # file paths. Need to include `data_paths`
            return data_paths, data_starts, data_ends
        return data_starts, data_ends
480
+
481
+
482
class PyarrowEngine:
    # The purpose of the PyarrowEngine class is
    # to check if pyarrow can be imported (on initialization)
    # and to define a `_parquet_byte_ranges` method. In the
    # future, this class may also be used to define other
    # methods/logic that are specific to pyarrow.

    def __init__(self):
        # raises ImportError when pyarrow is unavailable,
        # which _set_engine catches to fall through to the next engine
        import pyarrow.parquet as pq

        self.pq = pq

    def _parquet_byte_ranges(
        self,
        columns,
        row_groups=None,
        metadata=None,
        footer=None,
        footer_start=None,
        filters=None,
    ):
        """Compute ``(starts, ends)`` byte ranges from a raw ``footer``.

        Unlike the fastparquet engine, an explicit ``metadata`` object and
        ``filters`` are not supported. ``footer_start`` is required here
        (it is compared against each column-chunk offset below).
        """
        if metadata is not None:
            raise ValueError("metadata input not supported for PyarrowEngine")
        if filters:
            # there must be a way!
            raise NotImplementedError

        data_starts, data_ends = [], []
        md = self.pq.ParquetFile(io.BytesIO(footer)).metadata

        # Convert columns to a set and add any index columns
        # specified in the pandas metadata (just in case)
        column_set = None if columns is None else set(columns)
        if column_set is not None:
            schema = md.schema.to_arrow_schema()
            has_pandas_metadata = (
                schema.metadata is not None and b"pandas" in schema.metadata
            )
            if has_pandas_metadata:
                md_index = [
                    ind
                    for ind in json.loads(
                        schema.metadata[b"pandas"].decode("utf8")
                    ).get("index_columns", [])
                    # Ignore RangeIndex information
                    if not isinstance(ind, dict)
                ]
                column_set |= set(md_index)
        # keep only the top-level field name of each requested column,
        # as a single-element list for prefix matching in _cmp
        if column_set is not None:
            column_set = [
                _[:1] if isinstance(_, list) else _.split(".")[:1] for _ in column_set
            ]

        # Loop through column chunks to add required byte ranges
        for r in range(md.num_row_groups):
            # Skip this row-group if we are targeting
            # specific row-groups
            if row_groups is None or r in row_groups:
                row_group = md.row_group(r)
                for c in range(row_group.num_columns):
                    column = row_group.column(c)
                    name = column.path_in_schema.split(".")
                    # Skip this column if we are targeting specific columns
                    if column_set is None or _cmp(name, column_set):
                        meta = column.to_dict()
                        # Any offset could be the first one
                        file_offset0 = min(
                            _
                            for _ in [
                                meta.get("dictionary_page_offset"),
                                meta.get("data_page_offset"),
                                meta.get("index_page_offset"),
                            ]
                            if _ is not None
                        )
                        # ranges at/after footer_start are already in memory
                        if file_offset0 < footer_start:
                            data_starts.append(file_offset0)
                            data_ends.append(
                                min(
                                    meta["total_compressed_size"] + file_offset0,
                                    footer_start,
                                )
                            )

        # always include the footer block itself
        data_starts.append(footer_start)
        data_ends.append(footer_start + len(footer))
        return data_starts, data_ends
569
+
570
+
571
+ def _cmp(name, column_set):
572
+ return any(all(a == b for a, b in zip(name, _)) for _ in column_set)
venv/lib/python3.10/site-packages/fsspec/registry.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import types
5
+ import warnings
6
+
7
__all__ = ["registry", "get_filesystem_class", "default"]

# internal, mutable mapping of protocol name -> filesystem class;
# modified only through register_implementation()
_registry: dict[str, type] = {}

# external, immutable read-only view of _registry for public consumption
registry = types.MappingProxyType(_registry)
# protocol assumed when a URL carries no explicit protocol
default = "file"
15
+
16
+
17
def register_implementation(name, cls, clobber=False, errtxt=None):
    """Add implementation class to the registry

    Parameters
    ----------
    name: str
        Protocol name to associate with the class
    cls: class or str
        if a class: fsspec-compliant implementation class (normally inherits from
        ``fsspec.AbstractFileSystem``, gets added straight to the registry. If a
        str, the full path to an implementation class like package.module.class,
        which gets added to known_implementations,
        so the import is deferred until the filesystem is actually used.
    clobber: bool (optional)
        Whether to overwrite a protocol with the same name; if False, will raise
        instead.
    errtxt: str (optional)
        If given, then a failure to import the given class will result in this
        text being given.
    """
    if isinstance(cls, str):
        # deferred-import registration: record the dotted path only
        if name in known_implementations and clobber is False:
            # identical re-registration is a silent no-op
            if cls != known_implementations[name]["class"]:
                raise ValueError(
                    f"Name ({name}) already in the known_implementations and clobber "
                    f"is False"
                )
            return
        known_implementations[name] = {
            "class": cls,
            "err": errtxt or f"{cls} import failed for protocol {name}",
        }
        return

    # direct registration of an implementation class
    if name in registry and clobber is False:
        # identical re-registration is a silent no-op
        if _registry[name] is not cls:
            raise ValueError(
                f"Name ({name}) already in the registry and clobber is False"
            )
        return
    _registry[name] = cls
58
+
59
+
60
+ # protocols mapped to the class which implements them. This dict can be
61
+ # updated with register_implementation
62
+ known_implementations = {
63
+ "abfs": {
64
+ "class": "adlfs.AzureBlobFileSystem",
65
+ "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
66
+ },
67
+ "adl": {
68
+ "class": "adlfs.AzureDatalakeFileSystem",
69
+ "err": "Install adlfs to access Azure Datalake Gen1",
70
+ },
71
+ "arrow_hdfs": {
72
+ "class": "fsspec.implementations.arrow.HadoopFileSystem",
73
+ "err": "pyarrow and local java libraries required for HDFS",
74
+ },
75
+ "async_wrapper": {
76
+ "class": "fsspec.implementations.asyn_wrapper.AsyncFileSystemWrapper",
77
+ },
78
+ "asynclocal": {
79
+ "class": "morefs.asyn_local.AsyncLocalFileSystem",
80
+ "err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem",
81
+ },
82
+ "asyncwrapper": {
83
+ "class": "fsspec.implementations.asyn_wrapper.AsyncFileSystemWrapper",
84
+ },
85
+ "az": {
86
+ "class": "adlfs.AzureBlobFileSystem",
87
+ "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
88
+ },
89
+ "blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"},
90
+ "box": {
91
+ "class": "boxfs.BoxFileSystem",
92
+ "err": "Please install boxfs to access BoxFileSystem",
93
+ },
94
+ "cached": {"class": "fsspec.implementations.cached.CachingFileSystem"},
95
+ "dask": {
96
+ "class": "fsspec.implementations.dask.DaskWorkerFileSystem",
97
+ "err": "Install dask distributed to access worker file system",
98
+ },
99
+ "data": {"class": "fsspec.implementations.data.DataFileSystem"},
100
+ "dbfs": {
101
+ "class": "fsspec.implementations.dbfs.DatabricksFileSystem",
102
+ "err": "Install the requests package to use the DatabricksFileSystem",
103
+ },
104
+ "dir": {"class": "fsspec.implementations.dirfs.DirFileSystem"},
105
+ "dropbox": {
106
+ "class": "dropboxdrivefs.DropboxDriveFileSystem",
107
+ "err": (
108
+ 'DropboxFileSystem requires "dropboxdrivefs","requests" and "'
109
+ '"dropbox" to be installed'
110
+ ),
111
+ },
112
+ "dvc": {
113
+ "class": "dvc.api.DVCFileSystem",
114
+ "err": "Install dvc to access DVCFileSystem",
115
+ },
116
+ "file": {"class": "fsspec.implementations.local.LocalFileSystem"},
117
+ "filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"},
118
+ "ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"},
119
+ "gcs": {
120
+ "class": "gcsfs.GCSFileSystem",
121
+ "err": "Please install gcsfs to access Google Storage",
122
+ },
123
+ "gdrive": {
124
+ "class": "gdrive_fsspec.GoogleDriveFileSystem",
125
+ "err": "Please install gdrive_fs for access to Google Drive",
126
+ },
127
+ "generic": {"class": "fsspec.generic.GenericFileSystem"},
128
+ "gist": {
129
+ "class": "fsspec.implementations.gist.GistFileSystem",
130
+ "err": "Install the requests package to use the gist FS",
131
+ },
132
+ "git": {
133
+ "class": "fsspec.implementations.git.GitFileSystem",
134
+ "err": "Install pygit2 to browse local git repos",
135
+ },
136
+ "github": {
137
+ "class": "fsspec.implementations.github.GithubFileSystem",
138
+ "err": "Install the requests package to use the github FS",
139
+ },
140
+ "gs": {
141
+ "class": "gcsfs.GCSFileSystem",
142
+ "err": "Please install gcsfs to access Google Storage",
143
+ },
144
+ "hdfs": {
145
+ "class": "fsspec.implementations.arrow.HadoopFileSystem",
146
+ "err": "pyarrow and local java libraries required for HDFS",
147
+ },
148
+ "hf": {
149
+ "class": "huggingface_hub.HfFileSystem",
150
+ "err": "Install huggingface_hub to access HfFileSystem",
151
+ },
152
+ "http": {
153
+ "class": "fsspec.implementations.http.HTTPFileSystem",
154
+ "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
155
+ },
156
+ "https": {
157
+ "class": "fsspec.implementations.http.HTTPFileSystem",
158
+ "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
159
+ },
160
+ "jlab": {
161
+ "class": "fsspec.implementations.jupyter.JupyterFileSystem",
162
+ "err": "Jupyter FS requires requests to be installed",
163
+ },
164
+ "jupyter": {
165
+ "class": "fsspec.implementations.jupyter.JupyterFileSystem",
166
+ "err": "Jupyter FS requires requests to be installed",
167
+ },
168
+ "lakefs": {
169
+ "class": "lakefs_spec.LakeFSFileSystem",
170
+ "err": "Please install lakefs-spec to access LakeFSFileSystem",
171
+ },
172
+ "libarchive": {
173
+ "class": "fsspec.implementations.libarchive.LibArchiveFileSystem",
174
+ "err": "LibArchive requires to be installed",
175
+ },
176
+ "local": {"class": "fsspec.implementations.local.LocalFileSystem"},
177
+ "memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"},
178
+ "oci": {
179
+ "class": "ocifs.OCIFileSystem",
180
+ "err": "Install ocifs to access OCI Object Storage",
181
+ },
182
+ "ocilake": {
183
+ "class": "ocifs.OCIFileSystem",
184
+ "err": "Install ocifs to access OCI Data Lake",
185
+ },
186
+ "oss": {
187
+ "class": "ossfs.OSSFileSystem",
188
+ "err": "Install ossfs to access Alibaba Object Storage System",
189
+ },
190
+ "pyscript": {
191
+ "class": "pyscript_fsspec_client.client.PyscriptFileSystem",
192
+ "err": "This only runs in a pyscript context",
193
+ },
194
+ "reference": {"class": "fsspec.implementations.reference.ReferenceFileSystem"},
195
+ "root": {
196
+ "class": "fsspec_xrootd.XRootDFileSystem",
197
+ "err": (
198
+ "Install fsspec-xrootd to access xrootd storage system. "
199
+ "Note: 'root' is the protocol name for xrootd storage systems, "
200
+ "not referring to root directories"
201
+ ),
202
+ },
203
+ "s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
204
+ "s3a": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
205
+ "sftp": {
206
+ "class": "fsspec.implementations.sftp.SFTPFileSystem",
207
+ "err": 'SFTPFileSystem requires "paramiko" to be installed',
208
+ },
209
+ "simplecache": {"class": "fsspec.implementations.cached.SimpleCacheFileSystem"},
210
+ "smb": {
211
+ "class": "fsspec.implementations.smb.SMBFileSystem",
212
+ "err": 'SMB requires "smbprotocol" or "smbprotocol[kerberos]" installed',
213
+ },
214
+ "ssh": {
215
+ "class": "fsspec.implementations.sftp.SFTPFileSystem",
216
+ "err": 'SFTPFileSystem requires "paramiko" to be installed',
217
+ },
218
+ "tar": {"class": "fsspec.implementations.tar.TarFileSystem"},
219
+ "tos": {
220
+ "class": "tosfs.TosFileSystem",
221
+ "err": "Install tosfs to access ByteDance volcano engine Tinder Object Storage",
222
+ },
223
+ "tosfs": {
224
+ "class": "tosfs.TosFileSystem",
225
+ "err": "Install tosfs to access ByteDance volcano engine Tinder Object Storage",
226
+ },
227
+ "wandb": {"class": "wandbfs.WandbFS", "err": "Install wandbfs to access wandb"},
228
+ "webdav": {
229
+ "class": "webdav4.fsspec.WebdavFileSystem",
230
+ "err": "Install webdav4 to access WebDAV",
231
+ },
232
+ "webhdfs": {
233
+ "class": "fsspec.implementations.webhdfs.WebHDFS",
234
+ "err": 'webHDFS access requires "requests" to be installed',
235
+ },
236
+ "zip": {"class": "fsspec.implementations.zip.ZipFileSystem"},
237
+ }
238
+
239
+ assert list(known_implementations) == sorted(known_implementations), (
240
+ "Not in alphabetical order"
241
+ )
242
+
243
+
244
+ def get_filesystem_class(protocol):
245
+ """Fetch named protocol implementation from the registry
246
+
247
+ The dict ``known_implementations`` maps protocol names to the locations
248
+ of classes implementing the corresponding file-system. When used for the
249
+ first time, appropriate imports will happen and the class will be placed in
250
+ the registry. All subsequent calls will fetch directly from the registry.
251
+
252
+ Some protocol implementations require additional dependencies, and so the
253
+ import may fail. In this case, the string in the "err" field of the
254
+ ``known_implementations`` will be given as the error message.
255
+ """
256
+ if not protocol:
257
+ protocol = default
258
+
259
+ if protocol not in registry:
260
+ if protocol not in known_implementations:
261
+ raise ValueError(f"Protocol not known: {protocol}")
262
+ bit = known_implementations[protocol]
263
+ try:
264
+ register_implementation(protocol, _import_class(bit["class"]))
265
+ except ImportError as e:
266
+ raise ImportError(bit.get("err")) from e
267
+ cls = registry[protocol]
268
+ if getattr(cls, "protocol", None) in ("abstract", None):
269
+ cls.protocol = protocol
270
+
271
+ return cls
272
+
273
+
274
+ s3_msg = """Your installed version of s3fs is very old and known to cause
275
+ severe performance issues, see also https://github.com/dask/dask/issues/10276
276
+
277
+ To fix, you should specify a lower version bound on s3fs, or
278
+ update the current installation.
279
+ """
280
+
281
+
282
+ def _import_class(fqp: str):
283
+ """Take a fully-qualified path and return the imported class or identifier.
284
+
285
+ ``fqp`` is of the form "package.module.klass" or
286
+ "package.module:subobject.klass".
287
+
288
+ Warnings
289
+ --------
290
+ This can import arbitrary modules. Make sure you haven't installed any modules
291
+ that may execute malicious code at import time.
292
+ """
293
+ if ":" in fqp:
294
+ mod, name = fqp.rsplit(":", 1)
295
+ else:
296
+ mod, name = fqp.rsplit(".", 1)
297
+
298
+ is_s3 = mod == "s3fs"
299
+ mod = importlib.import_module(mod)
300
+ if is_s3 and mod.__version__.split(".") < ["0", "5"]:
301
+ warnings.warn(s3_msg)
302
+ for part in name.split("."):
303
+ mod = getattr(mod, part)
304
+
305
+ if not isinstance(mod, type):
306
+ raise TypeError(f"{fqp} is not a class")
307
+
308
+ return mod
309
+
310
+
311
+ def filesystem(protocol, **storage_options):
312
+ """Instantiate filesystems for given protocol and arguments
313
+
314
+ ``storage_options`` are specific to the protocol being chosen, and are
315
+ passed directly to the class.
316
+ """
317
+ if protocol == "arrow_hdfs":
318
+ warnings.warn(
319
+ "The 'arrow_hdfs' protocol has been deprecated and will be "
320
+ "removed in the future. Specify it as 'hdfs'.",
321
+ DeprecationWarning,
322
+ )
323
+
324
+ cls = get_filesystem_class(protocol)
325
+ return cls(**storage_options)
326
+
327
+
328
+ def available_protocols():
329
+ """Return a list of the implemented protocols.
330
+
331
+ Note that any given protocol may require extra packages to be importable.
332
+ """
333
+ return list(known_implementations)
venv/lib/python3.10/site-packages/fsspec/spec.py ADDED
@@ -0,0 +1,2281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import json
5
+ import logging
6
+ import os
7
+ import threading
8
+ import warnings
9
+ import weakref
10
+ from errno import ESPIPE
11
+ from glob import has_magic
12
+ from hashlib import sha256
13
+ from typing import Any, ClassVar
14
+
15
+ from .callbacks import DEFAULT_CALLBACK
16
+ from .config import apply_config, conf
17
+ from .dircache import DirCache
18
+ from .transaction import Transaction
19
+ from .utils import (
20
+ _unstrip_protocol,
21
+ glob_translate,
22
+ isfilelike,
23
+ other_paths,
24
+ read_block,
25
+ stringify_path,
26
+ tokenize,
27
+ )
28
+
29
+ logger = logging.getLogger("fsspec")
30
+
31
+
32
+ def make_instance(cls, args, kwargs):
33
+ return cls(*args, **kwargs)
34
+
35
+
36
+ class _Cached(type):
37
+ """
38
+ Metaclass for caching file system instances.
39
+
40
+ Notes
41
+ -----
42
+ Instances are cached according to
43
+
44
+ * The values of the class attributes listed in `_extra_tokenize_attributes`
45
+ * The arguments passed to ``__init__``.
46
+
47
+ This creates an additional reference to the filesystem, which prevents the
48
+ filesystem from being garbage collected when all *user* references go away.
49
+ A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also*
50
+ be made for a filesystem instance to be garbage collected.
51
+ """
52
+
53
+ def __init__(cls, *args, **kwargs):
54
+ super().__init__(*args, **kwargs)
55
+ # Note: we intentionally create a reference here, to avoid garbage
56
+ # collecting instances when all other references are gone. To really
57
+ # delete a FileSystem, the cache must be cleared.
58
+ if conf.get("weakref_instance_cache"): # pragma: no cover
59
+ # debug option for analysing fork/spawn conditions
60
+ cls._cache = weakref.WeakValueDictionary()
61
+ else:
62
+ cls._cache = {}
63
+ cls._pid = os.getpid()
64
+
65
+ def __call__(cls, *args, **kwargs):
66
+ kwargs = apply_config(cls, kwargs)
67
+ extra_tokens = tuple(
68
+ getattr(cls, attr, None) for attr in cls._extra_tokenize_attributes
69
+ )
70
+ strip_tokenize_options = {
71
+ k: kwargs.pop(k) for k in cls._strip_tokenize_options if k in kwargs
72
+ }
73
+ token = tokenize(
74
+ cls, cls._pid, threading.get_ident(), *args, *extra_tokens, **kwargs
75
+ )
76
+ skip = kwargs.pop("skip_instance_cache", False)
77
+ if os.getpid() != cls._pid:
78
+ cls._cache.clear()
79
+ cls._pid = os.getpid()
80
+ if not skip and cls.cachable and token in cls._cache:
81
+ cls._latest = token
82
+ return cls._cache[token]
83
+ else:
84
+ obj = super().__call__(*args, **kwargs, **strip_tokenize_options)
85
+ # Setting _fs_token here causes some static linters to complain.
86
+ obj._fs_token_ = token
87
+ obj.storage_args = args
88
+ obj.storage_options = kwargs
89
+ if obj.async_impl and obj.mirror_sync_methods:
90
+ from .asyn import mirror_sync_methods
91
+
92
+ mirror_sync_methods(obj)
93
+
94
+ if cls.cachable and not skip:
95
+ cls._latest = token
96
+ cls._cache[token] = obj
97
+ return obj
98
+
99
+
100
+ class AbstractFileSystem(metaclass=_Cached):
101
+ """
102
+ An abstract super-class for pythonic file-systems
103
+
104
+ Implementations are expected to be compatible with or, better, subclass
105
+ from here.
106
+ """
107
+
108
+ cachable = True # this class can be cached, instances reused
109
+ _cached = False
110
+ blocksize = 2**22
111
+ sep = "/"
112
+ protocol: ClassVar[str | tuple[str, ...]] = "abstract"
113
+ _latest = None
114
+ async_impl = False
115
+ mirror_sync_methods = False
116
+ root_marker = "" # For some FSs, may require leading '/' or other character
117
+ transaction_type = Transaction
118
+
119
+ #: Extra *class attributes* that should be considered when hashing.
120
+ _extra_tokenize_attributes = ()
121
+ #: *storage options* that should not be considered when hashing.
122
+ _strip_tokenize_options = ()
123
+
124
+ # Set by _Cached metaclass
125
+ storage_args: tuple[Any, ...]
126
+ storage_options: dict[str, Any]
127
+
128
+ def __init__(self, *args, **storage_options):
129
+ """Create and configure file-system instance
130
+
131
+ Instances may be cachable, so if similar enough arguments are seen
132
+ a new instance is not required. The token attribute exists to allow
133
+ implementations to cache instances if they wish.
134
+
135
+ A reasonable default should be provided if there are no arguments.
136
+
137
+ Subclasses should call this method.
138
+
139
+ Parameters
140
+ ----------
141
+ use_listings_cache, listings_expiry_time, max_paths:
142
+ passed to ``DirCache``, if the implementation supports
143
+ directory listing caching. Pass use_listings_cache=False
144
+ to disable such caching.
145
+ skip_instance_cache: bool
146
+ If this is a cachable implementation, pass True here to force
147
+ creating a new instance even if a matching instance exists, and prevent
148
+ storing this instance.
149
+ asynchronous: bool
150
+ loop: asyncio-compatible IOLoop or None
151
+ """
152
+ if self._cached:
153
+ # reusing instance, don't change
154
+ return
155
+ self._cached = True
156
+ self._intrans = False
157
+ self._transaction = None
158
+ self._invalidated_caches_in_transaction = []
159
+ self.dircache = DirCache(**storage_options)
160
+
161
+ if storage_options.pop("add_docs", None):
162
+ warnings.warn("add_docs is no longer supported.", FutureWarning)
163
+
164
+ if storage_options.pop("add_aliases", None):
165
+ warnings.warn("add_aliases has been removed.", FutureWarning)
166
+ # This is set in _Cached
167
+ self._fs_token_ = None
168
+
169
+ @property
170
+ def fsid(self):
171
+ """Persistent filesystem id that can be used to compare filesystems
172
+ across sessions.
173
+ """
174
+ raise NotImplementedError
175
+
176
+ @property
177
+ def _fs_token(self):
178
+ return self._fs_token_
179
+
180
+ def __dask_tokenize__(self):
181
+ return self._fs_token
182
+
183
+ def __hash__(self):
184
+ return int(self._fs_token, 16)
185
+
186
+ def __eq__(self, other):
187
+ return isinstance(other, type(self)) and self._fs_token == other._fs_token
188
+
189
+ def __reduce__(self):
190
+ return make_instance, (type(self), self.storage_args, self.storage_options)
191
+
192
+ @classmethod
193
+ def _strip_protocol(cls, path):
194
+ """Turn path from fully-qualified to file-system-specific
195
+
196
+ May require FS-specific handling, e.g., for relative paths or links.
197
+ """
198
+ if isinstance(path, list):
199
+ return [cls._strip_protocol(p) for p in path]
200
+ path = stringify_path(path)
201
+ protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol
202
+ for protocol in protos:
203
+ if path.startswith(protocol + "://"):
204
+ path = path[len(protocol) + 3 :]
205
+ elif path.startswith(protocol + "::"):
206
+ path = path[len(protocol) + 2 :]
207
+ path = path.rstrip("/")
208
+ # use of root_marker to make minimum required path, e.g., "/"
209
+ return path or cls.root_marker
210
+
211
+ def unstrip_protocol(self, name: str) -> str:
212
+ """Format FS-specific path to generic, including protocol"""
213
+ protos = (self.protocol,) if isinstance(self.protocol, str) else self.protocol
214
+ for protocol in protos:
215
+ if name.startswith(f"{protocol}://"):
216
+ return name
217
+ return f"{protos[0]}://{name}"
218
+
219
+ @staticmethod
220
+ def _get_kwargs_from_urls(path):
221
+ """If kwargs can be encoded in the paths, extract them here
222
+
223
+ This should happen before instantiation of the class; incoming paths
224
+ then should be amended to strip the options in methods.
225
+
226
+ Examples may look like an sftp path "sftp://user@host:/my/path", where
227
+ the user and host should become kwargs and later get stripped.
228
+ """
229
+ # by default, nothing happens
230
+ return {}
231
+
232
+ @classmethod
233
+ def current(cls):
234
+ """Return the most recently instantiated FileSystem
235
+
236
+ If no instance has been created, then create one with defaults
237
+ """
238
+ if cls._latest in cls._cache:
239
+ return cls._cache[cls._latest]
240
+ return cls()
241
+
242
+ @property
243
+ def transaction(self):
244
+ """A context within which files are committed together upon exit
245
+
246
+ Requires the file class to implement `.commit()` and `.discard()`
247
+ for the normal and exception cases.
248
+ """
249
+ if self._transaction is None:
250
+ self._transaction = self.transaction_type(self)
251
+ return self._transaction
252
+
253
+ def start_transaction(self):
254
+ """Begin write transaction for deferring files, non-context version"""
255
+ self._intrans = True
256
+ self._transaction = self.transaction_type(self)
257
+ return self.transaction
258
+
259
+ def end_transaction(self):
260
+ """Finish write transaction, non-context version"""
261
+ self.transaction.complete()
262
+ self._transaction = None
263
+ # The invalid cache must be cleared after the transaction is completed.
264
+ for path in self._invalidated_caches_in_transaction:
265
+ self.invalidate_cache(path)
266
+ self._invalidated_caches_in_transaction.clear()
267
+
268
+ def invalidate_cache(self, path=None):
269
+ """
270
+ Discard any cached directory information
271
+
272
+ Parameters
273
+ ----------
274
+ path: string or None
275
+ If None, clear all listings cached else listings at or under given
276
+ path.
277
+ """
278
+ # Not necessary to implement invalidation mechanism, may have no cache.
279
+ # But if have, you should call this method of parent class from your
280
+ # subclass to ensure expiring caches after transacations correctly.
281
+ # See the implementation of FTPFileSystem in ftp.py
282
+ if self._intrans:
283
+ self._invalidated_caches_in_transaction.append(path)
284
+
285
+ def mkdir(self, path, create_parents=True, **kwargs):
286
+ """
287
+ Create directory entry at path
288
+
289
+ For systems that don't have true directories, may create an for
290
+ this instance only and not touch the real filesystem
291
+
292
+ Parameters
293
+ ----------
294
+ path: str
295
+ location
296
+ create_parents: bool
297
+ if True, this is equivalent to ``makedirs``
298
+ kwargs:
299
+ may be permissions, etc.
300
+ """
301
+ pass # not necessary to implement, may not have directories
302
+
303
+ def makedirs(self, path, exist_ok=False):
304
+ """Recursively make directories
305
+
306
+ Creates directory at path and any intervening required directories.
307
+ Raises exception if, for instance, the path already exists but is a
308
+ file.
309
+
310
+ Parameters
311
+ ----------
312
+ path: str
313
+ leaf directory name
314
+ exist_ok: bool (False)
315
+ If False, will error if the target already exists
316
+ """
317
+ pass # not necessary to implement, may not have directories
318
+
319
+ def rmdir(self, path):
320
+ """Remove a directory, if empty"""
321
+ pass # not necessary to implement, may not have directories
322
+
323
+ def ls(self, path, detail=True, **kwargs):
324
+ """List objects at path.
325
+
326
+ This should include subdirectories and files at that location. The
327
+ difference between a file and a directory must be clear when details
328
+ are requested.
329
+
330
+ The specific keys, or perhaps a FileInfo class, or similar, is TBD,
331
+ but must be consistent across implementations.
332
+ Must include:
333
+
334
+ - full path to the entry (without protocol)
335
+ - size of the entry, in bytes. If the value cannot be determined, will
336
+ be ``None``.
337
+ - type of entry, "file", "directory" or other
338
+
339
+ Additional information
340
+ may be present, appropriate to the file-system, e.g., generation,
341
+ checksum, etc.
342
+
343
+ May use refresh=True|False to allow use of self._ls_from_cache to
344
+ check for a saved listing and avoid calling the backend. This would be
345
+ common where listing may be expensive.
346
+
347
+ Parameters
348
+ ----------
349
+ path: str
350
+ detail: bool
351
+ if True, gives a list of dictionaries, where each is the same as
352
+ the result of ``info(path)``. If False, gives a list of paths
353
+ (str).
354
+ kwargs: may have additional backend-specific options, such as version
355
+ information
356
+
357
+ Returns
358
+ -------
359
+ List of strings if detail is False, or list of directory information
360
+ dicts if detail is True.
361
+ """
362
+ raise NotImplementedError
363
+
364
+ def _ls_from_cache(self, path):
365
+ """Check cache for listing
366
+
367
+ Returns listing, if found (may be empty list for a directly that exists
368
+ but contains nothing), None if not in cache.
369
+ """
370
+ parent = self._parent(path)
371
+ try:
372
+ return self.dircache[path.rstrip("/")]
373
+ except KeyError:
374
+ pass
375
+ try:
376
+ files = [
377
+ f
378
+ for f in self.dircache[parent]
379
+ if f["name"] == path
380
+ or (f["name"] == path.rstrip("/") and f["type"] == "directory")
381
+ ]
382
+ if len(files) == 0:
383
+ # parent dir was listed but did not contain this file
384
+ raise FileNotFoundError(path)
385
+ return files
386
+ except KeyError:
387
+ pass
388
+
389
+ def walk(self, path, maxdepth=None, topdown=True, on_error="omit", **kwargs):
390
+ """Return all files under the given path.
391
+
392
+ List all files, recursing into subdirectories; output is iterator-style,
393
+ like ``os.walk()``. For a simple list of files, ``find()`` is available.
394
+
395
+ When topdown is True, the caller can modify the dirnames list in-place (perhaps
396
+ using del or slice assignment), and walk() will
397
+ only recurse into the subdirectories whose names remain in dirnames;
398
+ this can be used to prune the search, impose a specific order of visiting,
399
+ or even to inform walk() about directories the caller creates or renames before
400
+ it resumes walk() again.
401
+ Modifying dirnames when topdown is False has no effect. (see os.walk)
402
+
403
+ Note that the "files" outputted will include anything that is not
404
+ a directory, such as links.
405
+
406
+ Parameters
407
+ ----------
408
+ path: str
409
+ Root to recurse into
410
+ maxdepth: int
411
+ Maximum recursion depth. None means limitless, but not recommended
412
+ on link-based file-systems.
413
+ topdown: bool (True)
414
+ Whether to walk the directory tree from the top downwards or from
415
+ the bottom upwards.
416
+ on_error: "omit", "raise", a callable
417
+ if omit (default), path with exception will simply be empty;
418
+ If raise, an underlying exception will be raised;
419
+ if callable, it will be called with a single OSError instance as argument
420
+ kwargs: passed to ``ls``
421
+ """
422
+ if maxdepth is not None and maxdepth < 1:
423
+ raise ValueError("maxdepth must be at least 1")
424
+
425
+ path = self._strip_protocol(path)
426
+ full_dirs = {}
427
+ dirs = {}
428
+ files = {}
429
+
430
+ detail = kwargs.pop("detail", False)
431
+ try:
432
+ listing = self.ls(path, detail=True, **kwargs)
433
+ except (FileNotFoundError, OSError) as e:
434
+ if on_error == "raise":
435
+ raise
436
+ if callable(on_error):
437
+ on_error(e)
438
+ return
439
+
440
+ for info in listing:
441
+ # each info name must be at least [path]/part , but here
442
+ # we check also for names like [path]/part/
443
+ pathname = info["name"].rstrip("/")
444
+ name = pathname.rsplit("/", 1)[-1]
445
+ if info["type"] == "directory" and pathname != path:
446
+ # do not include "self" path
447
+ full_dirs[name] = pathname
448
+ dirs[name] = info
449
+ elif pathname == path:
450
+ # file-like with same name as give path
451
+ files[""] = info
452
+ else:
453
+ files[name] = info
454
+
455
+ if not detail:
456
+ dirs = list(dirs)
457
+ files = list(files)
458
+
459
+ if topdown:
460
+ # Yield before recursion if walking top down
461
+ yield path, dirs, files
462
+
463
+ if maxdepth is not None:
464
+ maxdepth -= 1
465
+ if maxdepth < 1:
466
+ if not topdown:
467
+ yield path, dirs, files
468
+ return
469
+
470
+ for d in dirs:
471
+ yield from self.walk(
472
+ full_dirs[d],
473
+ maxdepth=maxdepth,
474
+ detail=detail,
475
+ topdown=topdown,
476
+ **kwargs,
477
+ )
478
+
479
+ if not topdown:
480
+ # Yield after recursion if walking bottom up
481
+ yield path, dirs, files
482
+
483
+ def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
484
+ """List all files below path.
485
+
486
+ Like posix ``find`` command without conditions
487
+
488
+ Parameters
489
+ ----------
490
+ path : str
491
+ maxdepth: int or None
492
+ If not None, the maximum number of levels to descend
493
+ withdirs: bool
494
+ Whether to include directory paths in the output. This is True
495
+ when used by glob, but users usually only want files.
496
+ kwargs are passed to ``ls``.
497
+ """
498
+ # TODO: allow equivalent of -name parameter
499
+ path = self._strip_protocol(path)
500
+ out = {}
501
+
502
+ # Add the root directory if withdirs is requested
503
+ # This is needed for posix glob compliance
504
+ if withdirs and path != "" and self.isdir(path):
505
+ out[path] = self.info(path)
506
+
507
+ for _, dirs, files in self.walk(path, maxdepth, detail=True, **kwargs):
508
+ if withdirs:
509
+ files.update(dirs)
510
+ out.update({info["name"]: info for name, info in files.items()})
511
+ if not out and self.isfile(path):
512
+ # walk works on directories, but find should also return [path]
513
+ # when path happens to be a file
514
+ out[path] = {}
515
+ names = sorted(out)
516
+ if not detail:
517
+ return names
518
+ else:
519
+ return {name: out[name] for name in names}
520
+
521
+ def du(self, path, total=True, maxdepth=None, withdirs=False, **kwargs):
522
+ """Space used by files and optionally directories within a path
523
+
524
+ Directory size does not include the size of its contents.
525
+
526
+ Parameters
527
+ ----------
528
+ path: str
529
+ total: bool
530
+ Whether to sum all the file sizes
531
+ maxdepth: int or None
532
+ Maximum number of directory levels to descend, None for unlimited.
533
+ withdirs: bool
534
+ Whether to include directory paths in the output.
535
+ kwargs: passed to ``find``
536
+
537
+ Returns
538
+ -------
539
+ Dict of {path: size} if total=False, or int otherwise, where numbers
540
+ refer to bytes used.
541
+ """
542
+ sizes = {}
543
+ if withdirs and self.isdir(path):
544
+ # Include top-level directory in output
545
+ info = self.info(path)
546
+ sizes[info["name"]] = info["size"]
547
+ for f in self.find(path, maxdepth=maxdepth, withdirs=withdirs, **kwargs):
548
+ info = self.info(f)
549
+ sizes[info["name"]] = info["size"]
550
+ if total:
551
+ return sum(sizes.values())
552
+ else:
553
+ return sizes
554
+
555
    def glob(self, path, maxdepth=None, **kwargs):
        """Find files by glob-matching.

        Pattern matching capabilities for finding files that match the given pattern.

        Parameters
        ----------
        path: str
            The glob pattern to match against
        maxdepth: int or None
            Maximum depth for ``'**'`` patterns. Applied on the first ``'**'`` found.
            Must be at least 1 if provided.
        kwargs:
            Additional arguments passed to ``find`` (e.g., detail=True)

        Returns
        -------
        List of matched paths, or dict of paths and their info if detail=True

        Notes
        -----
        Supported patterns:
        - '*': Matches any sequence of characters within a single directory level
        - ``'**'``: Matches any number of directory levels (must be an entire path component)
        - '?': Matches exactly one character
        - '[abc]': Matches any character in the set
        - '[a-z]': Matches any character in the range
        - '[!abc]': Matches any character NOT in the set

        Special behaviors:
        - If the path ends with '/', only folders are returned
        - Consecutive '*' characters are compressed into a single '*'
        - Empty brackets '[]' never match anything
        - Negated empty brackets '[!]' match any single character
        - Special characters in character classes are escaped properly

        Limitations:
        - ``'**'`` must be a complete path component (e.g., ``'a/**/b'``, not ``'a**b'``)
        - No brace expansion ('{a,b}.txt')
        - No extended glob patterns ('+(pattern)', '!(pattern)')
        """
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        import re

        seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
        ends_with_sep = path.endswith(seps)  # _strip_protocol strips trailing slash
        path = self._strip_protocol(path)
        # trailing "/" or "/**" means directory matches must be tested with a
        # trailing slash appended, so only folders can match
        append_slash_to_dirname = ends_with_sep or path.endswith(
            tuple(sep + "**" for sep in seps)
        )
        # position of the first magic character of each kind (len(path) if absent)
        idx_star = path.find("*") if path.find("*") >= 0 else len(path)
        idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
        idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

        min_idx = min(idx_star, idx_qmark, idx_brace)

        detail = kwargs.pop("detail", False)

        if not has_magic(path):
            # literal path: existence check is enough, no listing required
            if self.exists(path, **kwargs):
                if not detail:
                    return [path]
                else:
                    return {path: self.info(path, **kwargs)}
            else:
                if not detail:
                    return []  # glob of non-existent returns empty
                else:
                    return {}
        elif "/" in path[:min_idx]:
            # everything before the first magic char is a fixed directory prefix;
            # list only from there and remember how deep the pattern reaches
            min_idx = path[:min_idx].rindex("/")
            root = path[: min_idx + 1]
            depth = path[min_idx + 1 :].count("/") + 1
        else:
            root = ""
            depth = path[min_idx + 1 :].count("/") + 1

        if "**" in path:
            if maxdepth is not None:
                # replace the "**" component's unbounded depth by maxdepth
                idx_double_stars = path.find("**")
                depth_double_stars = path[idx_double_stars:].count("/") + 1
                depth = depth - depth_double_stars + maxdepth
            else:
                depth = None  # unbounded recursion for "**"

        allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)

        pattern = glob_translate(path + ("/" if ends_with_sep else ""))
        pattern = re.compile(pattern)

        out = {
            p: info
            for p, info in sorted(allpaths.items())
            if pattern.match(
                p + "/"
                if append_slash_to_dirname and info["type"] == "directory"
                else p
            )
        }

        if detail:
            return out
        else:
            return list(out)
+ def exists(self, path, **kwargs):
663
+ """Is there a file at the given path"""
664
+ try:
665
+ self.info(path, **kwargs)
666
+ return True
667
+ except: # noqa: E722
668
+ # any exception allowed bar FileNotFoundError?
669
+ return False
670
+
671
+ def lexists(self, path, **kwargs):
672
+ """If there is a file at the given path (including
673
+ broken links)"""
674
+ return self.exists(path)
675
+
676
    def info(self, path, **kwargs):
        """Give details of entry at path

        Returns a single dictionary, with exactly the same information as ``ls``
        would with ``detail=True``.

        The default implementation calls ls and could be overridden by a
        shortcut. kwargs are passed on to ```ls()``.

        Some file systems might not be able to measure the file's size, in
        which case, the returned dict will include ``'size': None``.

        Returns
        -------
        dict with keys: name (full path in the FS), size (in bytes), type (file,
        directory, or something else) and other FS-specific keys.
        """
        path = self._strip_protocol(path)
        # First attempt: list the parent and look for this entry among its
        # children (works for both files and directories).
        out = self.ls(self._parent(path), detail=True, **kwargs)
        out = [o for o in out if o["name"].rstrip("/") == path]
        if out:
            return out[0]
        # Second attempt: list the path itself; a directory listing may contain
        # the directory entry, its children, or be empty.
        out = self.ls(path, detail=True, **kwargs)
        path = path.rstrip("/")
        out1 = [o for o in out if o["name"].rstrip("/") == path]
        if len(out1) == 1:
            # exactly one self-entry: normalise missing size to None
            if "size" not in out1[0]:
                out1[0]["size"] = None
            return out1[0]
        elif len(out1) > 1 or out:
            # listing succeeded with children (or duplicates) -> it's a directory
            return {"name": path, "size": 0, "type": "directory"}
        else:
            raise FileNotFoundError(path)
+ def checksum(self, path):
711
+ """Unique value for current version of file
712
+
713
+ If the checksum is the same from one moment to another, the contents
714
+ are guaranteed to be the same. If the checksum changes, the contents
715
+ *might* have changed.
716
+
717
+ This should normally be overridden; default will probably capture
718
+ creation/modification timestamp (which would be good) or maybe
719
+ access timestamp (which would be bad)
720
+ """
721
+ return int(tokenize(self.info(path)), 16)
722
+
723
+ def size(self, path):
724
+ """Size in bytes of file"""
725
+ return self.info(path).get("size", None)
726
+
727
+ def sizes(self, paths):
728
+ """Size in bytes of each file in a list of paths"""
729
+ return [self.size(p) for p in paths]
730
+
731
+ def isdir(self, path):
732
+ """Is this entry directory-like?"""
733
+ try:
734
+ return self.info(path)["type"] == "directory"
735
+ except OSError:
736
+ return False
737
+
738
+ def isfile(self, path):
739
+ """Is this entry file-like?"""
740
+ try:
741
+ return self.info(path)["type"] == "file"
742
+ except: # noqa: E722
743
+ return False
744
+
745
+ def read_text(self, path, encoding=None, errors=None, newline=None, **kwargs):
746
+ """Get the contents of the file as a string.
747
+
748
+ Parameters
749
+ ----------
750
+ path: str
751
+ URL of file on this filesystems
752
+ encoding, errors, newline: same as `open`.
753
+ """
754
+ with self.open(
755
+ path,
756
+ mode="r",
757
+ encoding=encoding,
758
+ errors=errors,
759
+ newline=newline,
760
+ **kwargs,
761
+ ) as f:
762
+ return f.read()
763
+
764
+ def write_text(
765
+ self, path, value, encoding=None, errors=None, newline=None, **kwargs
766
+ ):
767
+ """Write the text to the given file.
768
+
769
+ An existing file will be overwritten.
770
+
771
+ Parameters
772
+ ----------
773
+ path: str
774
+ URL of file on this filesystems
775
+ value: str
776
+ Text to write.
777
+ encoding, errors, newline: same as `open`.
778
+ """
779
+ with self.open(
780
+ path,
781
+ mode="w",
782
+ encoding=encoding,
783
+ errors=errors,
784
+ newline=newline,
785
+ **kwargs,
786
+ ) as f:
787
+ return f.write(value)
788
+
789
    def cat_file(self, path, start=None, end=None, **kwargs):
        """Get the content of a file

        Parameters
        ----------
        path: URL of file on this filesystems
        start, end: int
            Bytes limits of the read. If negative, backwards from end,
            like usual python slices. Either can be None for start or
            end of file, respectively
        kwargs: passed to ``open()``.
        """
        # explicitly set buffering off?
        with self.open(path, "rb", **kwargs) as f:
            if start is not None:
                if start >= 0:
                    f.seek(start)
                else:
                    # negative start counts back from EOF; clamp at 0
                    # NOTE(review): relies on the file object exposing ``.size``
                    f.seek(max(0, f.size + start))
            if end is not None:
                if end < 0:
                    end = f.size + end
                # read from current position (possibly after seek) up to ``end``
                return f.read(end - f.tell())
            return f.read()
+ def pipe_file(self, path, value, mode="overwrite", **kwargs):
815
+ """Set the bytes of given file"""
816
+ if mode == "create" and self.exists(path):
817
+ # non-atomic but simple way; or could use "xb" in open(), which is likely
818
+ # not as well supported
819
+ raise FileExistsError
820
+ with self.open(path, "wb", **kwargs) as f:
821
+ f.write(value)
822
+
823
+ def pipe(self, path, value=None, **kwargs):
824
+ """Put value into path
825
+
826
+ (counterpart to ``cat``)
827
+
828
+ Parameters
829
+ ----------
830
+ path: string or dict(str, bytes)
831
+ If a string, a single remote location to put ``value`` bytes; if a dict,
832
+ a mapping of {path: bytesvalue}.
833
+ value: bytes, optional
834
+ If using a single path, these are the bytes to put there. Ignored if
835
+ ``path`` is a dict
836
+ """
837
+ if isinstance(path, str):
838
+ self.pipe_file(self._strip_protocol(path), value, **kwargs)
839
+ elif isinstance(path, dict):
840
+ for k, v in path.items():
841
+ self.pipe_file(self._strip_protocol(k), v, **kwargs)
842
+ else:
843
+ raise ValueError("path must be str or dict")
844
+
845
+ def cat_ranges(
846
+ self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
847
+ ):
848
+ """Get the contents of byte ranges from one or more files
849
+
850
+ Parameters
851
+ ----------
852
+ paths: list
853
+ A list of of filepaths on this filesystems
854
+ starts, ends: int or list
855
+ Bytes limits of the read. If using a single int, the same value will be
856
+ used to read all the specified files.
857
+ """
858
+ if max_gap is not None:
859
+ raise NotImplementedError
860
+ if not isinstance(paths, list):
861
+ raise TypeError
862
+ if not isinstance(starts, list):
863
+ starts = [starts] * len(paths)
864
+ if not isinstance(ends, list):
865
+ ends = [ends] * len(paths)
866
+ if len(starts) != len(paths) or len(ends) != len(paths):
867
+ raise ValueError
868
+ out = []
869
+ for p, s, e in zip(paths, starts, ends):
870
+ try:
871
+ out.append(self.cat_file(p, s, e))
872
+ except Exception as e:
873
+ if on_error == "return":
874
+ out.append(e)
875
+ else:
876
+ raise
877
+ return out
878
+
879
+ def cat(self, path, recursive=False, on_error="raise", **kwargs):
880
+ """Fetch (potentially multiple) paths' contents
881
+
882
+ Parameters
883
+ ----------
884
+ recursive: bool
885
+ If True, assume the path(s) are directories, and get all the
886
+ contained files
887
+ on_error : "raise", "omit", "return"
888
+ If raise, an underlying exception will be raised (converted to KeyError
889
+ if the type is in self.missing_exceptions); if omit, keys with exception
890
+ will simply not be included in the output; if "return", all keys are
891
+ included in the output, but the value will be bytes or an exception
892
+ instance.
893
+ kwargs: passed to cat_file
894
+
895
+ Returns
896
+ -------
897
+ dict of {path: contents} if there are multiple paths
898
+ or the path has been otherwise expanded
899
+ """
900
+ paths = self.expand_path(path, recursive=recursive, **kwargs)
901
+ if (
902
+ len(paths) > 1
903
+ or isinstance(path, list)
904
+ or paths[0] != self._strip_protocol(path)
905
+ ):
906
+ out = {}
907
+ for path in paths:
908
+ try:
909
+ out[path] = self.cat_file(path, **kwargs)
910
+ except Exception as e:
911
+ if on_error == "raise":
912
+ raise
913
+ if on_error == "return":
914
+ out[path] = e
915
+ return out
916
+ else:
917
+ return self.cat_file(paths[0], **kwargs)
918
+
919
    def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, outfile=None, **kwargs):
        """Copy single remote file to local

        ``lpath`` may be an already-open, writable file-like object, in which
        case it is written to directly and NOT closed here. Streams the remote
        file in ``self.blocksize`` chunks, reporting progress via ``callback``.
        """
        from .implementations.local import LocalFileSystem

        if isfilelike(lpath):
            # caller supplied an open file object: write into it directly
            outfile = lpath
        elif self.isdir(rpath):
            # remote "directory": just mirror it locally, nothing to stream
            os.makedirs(lpath, exist_ok=True)
            return None

        # ensure the local parent directory exists before opening for write
        fs = LocalFileSystem(auto_mkdir=True)
        fs.makedirs(fs._parent(lpath), exist_ok=True)

        with self.open(rpath, "rb", **kwargs) as f1:
            if outfile is None:
                outfile = open(lpath, "wb")

            try:
                callback.set_size(getattr(f1, "size", None))
                data = True
                while data:
                    data = f1.read(self.blocksize)
                    segment_len = outfile.write(data)
                    if segment_len is None:
                        # some file-likes return None from write(); assume all written
                        segment_len = len(data)
                    callback.relative_update(segment_len)
            finally:
                # only close files we opened ourselves, not caller-owned objects
                if not isfilelike(lpath):
                    outfile.close()
    def get(
        self,
        rpath,
        lpath,
        recursive=False,
        callback=DEFAULT_CALLBACK,
        maxdepth=None,
        **kwargs,
    ):
        """Copy file(s) to local.

        Copies a specific file or tree of files (if recursive=True). If lpath
        ends with a "/", it will be assumed to be a directory, and target files
        will go within. Can submit a list of paths, which may be glob-patterns
        and will be expanded.

        Calls get_file for each source.
        """
        if isinstance(lpath, list) and isinstance(rpath, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            rpaths = rpath
            lpaths = lpath
        else:
            from .implementations.local import (
                LocalFileSystem,
                make_path_posix,
                trailing_sep,
            )

            source_is_str = isinstance(rpath, str)
            rpaths = self.expand_path(
                rpath, recursive=recursive, maxdepth=maxdepth, **kwargs
            )
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                rpaths = [p for p in rpaths if not (trailing_sep(p) or self.isdir(p))]
                if not rpaths:
                    return

            if isinstance(lpath, str):
                # normalise Windows separators etc. for the local side
                lpath = make_path_posix(lpath)

            source_is_file = len(rpaths) == 1
            dest_is_dir = isinstance(lpath, str) and (
                trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
            )

            # whether the destination directory component already exists,
            # which decides how other_paths maps sources onto targets
            exists = source_is_str and (
                (has_magic(rpath) and source_is_file)
                or (not has_magic(rpath) and dest_is_dir and not trailing_sep(rpath))
            )
            lpaths = other_paths(
                rpaths,
                lpath,
                exists=exists,
                flatten=not source_is_str,
            )

        callback.set_size(len(lpaths))
        for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
            # one branched (child) callback per file transfer
            with callback.branched(rpath, lpath) as child:
                self.get_file(rpath, lpath, callback=child, **kwargs)
    def put_file(
        self, lpath, rpath, callback=DEFAULT_CALLBACK, mode="overwrite", **kwargs
    ):
        """Copy single file to remote

        ``mode="create"`` raises ``FileExistsError`` if the remote path already
        exists (checked up-front, not atomically). A local directory results in
        ``makedirs`` on the remote and no data transfer.
        """
        if mode == "create" and self.exists(rpath):
            raise FileExistsError
        if os.path.isdir(lpath):
            self.makedirs(rpath, exist_ok=True)
            return None

        with open(lpath, "rb") as f1:
            # seek to end to learn the file size, then rewind for reading
            size = f1.seek(0, 2)
            callback.set_size(size)
            f1.seek(0)

            self.mkdirs(self._parent(os.fspath(rpath)), exist_ok=True)
            with self.open(rpath, "wb", **kwargs) as f2:
                # stream in blocksize chunks until the whole file is copied
                while f1.tell() < size:
                    data = f1.read(self.blocksize)
                    segment_len = f2.write(data)
                    if segment_len is None:
                        # some file-likes return None from write(); assume all written
                        segment_len = len(data)
                    callback.relative_update(segment_len)
    def put(
        self,
        lpath,
        rpath,
        recursive=False,
        callback=DEFAULT_CALLBACK,
        maxdepth=None,
        **kwargs,
    ):
        """Copy file(s) from local.

        Copies a specific file or tree of files (if recursive=True). If rpath
        ends with a "/", it will be assumed to be a directory, and target files
        will go within.

        Calls put_file for each source.
        """
        if isinstance(lpath, list) and isinstance(rpath, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            rpaths = rpath
            lpaths = lpath
        else:
            from .implementations.local import (
                LocalFileSystem,
                make_path_posix,
                trailing_sep,
            )

            source_is_str = isinstance(lpath, str)
            if source_is_str:
                # normalise Windows separators etc. for the local side
                lpath = make_path_posix(lpath)
            fs = LocalFileSystem()
            # expansion (globs, directories) happens on the LOCAL filesystem
            lpaths = fs.expand_path(
                lpath, recursive=recursive, maxdepth=maxdepth, **kwargs
            )
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
                if not lpaths:
                    return

            source_is_file = len(lpaths) == 1
            dest_is_dir = isinstance(rpath, str) and (
                trailing_sep(rpath) or self.isdir(rpath)
            )

            rpath = (
                self._strip_protocol(rpath)
                if isinstance(rpath, str)
                else [self._strip_protocol(p) for p in rpath]
            )
            # whether the destination directory component already exists,
            # which decides how other_paths maps sources onto targets
            exists = source_is_str and (
                (has_magic(lpath) and source_is_file)
                or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
            )
            rpaths = other_paths(
                lpaths,
                rpath,
                exists=exists,
                flatten=not source_is_str,
            )

        callback.set_size(len(rpaths))
        for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
            # one branched (child) callback per file transfer
            with callback.branched(lpath, rpath) as child:
                self.put_file(lpath, rpath, callback=child, **kwargs)
+ def head(self, path, size=1024):
1106
+ """Get the first ``size`` bytes from file"""
1107
+ with self.open(path, "rb") as f:
1108
+ return f.read(size)
1109
+
1110
+ def tail(self, path, size=1024):
1111
+ """Get the last ``size`` bytes from file"""
1112
+ with self.open(path, "rb") as f:
1113
+ f.seek(max(-size, -f.size), 2)
1114
+ return f.read()
1115
+
1116
+ def cp_file(self, path1, path2, **kwargs):
1117
+ raise NotImplementedError
1118
+
1119
    def copy(
        self, path1, path2, recursive=False, maxdepth=None, on_error=None, **kwargs
    ):
        """Copy within two locations in the filesystem

        on_error : "raise", "ignore"
            If raise, any not-found exceptions will be raised; if ignore any
            not-found exceptions will cause the path to be skipped; defaults to
            raise unless recursive is true, where the default is ignore
        """
        if on_error is None and recursive:
            on_error = "ignore"
        elif on_error is None:
            on_error = "raise"

        if isinstance(path1, list) and isinstance(path2, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            paths1 = path1
            paths2 = path2
        else:
            from .implementations.local import trailing_sep

            source_is_str = isinstance(path1, str)
            paths1 = self.expand_path(
                path1, recursive=recursive, maxdepth=maxdepth, **kwargs
            )
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                paths1 = [p for p in paths1 if not (trailing_sep(p) or self.isdir(p))]
                if not paths1:
                    return

            source_is_file = len(paths1) == 1
            dest_is_dir = isinstance(path2, str) and (
                trailing_sep(path2) or self.isdir(path2)
            )

            # whether the destination directory component already exists,
            # which decides how other_paths maps sources onto targets
            exists = source_is_str and (
                (has_magic(path1) and source_is_file)
                or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
            )
            paths2 = other_paths(
                paths1,
                path2,
                exists=exists,
                flatten=not source_is_str,
            )

        for p1, p2 in zip(paths1, paths2):
            try:
                self.cp_file(p1, p2, **kwargs)
            except FileNotFoundError:
                # a source vanished between expansion and copy
                if on_error == "raise":
                    raise
    def expand_path(self, path, recursive=False, maxdepth=None, **kwargs):
        """Turn one or more globs or directories into a list of all matching paths
        to files or directories.

        kwargs are passed to ``glob`` or ``find``, which may in turn call ``ls``
        """

        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        if isinstance(path, (str, os.PathLike)):
            # normalise the scalar case to the list case
            out = self.expand_path([path], recursive, maxdepth, **kwargs)
        else:
            out = set()
            path = [self._strip_protocol(p) for p in path]
            for p in path:
                if has_magic(p):
                    # glob pattern: expand one level of matches
                    bit = set(self.glob(p, maxdepth=maxdepth, **kwargs))
                    out |= bit
                    if recursive:
                        # glob call above expanded one depth so if maxdepth is defined
                        # then decrement it in expand_path call below. If it is zero
                        # after decrementing then avoid expand_path call.
                        if maxdepth is not None and maxdepth <= 1:
                            continue
                        out |= set(
                            self.expand_path(
                                list(bit),
                                recursive=recursive,
                                maxdepth=maxdepth - 1 if maxdepth is not None else None,
                                **kwargs,
                            )
                        )
                    continue
                elif recursive:
                    # plain path + recursive: include everything underneath it
                    rec = set(
                        self.find(
                            p, maxdepth=maxdepth, withdirs=True, detail=False, **kwargs
                        )
                    )
                    out |= rec
                if p not in out and (recursive is False or self.exists(p)):
                    # should only check once, for the root
                    out.add(p)
        if not out:
            raise FileNotFoundError(path)
        return sorted(out)
+ def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
1224
+ """Move file(s) from one location to another"""
1225
+ if path1 == path2:
1226
+ logger.debug("%s mv: The paths are the same, so no files were moved.", self)
1227
+ else:
1228
+ # explicitly raise exception to prevent data corruption
1229
+ self.copy(
1230
+ path1, path2, recursive=recursive, maxdepth=maxdepth, onerror="raise"
1231
+ )
1232
+ self.rm(path1, recursive=recursive)
1233
+
1234
+ def rm_file(self, path):
1235
+ """Delete a file"""
1236
+ self._rm(path)
1237
+
1238
+ def _rm(self, path):
1239
+ """Delete one file"""
1240
+ # this is the old name for the method, prefer rm_file
1241
+ raise NotImplementedError
1242
+
1243
+ def rm(self, path, recursive=False, maxdepth=None):
1244
+ """Delete files.
1245
+
1246
+ Parameters
1247
+ ----------
1248
+ path: str or list of str
1249
+ File(s) to delete.
1250
+ recursive: bool
1251
+ If file(s) are directories, recursively delete contents and then
1252
+ also remove the directory
1253
+ maxdepth: int or None
1254
+ Depth to pass to walk for finding files to delete, if recursive.
1255
+ If None, there will be no limit and infinite recursion may be
1256
+ possible.
1257
+ """
1258
+ path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
1259
+ for p in reversed(path):
1260
+ self.rm_file(p)
1261
+
1262
    @classmethod
    def _parent(cls, path):
        # Return the parent directory of ``path`` (protocol stripped),
        # always prefixed with the class's root marker.
        path = cls._strip_protocol(path)
        if "/" in path:
            # NOTE(review): lstrip removes any characters from root_marker's
            # character set, not the prefix string; fine for "/" but worth
            # confirming for multi-character root markers.
            parent = path.rsplit("/", 1)[0].lstrip(cls.root_marker)
            return cls.root_marker + parent
        else:
            # no separator at all: the path lives at the root
            return cls.root_marker
    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Return raw bytes-mode file-like from the file-system

        Default implementation builds an ``AbstractBufferedFile``; backends
        normally override this with their own file class. ``autocommit=False``
        is used by transactions so writes are deferred until commit.
        """
        return AbstractBufferedFile(
            self,
            path,
            mode,
            block_size,
            autocommit,
            cache_options=cache_options,
            **kwargs,
        )
    def open(
        self,
        path,
        mode="rb",
        block_size=None,
        cache_options=None,
        compression=None,
        **kwargs,
    ):
        """
        Return a file-like object from the filesystem

        The resultant instance must function correctly in a context ``with``
        block.

        Parameters
        ----------
        path: str
            Target file
        mode: str like 'rb', 'w'
            See builtin ``open()``
            Mode "x" (exclusive write) may be implemented by the backend. Even if
            it is, whether it is checked up front or on commit, and whether it is
            atomic is implementation-dependent.
        block_size: int
            Some indication of buffering - this is a value in bytes
        cache_options : dict, optional
            Extra arguments to pass through to the cache.
        compression: string or None
            If given, open file using compression codec. Can either be a compression
            name (a key in ``fsspec.compression.compr``) or "infer" to guess the
            compression from the filename suffix.
        encoding, errors, newline: passed on to TextIOWrapper for text mode
        """
        import io

        path = self._strip_protocol(path)
        if "b" not in mode:
            # text mode: open the binary equivalent recursively, then wrap it
            mode = mode.replace("t", "") + "b"

            text_kwargs = {
                k: kwargs.pop(k)
                for k in ["encoding", "errors", "newline"]
                if k in kwargs
            }
            return io.TextIOWrapper(
                self.open(
                    path,
                    mode,
                    block_size=block_size,
                    cache_options=cache_options,
                    compression=compression,
                    **kwargs,
                ),
                **text_kwargs,
            )
        else:
            # inside a transaction, files default to deferred (non-auto) commit
            ac = kwargs.pop("autocommit", not self._intrans)
            f = self._open(
                path,
                mode=mode,
                block_size=block_size,
                autocommit=ac,
                cache_options=cache_options,
                **kwargs,
            )
            if compression is not None:
                from fsspec.compression import compr
                from fsspec.core import get_compression

                # "infer" resolves the codec from the filename suffix
                compression = get_compression(path, compression)
                compress = compr[compression]
                f = compress(f, mode=mode[0])

            if not ac and "r" not in mode:
                # register deferred-write files with the active transaction
                self.transaction.files.append(f)
            return f
+ def touch(self, path, truncate=True, **kwargs):
1370
+ """Create empty file, or update timestamp
1371
+
1372
+ Parameters
1373
+ ----------
1374
+ path: str
1375
+ file location
1376
+ truncate: bool
1377
+ If True, always set file size to 0; if False, update timestamp and
1378
+ leave file unchanged, if backend allows this
1379
+ """
1380
+ if truncate or not self.exists(path):
1381
+ with self.open(path, "wb", **kwargs):
1382
+ pass
1383
+ else:
1384
+ raise NotImplementedError # update timestamp, if possible
1385
+
1386
+ def ukey(self, path):
1387
+ """Hash of file properties, to tell if it has changed"""
1388
+ return sha256(str(self.info(path)).encode()).hexdigest()
1389
+
1390
    def read_block(self, fn, offset, length, delimiter=None):
        """Read a block of bytes from

        Starting at ``offset`` of the file, read ``length`` bytes. If
        ``delimiter`` is set then we ensure that the read starts and stops at
        delimiter boundaries that follow the locations ``offset`` and ``offset
        + length``. If ``offset`` is zero then we start at zero. The
        bytestring returned WILL include the end delimiter string.

        If offset+length is beyond the eof, reads to eof.

        Parameters
        ----------
        fn: string
            Path to filename
        offset: int
            Byte offset to start read
        length: int
            Number of bytes to read. If None, read to end.
        delimiter: bytes (optional)
            Ensure reading starts and stops at delimiter bytestring

        Examples
        --------
        >>> fs.read_block('data/file.csv', 0, 13)  # doctest: +SKIP
        b'Alice, 100\\nBo'
        >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\n'

        Use ``length=None`` to read to the end of the file.
        >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\nCharlie, 300'

        See Also
        --------
        :func:`fsspec.utils.read_block`
        """
        with self.open(fn, "rb") as f:
            size = f.size
            if length is None:
                # read to EOF
                length = size
            if size is not None and offset + length > size:
                # clamp so we never request bytes past the end of file
                length = size - offset
            # delegate delimiter handling to the module-level helper
            return read_block(f, offset, length, delimiter)
+ def to_json(self, *, include_password: bool = True) -> str:
1436
+ """
1437
+ JSON representation of this filesystem instance.
1438
+
1439
+ Parameters
1440
+ ----------
1441
+ include_password: bool, default True
1442
+ Whether to include the password (if any) in the output.
1443
+
1444
+ Returns
1445
+ -------
1446
+ JSON string with keys ``cls`` (the python location of this class),
1447
+ protocol (text name of this class's protocol, first one in case of
1448
+ multiple), ``args`` (positional args, usually empty), and all other
1449
+ keyword arguments as their own keys.
1450
+
1451
+ Warnings
1452
+ --------
1453
+ Serialized filesystems may contain sensitive information which have been
1454
+ passed to the constructor, such as passwords and tokens. Make sure you
1455
+ store and send them in a secure environment!
1456
+ """
1457
+ from .json import FilesystemJSONEncoder
1458
+
1459
+ return json.dumps(
1460
+ self,
1461
+ cls=type(
1462
+ "_FilesystemJSONEncoder",
1463
+ (FilesystemJSONEncoder,),
1464
+ {"include_password": include_password},
1465
+ ),
1466
+ )
1467
+
1468
+ @staticmethod
1469
+ def from_json(blob: str) -> AbstractFileSystem:
1470
+ """
1471
+ Recreate a filesystem instance from JSON representation.
1472
+
1473
+ See ``.to_json()`` for the expected structure of the input.
1474
+
1475
+ Parameters
1476
+ ----------
1477
+ blob: str
1478
+
1479
+ Returns
1480
+ -------
1481
+ file system instance, not necessarily of this particular class.
1482
+
1483
+ Warnings
1484
+ --------
1485
+ This can import arbitrary modules (as determined by the ``cls`` key).
1486
+ Make sure you haven't installed any modules that may execute malicious code
1487
+ at import time.
1488
+ """
1489
+ from .json import FilesystemJSONDecoder
1490
+
1491
+ return json.loads(blob, cls=FilesystemJSONDecoder)
1492
+
1493
+ def to_dict(self, *, include_password: bool = True) -> dict[str, Any]:
1494
+ """
1495
+ JSON-serializable dictionary representation of this filesystem instance.
1496
+
1497
+ Parameters
1498
+ ----------
1499
+ include_password: bool, default True
1500
+ Whether to include the password (if any) in the output.
1501
+
1502
+ Returns
1503
+ -------
1504
+ Dictionary with keys ``cls`` (the python location of this class),
1505
+ protocol (text name of this class's protocol, first one in case of
1506
+ multiple), ``args`` (positional args, usually empty), and all other
1507
+ keyword arguments as their own keys.
1508
+
1509
+ Warnings
1510
+ --------
1511
+ Serialized filesystems may contain sensitive information which have been
1512
+ passed to the constructor, such as passwords and tokens. Make sure you
1513
+ store and send them in a secure environment!
1514
+ """
1515
+ from .json import FilesystemJSONEncoder
1516
+
1517
+ json_encoder = FilesystemJSONEncoder()
1518
+
1519
+ cls = type(self)
1520
+ proto = self.protocol
1521
+
1522
+ storage_options = dict(self.storage_options)
1523
+ if not include_password:
1524
+ storage_options.pop("password", None)
1525
+
1526
+ return dict(
1527
+ cls=f"{cls.__module__}:{cls.__name__}",
1528
+ protocol=proto[0] if isinstance(proto, (tuple, list)) else proto,
1529
+ args=json_encoder.make_serializable(self.storage_args),
1530
+ **json_encoder.make_serializable(storage_options),
1531
+ )
1532
+
1533
+ @staticmethod
1534
+ def from_dict(dct: dict[str, Any]) -> AbstractFileSystem:
1535
+ """
1536
+ Recreate a filesystem instance from dictionary representation.
1537
+
1538
+ See ``.to_dict()`` for the expected structure of the input.
1539
+
1540
+ Parameters
1541
+ ----------
1542
+ dct: Dict[str, Any]
1543
+
1544
+ Returns
1545
+ -------
1546
+ file system instance, not necessarily of this particular class.
1547
+
1548
+ Warnings
1549
+ --------
1550
+ This can import arbitrary modules (as determined by the ``cls`` key).
1551
+ Make sure you haven't installed any modules that may execute malicious code
1552
+ at import time.
1553
+ """
1554
+ from .json import FilesystemJSONDecoder
1555
+
1556
+ json_decoder = FilesystemJSONDecoder()
1557
+
1558
+ dct = dict(dct) # Defensive copy
1559
+
1560
+ cls = FilesystemJSONDecoder.try_resolve_fs_cls(dct)
1561
+ if cls is None:
1562
+ raise ValueError("Not a serialized AbstractFileSystem")
1563
+
1564
+ dct.pop("cls", None)
1565
+ dct.pop("protocol", None)
1566
+
1567
+ return cls(
1568
+ *json_decoder.unmake_serializable(dct.pop("args", ())),
1569
+ **json_decoder.unmake_serializable(dct),
1570
+ )
1571
+
1572
+ def _get_pyarrow_filesystem(self):
1573
+ """
1574
+ Make a version of the FS instance which will be acceptable to pyarrow
1575
+ """
1576
+ # all instances already also derive from pyarrow
1577
+ return self
1578
+
1579
+ def get_mapper(self, root="", check=False, create=False, missing_exceptions=None):
1580
+ """Create key/value store based on this file-system
1581
+
1582
+ Makes a MutableMapping interface to the FS at the given root path.
1583
+ See ``fsspec.mapping.FSMap`` for further details.
1584
+ """
1585
+ from .mapping import FSMap
1586
+
1587
+ return FSMap(
1588
+ root,
1589
+ self,
1590
+ check=check,
1591
+ create=create,
1592
+ missing_exceptions=missing_exceptions,
1593
+ )
1594
+
1595
+ @classmethod
1596
+ def clear_instance_cache(cls):
1597
+ """
1598
+ Clear the cache of filesystem instances.
1599
+
1600
+ Notes
1601
+ -----
1602
+ Unless overridden by setting the ``cachable`` class attribute to False,
1603
+ the filesystem class stores a reference to newly created instances. This
1604
+ prevents Python's normal rules around garbage collection from working,
1605
+ since the instances refcount will not drop to zero until
1606
+ ``clear_instance_cache`` is called.
1607
+ """
1608
+ cls._cache.clear()
1609
+
1610
+ def created(self, path):
1611
+ """Return the created timestamp of a file as a datetime.datetime"""
1612
+ raise NotImplementedError
1613
+
1614
+ def modified(self, path):
1615
+ """Return the modified timestamp of a file as a datetime.datetime"""
1616
+ raise NotImplementedError
1617
+
1618
+ def tree(
1619
+ self,
1620
+ path: str = "/",
1621
+ recursion_limit: int = 2,
1622
+ max_display: int = 25,
1623
+ display_size: bool = False,
1624
+ prefix: str = "",
1625
+ is_last: bool = True,
1626
+ first: bool = True,
1627
+ indent_size: int = 4,
1628
+ ) -> str:
1629
+ """
1630
+ Return a tree-like structure of the filesystem starting from the given path as a string.
1631
+
1632
+ Parameters
1633
+ ----------
1634
+ path: Root path to start traversal from
1635
+ recursion_limit: Maximum depth of directory traversal
1636
+ max_display: Maximum number of items to display per directory
1637
+ display_size: Whether to display file sizes
1638
+ prefix: Current line prefix for visual tree structure
1639
+ is_last: Whether current item is last in its level
1640
+ first: Whether this is the first call (displays root path)
1641
+ indent_size: Number of spaces by indent
1642
+
1643
+ Returns
1644
+ -------
1645
+ str: A string representing the tree structure.
1646
+
1647
+ Example
1648
+ -------
1649
+ >>> from fsspec import filesystem
1650
+
1651
+ >>> fs = filesystem('ftp', host='test.rebex.net', user='demo', password='password')
1652
+ >>> tree = fs.tree(display_size=True, recursion_limit=3, indent_size=8, max_display=10)
1653
+ >>> print(tree)
1654
+ """
1655
+
1656
+ def format_bytes(n: int) -> str:
1657
+ """Format bytes as text."""
1658
+ for prefix, k in (
1659
+ ("P", 2**50),
1660
+ ("T", 2**40),
1661
+ ("G", 2**30),
1662
+ ("M", 2**20),
1663
+ ("k", 2**10),
1664
+ ):
1665
+ if n >= 0.9 * k:
1666
+ return f"{n / k:.2f} {prefix}b"
1667
+ return f"{n}B"
1668
+
1669
+ result = []
1670
+
1671
+ if first:
1672
+ result.append(path)
1673
+
1674
+ if recursion_limit:
1675
+ indent = " " * indent_size
1676
+ contents = self.ls(path, detail=True)
1677
+ contents.sort(
1678
+ key=lambda x: (x.get("type") != "directory", x.get("name", ""))
1679
+ )
1680
+
1681
+ if max_display is not None and len(contents) > max_display:
1682
+ displayed_contents = contents[:max_display]
1683
+ remaining_count = len(contents) - max_display
1684
+ else:
1685
+ displayed_contents = contents
1686
+ remaining_count = 0
1687
+
1688
+ for i, item in enumerate(displayed_contents):
1689
+ is_last_item = (i == len(displayed_contents) - 1) and (
1690
+ remaining_count == 0
1691
+ )
1692
+
1693
+ branch = (
1694
+ "└" + ("─" * (indent_size - 2))
1695
+ if is_last_item
1696
+ else "├" + ("─" * (indent_size - 2))
1697
+ )
1698
+ branch += " "
1699
+ new_prefix = prefix + (
1700
+ indent if is_last_item else "│" + " " * (indent_size - 1)
1701
+ )
1702
+
1703
+ name = os.path.basename(item.get("name", ""))
1704
+
1705
+ if display_size and item.get("type") == "directory":
1706
+ sub_contents = self.ls(item.get("name", ""), detail=True)
1707
+ num_files = sum(
1708
+ 1 for sub_item in sub_contents if sub_item.get("type") == "file"
1709
+ )
1710
+ num_folders = sum(
1711
+ 1
1712
+ for sub_item in sub_contents
1713
+ if sub_item.get("type") == "directory"
1714
+ )
1715
+
1716
+ if num_files == 0 and num_folders == 0:
1717
+ size = " (empty folder)"
1718
+ elif num_files == 0:
1719
+ size = f" ({num_folders} subfolder{'s' if num_folders > 1 else ''})"
1720
+ elif num_folders == 0:
1721
+ size = f" ({num_files} file{'s' if num_files > 1 else ''})"
1722
+ else:
1723
+ size = f" ({num_files} file{'s' if num_files > 1 else ''}, {num_folders} subfolder{'s' if num_folders > 1 else ''})"
1724
+ elif display_size and item.get("type") == "file":
1725
+ size = f" ({format_bytes(item.get('size', 0))})"
1726
+ else:
1727
+ size = ""
1728
+
1729
+ result.append(f"{prefix}{branch}{name}{size}")
1730
+
1731
+ if item.get("type") == "directory" and recursion_limit > 0:
1732
+ result.append(
1733
+ self.tree(
1734
+ path=item.get("name", ""),
1735
+ recursion_limit=recursion_limit - 1,
1736
+ max_display=max_display,
1737
+ display_size=display_size,
1738
+ prefix=new_prefix,
1739
+ is_last=is_last_item,
1740
+ first=False,
1741
+ indent_size=indent_size,
1742
+ )
1743
+ )
1744
+
1745
+ if remaining_count > 0:
1746
+ more_message = f"{remaining_count} more item(s) not displayed."
1747
+ result.append(
1748
+ f"{prefix}{'└' + ('─' * (indent_size - 2))} {more_message}"
1749
+ )
1750
+
1751
+ return "\n".join(_ for _ in result if _)
1752
+
1753
+ # ------------------------------------------------------------------------
1754
+ # Aliases
1755
+
1756
+ def read_bytes(self, path, start=None, end=None, **kwargs):
1757
+ """Alias of `AbstractFileSystem.cat_file`."""
1758
+ return self.cat_file(path, start=start, end=end, **kwargs)
1759
+
1760
+ def write_bytes(self, path, value, **kwargs):
1761
+ """Alias of `AbstractFileSystem.pipe_file`."""
1762
+ self.pipe_file(path, value, **kwargs)
1763
+
1764
+ def makedir(self, path, create_parents=True, **kwargs):
1765
+ """Alias of `AbstractFileSystem.mkdir`."""
1766
+ return self.mkdir(path, create_parents=create_parents, **kwargs)
1767
+
1768
+ def mkdirs(self, path, exist_ok=False):
1769
+ """Alias of `AbstractFileSystem.makedirs`."""
1770
+ return self.makedirs(path, exist_ok=exist_ok)
1771
+
1772
+ def listdir(self, path, detail=True, **kwargs):
1773
+ """Alias of `AbstractFileSystem.ls`."""
1774
+ return self.ls(path, detail=detail, **kwargs)
1775
+
1776
+ def cp(self, path1, path2, **kwargs):
1777
+ """Alias of `AbstractFileSystem.copy`."""
1778
+ return self.copy(path1, path2, **kwargs)
1779
+
1780
+ def move(self, path1, path2, **kwargs):
1781
+ """Alias of `AbstractFileSystem.mv`."""
1782
+ return self.mv(path1, path2, **kwargs)
1783
+
1784
+ def stat(self, path, **kwargs):
1785
+ """Alias of `AbstractFileSystem.info`."""
1786
+ return self.info(path, **kwargs)
1787
+
1788
+ def disk_usage(self, path, total=True, maxdepth=None, **kwargs):
1789
+ """Alias of `AbstractFileSystem.du`."""
1790
+ return self.du(path, total=total, maxdepth=maxdepth, **kwargs)
1791
+
1792
+ def rename(self, path1, path2, **kwargs):
1793
+ """Alias of `AbstractFileSystem.mv`."""
1794
+ return self.mv(path1, path2, **kwargs)
1795
+
1796
+ def delete(self, path, recursive=False, maxdepth=None):
1797
+ """Alias of `AbstractFileSystem.rm`."""
1798
+ return self.rm(path, recursive=recursive, maxdepth=maxdepth)
1799
+
1800
+ def upload(self, lpath, rpath, recursive=False, **kwargs):
1801
+ """Alias of `AbstractFileSystem.put`."""
1802
+ return self.put(lpath, rpath, recursive=recursive, **kwargs)
1803
+
1804
+ def download(self, rpath, lpath, recursive=False, **kwargs):
1805
+ """Alias of `AbstractFileSystem.get`."""
1806
+ return self.get(rpath, lpath, recursive=recursive, **kwargs)
1807
+
1808
+ def sign(self, path, expiration=100, **kwargs):
1809
+ """Create a signed URL representing the given path
1810
+
1811
+ Some implementations allow temporary URLs to be generated, as a
1812
+ way of delegating credentials.
1813
+
1814
+ Parameters
1815
+ ----------
1816
+ path : str
1817
+ The path on the filesystem
1818
+ expiration : int
1819
+ Number of seconds to enable the URL for (if supported)
1820
+
1821
+ Returns
1822
+ -------
1823
+ URL : str
1824
+ The signed URL
1825
+
1826
+ Raises
1827
+ ------
1828
+ NotImplementedError : if method is not implemented for a filesystem
1829
+ """
1830
+ raise NotImplementedError("Sign is not implemented for this filesystem")
1831
+
1832
+ def _isfilestore(self):
1833
+ # Originally inherited from pyarrow DaskFileSystem. Keeping this
1834
+ # here for backwards compatibility as long as pyarrow uses its
1835
+ # legacy fsspec-compatible filesystems and thus accepts fsspec
1836
+ # filesystems as well
1837
+ return False
1838
+
1839
+
1840
+ class AbstractBufferedFile(io.IOBase):
1841
+ """Convenient class to derive from to provide buffering
1842
+
1843
+ In the case that the backend does not provide a pythonic file-like object
1844
+ already, this class contains much of the logic to build one. The only
1845
+ methods that need to be overridden are ``_upload_chunk``,
1846
+ ``_initiate_upload`` and ``_fetch_range``.
1847
+ """
1848
+
1849
+ DEFAULT_BLOCK_SIZE = 5 * 2**20
1850
+ _details = None
1851
+
1852
+ def __init__(
1853
+ self,
1854
+ fs,
1855
+ path,
1856
+ mode="rb",
1857
+ block_size="default",
1858
+ autocommit=True,
1859
+ cache_type="readahead",
1860
+ cache_options=None,
1861
+ size=None,
1862
+ **kwargs,
1863
+ ):
1864
+ """
1865
+ Template for files with buffered reading and writing
1866
+
1867
+ Parameters
1868
+ ----------
1869
+ fs: instance of FileSystem
1870
+ path: str
1871
+ location in file-system
1872
+ mode: str
1873
+ Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file
1874
+ systems may be read-only, and some may not support append.
1875
+ block_size: int
1876
+ Buffer size for reading or writing, 'default' for class default
1877
+ autocommit: bool
1878
+ Whether to write to final destination; may only impact what
1879
+ happens when file is being closed.
1880
+ cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead"
1881
+ Caching policy in read mode. See the definitions in ``core``.
1882
+ cache_options : dict
1883
+ Additional options passed to the constructor for the cache specified
1884
+ by `cache_type`.
1885
+ size: int
1886
+ If given and in read mode, suppressed having to look up the file size
1887
+ kwargs:
1888
+ Gets stored as self.kwargs
1889
+ """
1890
+ from .core import caches
1891
+
1892
+ self.path = path
1893
+ self.fs = fs
1894
+ self.mode = mode
1895
+ self.blocksize = (
1896
+ self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size
1897
+ )
1898
+ self.loc = 0
1899
+ self.autocommit = autocommit
1900
+ self.end = None
1901
+ self.start = None
1902
+ self.closed = False
1903
+
1904
+ if cache_options is None:
1905
+ cache_options = {}
1906
+
1907
+ if "trim" in kwargs:
1908
+ warnings.warn(
1909
+ "Passing 'trim' to control the cache behavior has been deprecated. "
1910
+ "Specify it within the 'cache_options' argument instead.",
1911
+ FutureWarning,
1912
+ )
1913
+ cache_options["trim"] = kwargs.pop("trim")
1914
+
1915
+ self.kwargs = kwargs
1916
+
1917
+ if mode not in {"ab", "rb", "wb", "xb"}:
1918
+ raise NotImplementedError("File mode not supported")
1919
+ if mode == "rb":
1920
+ if size is not None:
1921
+ self.size = size
1922
+ else:
1923
+ self.size = self.details["size"]
1924
+ self.cache = caches[cache_type](
1925
+ self.blocksize, self._fetch_range, self.size, **cache_options
1926
+ )
1927
+ else:
1928
+ self.buffer = io.BytesIO()
1929
+ self.offset = None
1930
+ self.forced = False
1931
+ self.location = None
1932
+
1933
+ @property
1934
+ def details(self):
1935
+ if self._details is None:
1936
+ self._details = self.fs.info(self.path)
1937
+ return self._details
1938
+
1939
+ @details.setter
1940
+ def details(self, value):
1941
+ self._details = value
1942
+ self.size = value["size"]
1943
+
1944
+ @property
1945
+ def full_name(self):
1946
+ return _unstrip_protocol(self.path, self.fs)
1947
+
1948
+ @property
1949
+ def closed(self):
1950
+ # get around this attr being read-only in IOBase
1951
+ # use getattr here, since this can be called during del
1952
+ return getattr(self, "_closed", True)
1953
+
1954
+ @closed.setter
1955
+ def closed(self, c):
1956
+ self._closed = c
1957
+
1958
+ def __hash__(self):
1959
+ if "w" in self.mode:
1960
+ return id(self)
1961
+ else:
1962
+ return int(tokenize(self.details), 16)
1963
+
1964
+ def __eq__(self, other):
1965
+ """Files are equal if they have the same checksum, only in read mode"""
1966
+ if self is other:
1967
+ return True
1968
+ return (
1969
+ isinstance(other, type(self))
1970
+ and self.mode == "rb"
1971
+ and other.mode == "rb"
1972
+ and hash(self) == hash(other)
1973
+ )
1974
+
1975
+ def commit(self):
1976
+ """Move from temp to final destination"""
1977
+
1978
+ def discard(self):
1979
+ """Throw away temporary file"""
1980
+
1981
+ def info(self):
1982
+ """File information about this path"""
1983
+ if self.readable():
1984
+ return self.details
1985
+ else:
1986
+ raise ValueError("Info not available while writing")
1987
+
1988
+ def tell(self):
1989
+ """Current file location"""
1990
+ return self.loc
1991
+
1992
+ def seek(self, loc, whence=0):
1993
+ """Set current file location
1994
+
1995
+ Parameters
1996
+ ----------
1997
+ loc: int
1998
+ byte location
1999
+ whence: {0, 1, 2}
2000
+ from start of file, current location or end of file, resp.
2001
+ """
2002
+ loc = int(loc)
2003
+ if not self.mode == "rb":
2004
+ raise OSError(ESPIPE, "Seek only available in read mode")
2005
+ if whence == 0:
2006
+ nloc = loc
2007
+ elif whence == 1:
2008
+ nloc = self.loc + loc
2009
+ elif whence == 2:
2010
+ nloc = self.size + loc
2011
+ else:
2012
+ raise ValueError(f"invalid whence ({whence}, should be 0, 1 or 2)")
2013
+ if nloc < 0:
2014
+ raise ValueError("Seek before start of file")
2015
+ self.loc = nloc
2016
+ return self.loc
2017
+
2018
+ def write(self, data):
2019
+ """
2020
+ Write data to buffer.
2021
+
2022
+ Buffer only sent on flush() or if buffer is greater than
2023
+ or equal to blocksize.
2024
+
2025
+ Parameters
2026
+ ----------
2027
+ data: bytes
2028
+ Set of bytes to be written.
2029
+ """
2030
+ if not self.writable():
2031
+ raise ValueError("File not in write mode")
2032
+ if self.closed:
2033
+ raise ValueError("I/O operation on closed file.")
2034
+ if self.forced:
2035
+ raise ValueError("This file has been force-flushed, can only close")
2036
+ out = self.buffer.write(data)
2037
+ self.loc += out
2038
+ if self.buffer.tell() >= self.blocksize:
2039
+ self.flush()
2040
+ return out
2041
+
2042
+ def flush(self, force=False):
2043
+ """
2044
+ Write buffered data to backend store.
2045
+
2046
+ Writes the current buffer, if it is larger than the block-size, or if
2047
+ the file is being closed.
2048
+
2049
+ Parameters
2050
+ ----------
2051
+ force: bool
2052
+ When closing, write the last block even if it is smaller than
2053
+ blocks are allowed to be. Disallows further writing to this file.
2054
+ """
2055
+
2056
+ if self.closed:
2057
+ raise ValueError("Flush on closed file")
2058
+ if force and self.forced:
2059
+ raise ValueError("Force flush cannot be called more than once")
2060
+ if force:
2061
+ self.forced = True
2062
+
2063
+ if self.readable():
2064
+ # no-op to flush on read-mode
2065
+ return
2066
+
2067
+ if not force and self.buffer.tell() < self.blocksize:
2068
+ # Defer write on small block
2069
+ return
2070
+
2071
+ if self.offset is None:
2072
+ # Initialize a multipart upload
2073
+ self.offset = 0
2074
+ try:
2075
+ self._initiate_upload()
2076
+ except:
2077
+ self.closed = True
2078
+ raise
2079
+
2080
+ if self._upload_chunk(final=force) is not False:
2081
+ self.offset += self.buffer.seek(0, 2)
2082
+ self.buffer = io.BytesIO()
2083
+
2084
+ def _upload_chunk(self, final=False):
2085
+ """Write one part of a multi-block file upload
2086
+
2087
+ Parameters
2088
+ ==========
2089
+ final: bool
2090
+ This is the last block, so should complete file, if
2091
+ self.autocommit is True.
2092
+ """
2093
+ # may not yet have been initialized, may need to call _initialize_upload
2094
+
2095
+ def _initiate_upload(self):
2096
+ """Create remote file/upload"""
2097
+ pass
2098
+
2099
+ def _fetch_range(self, start, end):
2100
+ """Get the specified set of bytes from remote"""
2101
+ return self.fs.cat_file(self.path, start=start, end=end)
2102
+
2103
+ def read(self, length=-1):
2104
+ """
2105
+ Return data from cache, or fetch pieces as necessary
2106
+
2107
+ Parameters
2108
+ ----------
2109
+ length: int (-1)
2110
+ Number of bytes to read; if <0, all remaining bytes.
2111
+ """
2112
+ length = -1 if length is None else int(length)
2113
+ if self.mode != "rb":
2114
+ raise ValueError("File not in read mode")
2115
+ if length < 0:
2116
+ length = self.size - self.loc
2117
+ if self.closed:
2118
+ raise ValueError("I/O operation on closed file.")
2119
+ if length == 0:
2120
+ # don't even bother calling fetch
2121
+ return b""
2122
+ out = self.cache._fetch(self.loc, self.loc + length)
2123
+
2124
+ logger.debug(
2125
+ "%s read: %i - %i %s",
2126
+ self,
2127
+ self.loc,
2128
+ self.loc + length,
2129
+ self.cache._log_stats(),
2130
+ )
2131
+ self.loc += len(out)
2132
+ return out
2133
+
2134
+ def readinto(self, b):
2135
+ """mirrors builtin file's readinto method
2136
+
2137
+ https://docs.python.org/3/library/io.html#io.RawIOBase.readinto
2138
+ """
2139
+ out = memoryview(b).cast("B")
2140
+ data = self.read(out.nbytes)
2141
+ out[: len(data)] = data
2142
+ return len(data)
2143
+
2144
+ def readuntil(self, char=b"\n", blocks=None):
2145
+ """Return data between current position and first occurrence of char
2146
+
2147
+ char is included in the output, except if the end of the tile is
2148
+ encountered first.
2149
+
2150
+ Parameters
2151
+ ----------
2152
+ char: bytes
2153
+ Thing to find
2154
+ blocks: None or int
2155
+ How much to read in each go. Defaults to file blocksize - which may
2156
+ mean a new read on every call.
2157
+ """
2158
+ out = []
2159
+ while True:
2160
+ start = self.tell()
2161
+ part = self.read(blocks or self.blocksize)
2162
+ if len(part) == 0:
2163
+ break
2164
+ found = part.find(char)
2165
+ if found > -1:
2166
+ out.append(part[: found + len(char)])
2167
+ self.seek(start + found + len(char))
2168
+ break
2169
+ out.append(part)
2170
+ return b"".join(out)
2171
+
2172
+ def readline(self):
2173
+ """Read until and including the first occurrence of newline character
2174
+
2175
+ Note that, because of character encoding, this is not necessarily a
2176
+ true line ending.
2177
+ """
2178
+ return self.readuntil(b"\n")
2179
+
2180
+ def __next__(self):
2181
+ out = self.readline()
2182
+ if out:
2183
+ return out
2184
+ raise StopIteration
2185
+
2186
+ def __iter__(self):
2187
+ return self
2188
+
2189
+ def readlines(self):
2190
+ """Return all data, split by the newline character, including the newline character"""
2191
+ data = self.read()
2192
+ lines = data.split(b"\n")
2193
+ out = [l + b"\n" for l in lines[:-1]]
2194
+ if data.endswith(b"\n"):
2195
+ return out
2196
+ else:
2197
+ return out + [lines[-1]]
2198
+ # return list(self) ???
2199
+
2200
+ def readinto1(self, b):
2201
+ return self.readinto(b)
2202
+
2203
+ def close(self):
2204
+ """Close file
2205
+
2206
+ Finalizes writes, discards cache
2207
+ """
2208
+ if getattr(self, "_unclosable", False):
2209
+ return
2210
+ if self.closed:
2211
+ return
2212
+ try:
2213
+ if self.mode == "rb":
2214
+ self.cache = None
2215
+ else:
2216
+ if not self.forced:
2217
+ self.flush(force=True)
2218
+
2219
+ if self.fs is not None:
2220
+ self.fs.invalidate_cache(self.path)
2221
+ self.fs.invalidate_cache(self.fs._parent(self.path))
2222
+ finally:
2223
+ self.closed = True
2224
+
2225
+ def readable(self):
2226
+ """Whether opened for reading"""
2227
+ return "r" in self.mode and not self.closed
2228
+
2229
+ def seekable(self):
2230
+ """Whether is seekable (only in read mode)"""
2231
+ return self.readable()
2232
+
2233
+ def writable(self):
2234
+ """Whether opened for writing"""
2235
+ return self.mode in {"wb", "ab", "xb"} and not self.closed
2236
+
2237
+ def __reduce__(self):
2238
+ if self.mode != "rb":
2239
+ raise RuntimeError("Pickling a writeable file is not supported")
2240
+
2241
+ return reopen, (
2242
+ self.fs,
2243
+ self.path,
2244
+ self.mode,
2245
+ self.blocksize,
2246
+ self.loc,
2247
+ self.size,
2248
+ self.autocommit,
2249
+ self.cache.name if self.cache else "none",
2250
+ self.kwargs,
2251
+ )
2252
+
2253
+ def __del__(self):
2254
+ if not self.closed:
2255
+ self.close()
2256
+
2257
+ def __str__(self):
2258
+ return f"<File-like object {type(self.fs).__name__}, {self.path}>"
2259
+
2260
+ __repr__ = __str__
2261
+
2262
+ def __enter__(self):
2263
+ return self
2264
+
2265
+ def __exit__(self, *args):
2266
+ self.close()
2267
+
2268
+
2269
+ def reopen(fs, path, mode, blocksize, loc, size, autocommit, cache_type, kwargs):
2270
+ file = fs.open(
2271
+ path,
2272
+ mode=mode,
2273
+ block_size=blocksize,
2274
+ autocommit=autocommit,
2275
+ cache_type=cache_type,
2276
+ size=size,
2277
+ **kwargs,
2278
+ )
2279
+ if loc > 0:
2280
+ file.seek(loc)
2281
+ return file
venv/lib/python3.10/site-packages/fsspec/transaction.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import deque
2
+
3
+
4
+ class Transaction:
5
+ """Filesystem transaction write context
6
+
7
+ Gathers files for deferred commit or discard, so that several write
8
+ operations can be finalized semi-atomically. This works by having this
9
+ instance as the ``.transaction`` attribute of the given filesystem
10
+ """
11
+
12
+ def __init__(self, fs, **kwargs):
13
+ """
14
+ Parameters
15
+ ----------
16
+ fs: FileSystem instance
17
+ """
18
+ self.fs = fs
19
+ self.files = deque()
20
+
21
+ def __enter__(self):
22
+ self.start()
23
+ return self
24
+
25
+ def __exit__(self, exc_type, exc_val, exc_tb):
26
+ """End transaction and commit, if exit is not due to exception"""
27
+ # only commit if there was no exception
28
+ self.complete(commit=exc_type is None)
29
+ if self.fs:
30
+ self.fs._intrans = False
31
+ self.fs._transaction = None
32
+ self.fs = None
33
+
34
+ def start(self):
35
+ """Start a transaction on this FileSystem"""
36
+ self.files = deque() # clean up after previous failed completions
37
+ self.fs._intrans = True
38
+
39
+ def complete(self, commit=True):
40
+ """Finish transaction: commit or discard all deferred files"""
41
+ while self.files:
42
+ f = self.files.popleft()
43
+ if commit:
44
+ f.commit()
45
+ else:
46
+ f.discard()
47
+ self.fs._intrans = False
48
+ self.fs._transaction = None
49
+ self.fs = None
50
+
51
+
52
+ class FileActor:
53
+ def __init__(self):
54
+ self.files = []
55
+
56
+ def commit(self):
57
+ for f in self.files:
58
+ f.commit()
59
+ self.files.clear()
60
+
61
+ def discard(self):
62
+ for f in self.files:
63
+ f.discard()
64
+ self.files.clear()
65
+
66
+ def append(self, f):
67
+ self.files.append(f)
68
+
69
+
70
+ class DaskTransaction(Transaction):
71
+ def __init__(self, fs):
72
+ """
73
+ Parameters
74
+ ----------
75
+ fs: FileSystem instance
76
+ """
77
+ import distributed
78
+
79
+ super().__init__(fs)
80
+ client = distributed.default_client()
81
+ self.files = client.submit(FileActor, actor=True).result()
82
+
83
+ def complete(self, commit=True):
84
+ """Finish transaction: commit or discard all deferred files"""
85
+ if commit:
86
+ self.files.commit().result()
87
+ else:
88
+ self.files.discard().result()
89
+ self.fs._intrans = False
90
+ self.fs = None
venv/lib/python3.10/site-packages/fsspec/utils.py ADDED
@@ -0,0 +1,748 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import logging
5
+ import math
6
+ import os
7
+ import re
8
+ import sys
9
+ import tempfile
10
+ from collections.abc import Callable, Iterable, Iterator, Sequence
11
+ from functools import partial
12
+ from hashlib import md5
13
+ from importlib.metadata import version
14
+ from typing import IO, TYPE_CHECKING, Any, TypeVar
15
+ from urllib.parse import urlsplit
16
+
17
+ if TYPE_CHECKING:
18
+ import pathlib
19
+ from typing import TypeGuard
20
+
21
+ from fsspec.spec import AbstractFileSystem
22
+
23
+
24
+ DEFAULT_BLOCK_SIZE = 5 * 2**20
25
+
26
+ T = TypeVar("T")
27
+
28
+
29
+ def infer_storage_options(
30
+ urlpath: str, inherit_storage_options: dict[str, Any] | None = None
31
+ ) -> dict[str, Any]:
32
+ """Infer storage options from URL path and merge it with existing storage
33
+ options.
34
+
35
+ Parameters
36
+ ----------
37
+ urlpath: str or unicode
38
+ Either local absolute file path or URL (hdfs://namenode:8020/file.csv)
39
+ inherit_storage_options: dict (optional)
40
+ Its contents will get merged with the inferred information from the
41
+ given path
42
+
43
+ Returns
44
+ -------
45
+ Storage options dict.
46
+
47
+ Examples
48
+ --------
49
+ >>> infer_storage_options('/mnt/datasets/test.csv') # doctest: +SKIP
50
+ {"protocol": "file", "path", "/mnt/datasets/test.csv"}
51
+ >>> infer_storage_options(
52
+ ... 'hdfs://username:pwd@node:123/mnt/datasets/test.csv?q=1',
53
+ ... inherit_storage_options={'extra': 'value'},
54
+ ... ) # doctest: +SKIP
55
+ {"protocol": "hdfs", "username": "username", "password": "pwd",
56
+ "host": "node", "port": 123, "path": "/mnt/datasets/test.csv",
57
+ "url_query": "q=1", "extra": "value"}
58
+ """
59
+ # Handle Windows paths including disk name in this special case
60
+ if (
61
+ re.match(r"^[a-zA-Z]:[\\/]", urlpath)
62
+ or re.match(r"^[a-zA-Z0-9]+://", urlpath) is None
63
+ ):
64
+ return {"protocol": "file", "path": urlpath}
65
+
66
+ parsed_path = urlsplit(urlpath)
67
+ protocol = parsed_path.scheme or "file"
68
+ if parsed_path.fragment:
69
+ path = "#".join([parsed_path.path, parsed_path.fragment])
70
+ else:
71
+ path = parsed_path.path
72
+ if protocol == "file":
73
+ # Special case parsing file protocol URL on Windows according to:
74
+ # https://msdn.microsoft.com/en-us/library/jj710207.aspx
75
+ windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path)
76
+ if windows_path:
77
+ drive, path = windows_path.groups()
78
+ path = f"{drive}:{path}"
79
+
80
+ if protocol in ["http", "https"]:
81
+ # for HTTP, we don't want to parse, as requests will anyway
82
+ return {"protocol": protocol, "path": urlpath}
83
+
84
+ options: dict[str, Any] = {"protocol": protocol, "path": path}
85
+
86
+ if parsed_path.netloc:
87
+ # Parse `hostname` from netloc manually because `parsed_path.hostname`
88
+ # lowercases the hostname which is not always desirable (e.g. in S3):
89
+ # https://github.com/dask/dask/issues/1417
90
+ options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0]
91
+
92
+ if protocol in ("s3", "s3a", "gcs", "gs"):
93
+ options["path"] = options["host"] + options["path"]
94
+ else:
95
+ options["host"] = options["host"]
96
+ if parsed_path.port:
97
+ options["port"] = parsed_path.port
98
+ if parsed_path.username:
99
+ options["username"] = parsed_path.username
100
+ if parsed_path.password:
101
+ options["password"] = parsed_path.password
102
+
103
+ if parsed_path.query:
104
+ options["url_query"] = parsed_path.query
105
+ if parsed_path.fragment:
106
+ options["url_fragment"] = parsed_path.fragment
107
+
108
+ if inherit_storage_options:
109
+ update_storage_options(options, inherit_storage_options)
110
+
111
+ return options
112
+
113
+
114
+ def update_storage_options(
115
+ options: dict[str, Any], inherited: dict[str, Any] | None = None
116
+ ) -> None:
117
+ if not inherited:
118
+ inherited = {}
119
+ collisions = set(options) & set(inherited)
120
+ if collisions:
121
+ for collision in collisions:
122
+ if options.get(collision) != inherited.get(collision):
123
+ raise KeyError(
124
+ f"Collision between inferred and specified storage "
125
+ f"option:\n{collision}"
126
+ )
127
+ options.update(inherited)
128
+
129
+
130
+ # Compression extensions registered via fsspec.compression.register_compression
131
+ compressions: dict[str, str] = {}
132
+
133
+
134
+ def infer_compression(filename: str) -> str | None:
135
+ """Infer compression, if available, from filename.
136
+
137
+ Infer a named compression type, if registered and available, from filename
138
+ extension. This includes builtin (gz, bz2, zip) compressions, as well as
139
+ optional compressions. See fsspec.compression.register_compression.
140
+ """
141
+ extension = os.path.splitext(filename)[-1].strip(".").lower()
142
+ if extension in compressions:
143
+ return compressions[extension]
144
+ return None
145
+
146
+
147
+ def build_name_function(max_int: float) -> Callable[[int], str]:
148
+ """Returns a function that receives a single integer
149
+ and returns it as a string padded by enough zero characters
150
+ to align with maximum possible integer
151
+
152
+ >>> name_f = build_name_function(57)
153
+
154
+ >>> name_f(7)
155
+ '07'
156
+ >>> name_f(31)
157
+ '31'
158
+ >>> build_name_function(1000)(42)
159
+ '0042'
160
+ >>> build_name_function(999)(42)
161
+ '042'
162
+ >>> build_name_function(0)(0)
163
+ '0'
164
+ """
165
+ # handle corner cases max_int is 0 or exact power of 10
166
+ max_int += 1e-8
167
+
168
+ pad_length = int(math.ceil(math.log10(max_int)))
169
+
170
+ def name_function(i: int) -> str:
171
+ return str(i).zfill(pad_length)
172
+
173
+ return name_function
174
+
175
+
176
def seek_delimiter(file: IO[bytes], delimiter: bytes, blocksize: int) -> bool:
    r"""Seek current file to file start, file end, or byte after delimiter seq.

    Seeks file to next chunk delimiter, where chunks are defined on file start,
    a delimiting sequence, and file end. Use file.tell() to see location afterwards.
    Note that file start is a valid split, so must be at offset > 0 to seek for
    delimiter.

    Parameters
    ----------
    file: a file
    delimiter: bytes
        a delimiter like ``b'\n'`` or message sentinel, matching file .read() type
    blocksize: int
        Number of bytes to read from the file at once.


    Returns
    -------
    Returns True if a delimiter was found, False if at file start or end.

    """

    if file.tell() == 0:
        # beginning-of-file, return without seek
        return False

    # Interface is for binary IO, with delimiter as bytes, but initialize last
    # with result of file.read to preserve compatibility with text IO.
    last: bytes | None = None
    while True:
        current = file.read(blocksize)
        if not current:
            # end-of-file without delimiter
            return False
        # Prepend the tail carried over from the previous read so a delimiter
        # that straddles two blocks is still found.
        full = last + current if last else current
        try:
            if delimiter in full:
                i = full.index(delimiter)
                # Rewind from the current position to just past the delimiter:
                # (len(full) - i) bytes back to the match, then skip over it.
                file.seek(file.tell() - (len(full) - i) + len(delimiter))
                return True
            elif len(current) < blocksize:
                # end-of-file without delimiter
                return False
        except (OSError, ValueError):
            # NOTE(review): presumably guards file-likes whose seek/containment
            # semantics differ (e.g. text mode) — keep scanning; confirm intent.
            pass
        # Carry the last len(delimiter) bytes so a boundary-spanning match is
        # detectable on the next iteration.
        last = full[-len(delimiter) :]
223
+
224
+
225
def read_block(
    f: IO[bytes],
    offset: int,
    length: int | None,
    delimiter: bytes | None = None,
    split_before: bool = False,
) -> bytes:
    """Read a block of bytes from a file

    Parameters
    ----------
    f: File
        Open file
    offset: int
        Byte offset to start read
    length: int
        Number of bytes to read, read through end of file if None
    delimiter: bytes (optional)
        Ensure reading starts and stops at delimiter bytestring
    split_before: bool (optional)
        Start/stop read *before* delimiter bytestring.


    If using the ``delimiter=`` keyword argument we ensure that the read
    starts and stops at delimiter boundaries that follow the locations
    ``offset`` and ``offset + length``. If ``offset`` is zero then we
    start at zero, regardless of delimiter. The bytestring returned WILL
    include the terminating delimiter string.

    Examples
    --------

    >>> from io import BytesIO  # doctest: +SKIP
    >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300')  # doctest: +SKIP
    >>> read_block(f, 0, 13)  # doctest: +SKIP
    b'Alice, 100\\nBo'

    >>> read_block(f, 0, 13, delimiter=b'\\n')  # doctest: +SKIP
    b'Alice, 100\\nBob, 200\\n'

    >>> read_block(f, 10, 10, delimiter=b'\\n')  # doctest: +SKIP
    b'Bob, 200\\nCharlie, 300'
    """
    if delimiter:
        # Advance from ``offset`` to just past the next delimiter (no-op when
        # offset is 0 — file start counts as a valid split point).
        f.seek(offset)
        found_start_delim = seek_delimiter(f, delimiter, 2**16)
        if length is None:
            return f.read()
        start = f.tell()
        # Shrink the requested length by the bytes consumed while seeking.
        length -= start - offset

        # Find the delimiter (or EOF) that terminates the block.
        f.seek(start + length)
        found_end_delim = seek_delimiter(f, delimiter, 2**16)
        end = f.tell()

        # Adjust split location to before delimiter if seek found the
        # delimiter sequence, not start or end of file.
        if found_start_delim and split_before:
            start -= len(delimiter)

        if found_end_delim and split_before:
            end -= len(delimiter)

        offset = start
        length = end - start

    f.seek(offset)

    # TODO: allow length to be None and read to the end of the file?
    assert length is not None
    b = f.read(length)
    return b
297
+
298
+
299
def tokenize(*args: Any, **kwargs: Any) -> str:
    """Deterministic token

    (modified from dask.base)

    >>> tokenize([1, 2, '3'])
    '9d71491b50023b06fc76928e6eddb952'

    >>> tokenize('Hello') == tokenize('Hello')
    True
    """
    # Fold keyword arguments into the positional tuple so they affect the hash.
    if kwargs:
        args = args + (kwargs,)
    payload = str(args).encode()
    try:
        hasher = md5(payload)
    except ValueError:
        # FIPS systems: https://github.com/fsspec/filesystem_spec/issues/380
        hasher = md5(payload, usedforsecurity=False)
    return hasher.hexdigest()
318
+
319
+
320
def stringify_path(filepath: str | os.PathLike[str] | pathlib.Path) -> str:
    """Attempt to convert a path-like object to a string.

    Parameters
    ----------
    filepath: object to be converted

    Returns
    -------
    filepath_str: maybe a string version of the object

    Notes
    -----
    Objects implementing the fspath protocol (``__fspath__``) are coerced via
    that method; this covers ``pathlib.Path`` on all supported Pythons.

    Objects exposing a ``path`` attribute (e.g. some file-like objects) use
    that attribute.

    Anything else — bytes, buffers, non-path objects — is returned unchanged.
    """
    # Fast path: already a string.
    if isinstance(filepath, str):
        return filepath
    # fspath protocol (pathlib.Path and friends).
    if hasattr(filepath, "__fspath__"):
        return filepath.__fspath__()
    # Duck-typed objects carrying an explicit ``path`` attribute.
    if hasattr(filepath, "path"):
        return filepath.path
    # Pass through unchanged; callers must cope with non-str results.
    return filepath  # type: ignore[return-value]
350
+
351
+
352
def make_instance(
    cls: Callable[..., T], args: Sequence[Any], kwargs: dict[str, Any]
) -> T:
    """Instantiate ``cls(*args, **kwargs)`` and let the new instance
    re-detect its worker context via ``_determine_worker()``."""
    instance = cls(*args, **kwargs)
    instance._determine_worker()  # type: ignore[attr-defined]
    return instance
358
+
359
+
360
def common_prefix(paths: Iterable[str]) -> str:
    """For a list of paths, find the shortest prefix common to all"""
    split_paths = [p.split("/") for p in paths]
    shortest = min(len(sp) for sp in split_paths)
    # Count how many leading "/"-separated components all paths share.
    shared = 0
    for idx in range(shortest):
        segment = split_paths[0][idx]
        if all(sp[idx] == segment for sp in split_paths):
            shared += 1
        else:
            break
    return "/".join(split_paths[0][:shared])
371
+
372
+
373
def other_paths(
    paths: list[str],
    path2: str | list[str],
    exists: bool = False,
    flatten: bool = False,
) -> list[str]:
    """In bulk file operations, construct a new file tree from a list of files

    Parameters
    ----------
    paths: list of str
        The input file tree
    path2: str or list of str
        Root to construct the new list in. If this is already a list of str, we just
        assert it has the right number of elements.
    exists: bool (optional)
        For a str destination, whether it already exists (and is a dir); files
        should end up inside it.
    flatten: bool (optional)
        Whether to flatten the input directory tree structure so that the output files
        are in the same directory.

    Returns
    -------
    list of str
    """

    if not isinstance(path2, str):
        # Pre-built destination list: just check it lines up with the input.
        assert len(paths) == len(path2)
        return path2

    root = path2.rstrip("/")
    if flatten:
        # Discard directory structure: every file lands directly under root.
        return ["/".join((root, p.split("/")[-1])) for p in paths]

    cp = common_prefix(paths)
    if exists:
        # Destination dir exists: keep the last common component beneath it.
        cp = cp.rsplit("/", 1)[0]
    if not cp and all(not s.startswith("/") for s in paths):
        # No common prefix among relative paths: nest everything under root.
        return ["/".join([root, p]) for p in paths]
    # Rebase each path by swapping the common prefix for the new root.
    return [p.replace(cp, root, 1) for p in paths]
416
+
417
+
418
def is_exception(obj: Any) -> bool:
    """Return True if ``obj`` is an exception instance (any ``BaseException``)."""
    return isinstance(obj, BaseException)
420
+
421
+
422
+ def isfilelike(f: Any) -> TypeGuard[IO[bytes]]:
423
+ return all(hasattr(f, attr) for attr in ["read", "close", "tell"])
424
+
425
+
426
def get_protocol(url: str) -> str:
    """Return the protocol portion of ``url`` — the text before the first
    ``::`` or ``://`` separator — defaulting to ``"file"`` when absent."""
    url = stringify_path(url)
    separator = re.search(r"::|://", url)
    if separator is None:
        return "file"
    return url[: separator.start()]
432
+
433
+
434
def get_file_extension(url: str) -> str:
    """Return the text after the final ``.`` in ``url``, or "" if no dot."""
    url = stringify_path(url)
    _, dot, extension = url.rpartition(".")
    return extension if dot else ""
440
+
441
+
442
def can_be_local(path: str) -> bool:
    """Can the given URL be used with open_local?"""
    from fsspec import get_filesystem_class

    try:
        fs_cls = get_filesystem_class(get_protocol(path))
    except (ValueError, ImportError):
        # not in registry or import failed
        return False
    # Filesystems advertising local_file support open_local.
    return getattr(fs_cls, "local_file", False)
451
+
452
+
453
def get_package_version_without_import(name: str) -> str | None:
    """For given package name, try to find the version without importing it

    Import and package.__version__ is still the backup here, so an import
    *might* happen.

    Returns either the version string, or None if the package
    or the version was not readily found.
    """
    # Already-imported modules are the cheapest source of truth.
    if name in sys.modules:
        mod = sys.modules[name]
        if hasattr(mod, "__version__"):
            return mod.__version__
    try:
        # Reads distribution metadata without importing the package.
        return version(name)
    except Exception:
        # Narrowed from a bare ``except``: a bare clause would also swallow
        # KeyboardInterrupt/SystemExit, which must propagate.
        pass
    # Last resort: actually import and read ``__version__``.
    try:
        import importlib

        mod = importlib.import_module(name)
        return mod.__version__
    except (ImportError, AttributeError):
        return None
477
+
478
+
479
def setup_logging(
    logger: logging.Logger | None = None,
    logger_name: str | None = None,
    level: str = "DEBUG",
    clear: bool = True,
) -> logging.Logger:
    """Attach a stream handler with a standard format to a logger.

    Provide either a ``logger`` object or a ``logger_name`` (the object wins
    if both are given). With ``clear=True``, previously attached handlers are
    removed first. Returns the configured logger.
    """
    if logger is None and logger_name is None:
        raise ValueError("Provide either logger object or logger name")
    target = logger or logging.getLogger(logger_name)
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s -- %(message)s"
        )
    )
    if clear:
        target.handlers.clear()
    target.addHandler(handler)
    target.setLevel(level)
    return target
498
+
499
+
500
def _unstrip_protocol(name: str, fs: AbstractFileSystem) -> str:
    # Module-level helper: delegates to the filesystem's own
    # ``unstrip_protocol`` to re-attach its protocol prefix to ``name``.
    return fs.unstrip_protocol(name)
502
+
503
+
504
def mirror_from(
    origin_name: str, methods: Iterable[str]
) -> Callable[[type[T]], type[T]]:
    """Class decorator: expose each name in ``methods`` as a read-only
    property that forwards to the same attribute on the instance's
    ``origin_name`` attribute."""

    def _forward(attr: str, self: Any) -> Any:
        delegate = getattr(self, origin_name)
        return getattr(delegate, attr)

    def _decorate(cls: type[T]) -> type[T]:
        for attr in methods:
            # partial pins the attribute name; property supplies ``self``.
            setattr(cls, attr, property(partial(_forward, attr)))
        return cls

    return _decorate
522
+
523
+
524
@contextlib.contextmanager
def nullcontext(obj: T) -> Iterator[T]:
    """No-op context manager: yields ``obj`` unchanged and performs no cleanup."""
    yield obj
527
+
528
+
529
def merge_offset_ranges(
    paths: list[str],
    starts: list[int] | int,
    ends: list[int] | int,
    max_gap: int = 0,
    max_block: int | None = None,
    sort: bool = True,
) -> tuple[list[str], list[int], list[int]]:
    """Merge adjacent byte-offset ranges when the inter-range
    gap is <= `max_gap`, and when the merged byte range does not
    exceed `max_block` (if specified). By default, this function
    will re-order the input paths and byte ranges to ensure sorted
    order. If the user can guarantee that the inputs are already
    sorted, passing `sort=False` will skip the re-ordering.
    """
    # Check input
    if not isinstance(paths, list):
        raise TypeError
    # Scalar starts/ends are broadcast to every path.
    if not isinstance(starts, list):
        starts = [starts] * len(paths)
    if not isinstance(ends, list):
        ends = [ends] * len(paths)
    if len(starts) != len(paths) or len(ends) != len(paths):
        raise ValueError

    # Early Return
    if len(starts) <= 1:
        return paths, starts, ends

    # Treat None (or other falsy) starts as 0 so arithmetic below works.
    starts = [s or 0 for s in starts]
    # Sort by paths and then ranges if `sort=True`
    if sort:
        paths, starts, ends = (
            list(v)
            for v in zip(
                *sorted(
                    zip(paths, starts, ends),
                )
            )
        )
    # Drop any range fully contained within another range on the same path.
    remove = []
    for i, (path, start, end) in enumerate(zip(paths, starts, ends)):
        if any(
            e is not None and p == path and start >= s and end <= e and i != i2
            for i2, (p, s, e) in enumerate(zip(paths, starts, ends))
        ):
            remove.append(i)
    paths = [p for i, p in enumerate(paths) if i not in remove]
    starts = [s for i, s in enumerate(starts) if i not in remove]
    ends = [e for i, e in enumerate(ends) if i not in remove]

    if paths:
        # Loop through the coupled `paths`, `starts`, and
        # `ends`, and merge adjacent blocks when appropriate
        new_paths = paths[:1]
        new_starts = starts[:1]
        new_ends = ends[:1]
        for i in range(1, len(paths)):
            # A previous open-ended (None) range on the same path already
            # subsumes this one: skip it.
            if paths[i] == paths[i - 1] and new_ends[-1] is None:
                continue
            elif (
                paths[i] != paths[i - 1]
                or ((starts[i] - new_ends[-1]) > max_gap)
                or (max_block is not None and (ends[i] - new_starts[-1]) > max_block)
            ):
                # Cannot merge with previous block.
                # Add new `paths`, `starts`, and `ends` elements
                new_paths.append(paths[i])
                new_starts.append(starts[i])
                new_ends.append(ends[i])
            else:
                # Merge with the previous block by updating the
                # last element of `ends`
                new_ends[-1] = ends[i]
        return new_paths, new_starts, new_ends

    # `paths` is empty. Just return input lists
    return paths, starts, ends
607
+
608
+
609
def file_size(filelike: IO[bytes]) -> int:
    """Find length of any open read-mode file-like"""
    saved = filelike.tell()
    try:
        # seek-to-end returns the absolute end offset, i.e. the size
        return filelike.seek(0, 2)
    finally:
        # always restore the caller's position
        filelike.seek(saved)
616
+
617
+
618
@contextlib.contextmanager
def atomic_write(path: str, mode: str = "wb"):
    """
    A context manager that opens a temporary file next to `path` and, on exit,
    replaces `path` with the temporary file, thereby updating `path`
    atomically.
    """
    # Create the temp file in the same directory so os.replace stays on one
    # filesystem (required for an atomic rename).
    tmp_fd, tmp_name = tempfile.mkstemp(
        dir=os.path.dirname(path), prefix=os.path.basename(path) + "-"
    )
    try:
        with open(tmp_fd, mode) as stream:
            yield stream
    except BaseException:
        # Body failed: discard the partial temp file and re-raise.
        with contextlib.suppress(FileNotFoundError):
            os.unlink(tmp_name)
        raise
    os.replace(tmp_name, path)
637
+
638
+
639
def _translate(pat: str, STAR: str, QUESTION_MARK: str) -> list[str]:
    """Translate one glob segment ``pat`` into a list of regex fragments.

    ``STAR`` and ``QUESTION_MARK`` are the regex replacements to emit for
    ``*`` and ``?`` (the caller supplies segment-aware fragments).
    """
    # Copied from: https://github.com/python/cpython/pull/106703.
    res: list[str] = []
    add = res.append
    i, n = 0, len(pat)
    while i < n:
        c = pat[i]
        i = i + 1
        if c == "*":
            # compress consecutive `*` into one
            # (identity check: STAR is appended as the same object each time)
            if (not res) or res[-1] is not STAR:
                add(STAR)
        elif c == "?":
            add(QUESTION_MARK)
        elif c == "[":
            # Scan for the closing "]" of a character class; "!" negates,
            # and a "]" immediately after the (possibly negated) opening
            # bracket is a literal.
            j = i
            if j < n and pat[j] == "!":
                j = j + 1
            if j < n and pat[j] == "]":
                j = j + 1
            while j < n and pat[j] != "]":
                j = j + 1
            if j >= n:
                # Unterminated class: treat "[" as a literal.
                add("\\[")
            else:
                stuff = pat[i:j]
                if "-" not in stuff:
                    stuff = stuff.replace("\\", r"\\")
                else:
                    # Split the class on "-" into range chunks so escaping can
                    # distinguish literal hyphens from range hyphens.
                    chunks = []
                    k = i + 2 if pat[i] == "!" else i + 1
                    while True:
                        k = pat.find("-", k, j)
                        if k < 0:
                            break
                        chunks.append(pat[i:k])
                        i = k + 1
                        k = k + 3
                    chunk = pat[i:j]
                    if chunk:
                        chunks.append(chunk)
                    else:
                        chunks[-1] += "-"
                    # Remove empty ranges -- invalid in RE.
                    for k in range(len(chunks) - 1, 0, -1):
                        if chunks[k - 1][-1] > chunks[k][0]:
                            chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:]
                            del chunks[k]
                    # Escape backslashes and hyphens for set difference (--).
                    # Hyphens that create ranges shouldn't be escaped.
                    stuff = "-".join(
                        s.replace("\\", r"\\").replace("-", r"\-") for s in chunks
                    )
                    # Escape set operations (&&, ~~ and ||).
                    stuff = re.sub(r"([&~|])", r"\\\1", stuff)
                i = j + 1
                if not stuff:
                    # Empty range: never match.
                    add("(?!)")
                elif stuff == "!":
                    # Negated empty range: match any character.
                    add(".")
                else:
                    if stuff[0] == "!":
                        stuff = "^" + stuff[1:]
                    elif stuff[0] in ("^", "["):
                        stuff = "\\" + stuff
                    add(f"[{stuff}]")
        else:
            # Ordinary character: emit it regex-escaped.
            add(re.escape(c))
    assert i == n
    return res
711
+
712
+
713
def glob_translate(pat: str) -> str:
    # Copied from: https://github.com/python/cpython/pull/106703.
    # The keyword parameters' values are fixed to:
    # recursive=True, include_hidden=True, seps=None
    """Translate a pathname with shell wildcards to a regular expression."""
    if os.path.altsep:
        seps = os.path.sep + os.path.altsep
    else:
        seps = os.path.sep
    escaped_seps = "".join(map(re.escape, seps))
    any_sep = f"[{escaped_seps}]" if len(seps) > 1 else escaped_seps
    not_sep = f"[^{escaped_seps}]"
    # Regex fragments: one path segment (with/without trailing separator),
    # and the "**" equivalents for any number of segments.
    one_last_segment = f"{not_sep}+"
    one_segment = f"{one_last_segment}{any_sep}"
    any_segments = f"(?:.+{any_sep})?"
    any_last_segments = ".*"
    results = []
    parts = re.split(any_sep, pat)
    last_part_idx = len(parts) - 1
    for idx, part in enumerate(parts):
        # A bare "*" matches exactly one path segment.
        if part == "*":
            results.append(one_segment if idx < last_part_idx else one_last_segment)
            continue
        # A bare "**" matches any number of segments (recursive glob).
        if part == "**":
            results.append(any_segments if idx < last_part_idx else any_last_segments)
            continue
        elif "**" in part:
            raise ValueError(
                "Invalid pattern: '**' can only be an entire path component"
            )
        if part:
            # Ordinary segment: translate its *, ? and [...] constructs.
            results.extend(_translate(part, f"{not_sep}*", not_sep))
        if idx < last_part_idx:
            results.append(any_sep)
    res = "".join(results)
    # (?s:...) makes "." match newlines too; \Z anchors at end of string.
    return rf"(?s:{res})\Z"
venv/lib/python3.10/site-packages/httpcore-1.0.9.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
venv/lib/python3.10/site-packages/httpcore-1.0.9.dist-info/METADATA ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: httpcore
3
+ Version: 1.0.9
4
+ Summary: A minimal low-level HTTP client.
5
+ Project-URL: Documentation, https://www.encode.io/httpcore
6
+ Project-URL: Homepage, https://www.encode.io/httpcore/
7
+ Project-URL: Source, https://github.com/encode/httpcore
8
+ Author-email: Tom Christie <tom@tomchristie.com>
9
+ License-Expression: BSD-3-Clause
10
+ License-File: LICENSE.md
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Environment :: Web Environment
13
+ Classifier: Framework :: AsyncIO
14
+ Classifier: Framework :: Trio
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: BSD License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3 :: Only
20
+ Classifier: Programming Language :: Python :: 3.8
21
+ Classifier: Programming Language :: Python :: 3.9
22
+ Classifier: Programming Language :: Python :: 3.10
23
+ Classifier: Programming Language :: Python :: 3.11
24
+ Classifier: Programming Language :: Python :: 3.12
25
+ Classifier: Topic :: Internet :: WWW/HTTP
26
+ Requires-Python: >=3.8
27
+ Requires-Dist: certifi
28
+ Requires-Dist: h11>=0.16
29
+ Provides-Extra: asyncio
30
+ Requires-Dist: anyio<5.0,>=4.0; extra == 'asyncio'
31
+ Provides-Extra: http2
32
+ Requires-Dist: h2<5,>=3; extra == 'http2'
33
+ Provides-Extra: socks
34
+ Requires-Dist: socksio==1.*; extra == 'socks'
35
+ Provides-Extra: trio
36
+ Requires-Dist: trio<1.0,>=0.22.0; extra == 'trio'
37
+ Description-Content-Type: text/markdown
38
+
39
+ # HTTP Core
40
+
41
+ [![Test Suite](https://github.com/encode/httpcore/workflows/Test%20Suite/badge.svg)](https://github.com/encode/httpcore/actions)
42
+ [![Package version](https://badge.fury.io/py/httpcore.svg)](https://pypi.org/project/httpcore/)
43
+
44
+ > *Do one thing, and do it well.*
45
+
46
+ The HTTP Core package provides a minimal low-level HTTP client, which does
47
+ one thing only. Sending HTTP requests.
48
+
49
+ It does not provide any high level model abstractions over the API,
50
+ does not handle redirects, multipart uploads, building authentication headers,
51
+ transparent HTTP caching, URL parsing, session cookie handling,
52
+ content or charset decoding, handling JSON, environment based configuration
53
+ defaults, or any of that Jazz.
54
+
55
+ Some things HTTP Core does do:
56
+
57
+ * Sending HTTP requests.
58
+ * Thread-safe / task-safe connection pooling.
59
+ * HTTP(S) proxy & SOCKS proxy support.
60
+ * Supports HTTP/1.1 and HTTP/2.
61
+ * Provides both sync and async interfaces.
62
+ * Async backend support for `asyncio` and `trio`.
63
+
64
+ ## Requirements
65
+
66
+ Python 3.8+
67
+
68
+ ## Installation
69
+
70
+ For HTTP/1.1 only support, install with:
71
+
72
+ ```shell
73
+ $ pip install httpcore
74
+ ```
75
+
76
+ There are also a number of optional extras available...
77
+
78
+ ```shell
79
+ $ pip install httpcore['asyncio,trio,http2,socks']
80
+ ```
81
+
82
+ ## Sending requests
83
+
84
+ Send an HTTP request:
85
+
86
+ ```python
87
+ import httpcore
88
+
89
+ response = httpcore.request("GET", "https://www.example.com/")
90
+
91
+ print(response)
92
+ # <Response [200]>
93
+ print(response.status)
94
+ # 200
95
+ print(response.headers)
96
+ # [(b'Accept-Ranges', b'bytes'), (b'Age', b'557328'), (b'Cache-Control', b'max-age=604800'), ...]
97
+ print(response.content)
98
+ # b'<!doctype html>\n<html>\n<head>\n<title>Example Domain</title>\n\n<meta charset="utf-8"/>\n ...'
99
+ ```
100
+
101
+ The top-level `httpcore.request()` function is provided for convenience. In practice whenever you're working with `httpcore` you'll want to use the connection pooling functionality that it provides.
102
+
103
+ ```python
104
+ import httpcore
105
+
106
+ http = httpcore.ConnectionPool()
107
+ response = http.request("GET", "https://www.example.com/")
108
+ ```
109
+
110
+ Once you're ready to get going, [head over to the documentation](https://www.encode.io/httpcore/).
111
+
112
+ ## Motivation
113
+
114
+ You *probably* don't want to be using HTTP Core directly. It might make sense if
115
+ you're writing something like a proxy service in Python, and you just want
116
+ something at the lowest possible level, but more typically you'll want to use
117
+ a higher level client library, such as `httpx`.
118
+
119
+ The motivation for `httpcore` is:
120
+
121
+ * To provide a reusable low-level client library, that other packages can then build on top of.
122
+ * To provide a *really clear interface split* between the networking code and client logic,
123
+ so that each is easier to understand and reason about in isolation.
124
+
125
+ ## Dependencies
126
+
127
+ The `httpcore` package has the following dependencies...
128
+
129
+ * `h11`
130
+ * `certifi`
131
+
132
+ And the following optional extras...
133
+
134
+ * `anyio` - Required by `pip install httpcore['asyncio']`.
135
+ * `trio` - Required by `pip install httpcore['trio']`.
136
+ * `h2` - Required by `pip install httpcore['http2']`.
137
+ * `socksio` - Required by `pip install httpcore['socks']`.
138
+
139
+ ## Versioning
140
+
141
+ We use [SEMVER for our versioning policy](https://semver.org/).
142
+
143
+ For changes between package versions please see our [project changelog](CHANGELOG.md).
144
+
145
+ We recommend pinning your requirements either the most current major version, or a more specific version range:
146
+
147
+ ```python
148
+ pip install 'httpcore==1.*'
149
+ ```
150
+ # Changelog
151
+
152
+ All notable changes to this project will be documented in this file.
153
+
154
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
155
+
156
+ ## Version 1.0.9 (April 24th, 2025)
157
+
158
+ - Resolve https://github.com/advisories/GHSA-vqfr-h8mv-ghfj with h11 dependency update. (#1008)
159
+
160
+ ## Version 1.0.8 (April 11th, 2025)
161
+
162
+ - Fix `AttributeError` when importing on Python 3.14. (#1005)
163
+
164
+ ## Version 1.0.7 (November 15th, 2024)
165
+
166
+ - Support `proxy=…` configuration on `ConnectionPool()`. (#974)
167
+
168
+ ## Version 1.0.6 (October 1st, 2024)
169
+
170
+ - Relax `trio` dependency pinning. (#956)
171
+ - Handle `trio` raising `NotImplementedError` on unsupported platforms. (#955)
172
+ - Handle mapping `ssl.SSLError` to `httpcore.ConnectError`. (#918)
173
+
174
+ ## 1.0.5 (March 27th, 2024)
175
+
176
+ - Handle `EndOfStream` exception for anyio backend. (#899)
177
+ - Allow trio `0.25.*` series in package dependencies. (#903)
178
+
179
+ ## 1.0.4 (February 21st, 2024)
180
+
181
+ - Add `target` request extension. (#888)
182
+ - Fix support for connection `Upgrade` and `CONNECT` when some data in the stream has been read. (#882)
183
+
184
+ ## 1.0.3 (February 13th, 2024)
185
+
186
+ - Fix support for async cancellations. (#880)
187
+ - Fix trace extension when used with socks proxy. (#849)
188
+ - Fix SSL context for connections using the "wss" scheme (#869)
189
+
190
+ ## 1.0.2 (November 10th, 2023)
191
+
192
+ - Fix `float("inf")` timeouts in `Event.wait` function. (#846)
193
+
194
+ ## 1.0.1 (November 3rd, 2023)
195
+
196
+ - Fix pool timeout to account for the total time spent retrying. (#823)
197
+ - Raise a neater RuntimeError when the correct async deps are not installed. (#826)
198
+ - Add support for synchronous TLS-in-TLS streams. (#840)
199
+
200
+ ## 1.0.0 (October 6th, 2023)
201
+
202
+ From version 1.0 our async support is now optional, as the package has minimal dependencies by default.
203
+
204
+ For async support use either `pip install 'httpcore[asyncio]'` or `pip install 'httpcore[trio]'`.
205
+
206
+ The project versioning policy is now explicitly governed by SEMVER. See https://semver.org/.
207
+
208
+ - Async support becomes fully optional. (#809)
209
+ - Add support for Python 3.12. (#807)
210
+
211
+ ## 0.18.0 (September 8th, 2023)
212
+
213
+ - Add support for HTTPS proxies. (#745, #786)
214
+ - Drop Python 3.7 support. (#727)
215
+ - Handle `sni_hostname` extension with SOCKS proxy. (#774)
216
+ - Handle HTTP/1.1 half-closed connections gracefully. (#641)
217
+ - Change the type of `Extensions` from `Mapping[Str, Any]` to `MutableMapping[Str, Any]`. (#762)
218
+
219
+ ## 0.17.3 (July 5th, 2023)
220
+
221
+ - Support async cancellations, ensuring that the connection pool is left in a clean state when cancellations occur. (#726)
222
+ - The networking backend interface has [been added to the public API](https://www.encode.io/httpcore/network-backends). Some classes which were previously private implementation detail are now part of the top-level public API. (#699)
223
+ - Graceful handling of HTTP/2 GoAway frames, with requests being transparently retried on a new connection. (#730)
224
+ - Add exceptions when a synchronous `trace callback` is passed to an asynchronous request or an asynchronous `trace callback` is passed to a synchronous request. (#717)
225
+ - Drop Python 3.7 support. (#727)
226
+
227
+ ## 0.17.2 (May 23rd, 2023)
228
+
229
+ - Add `socket_options` argument to `ConnectionPool` and `HTTProxy` classes. (#668)
230
+ - Improve logging with per-module logger names. (#690)
231
+ - Add `sni_hostname` request extension. (#696)
232
+ - Resolve race condition during import of `anyio` package. (#692)
233
+ - Enable TCP_NODELAY for all synchronous sockets. (#651)
234
+
235
+ ## 0.17.1 (May 17th, 2023)
236
+
237
+ - If 'retries' is set, then allow retries if an SSL handshake error occurs. (#669)
238
+ - Improve correctness of tracebacks on network exceptions, by raising properly chained exceptions. (#678)
239
+ - Prevent connection-hanging behaviour when HTTP/2 connections are closed by a server-sent 'GoAway' frame. (#679)
240
+ - Fix edge-case exception when removing requests from the connection pool. (#680)
241
+ - Fix pool timeout edge-case. (#688)
242
+
243
+ ## 0.17.0 (March 16th, 2023)
244
+
245
+ - Add DEBUG level logging. (#648)
246
+ - Respect HTTP/2 max concurrent streams when settings updates are sent by server. (#652)
247
+ - Increase the allowable HTTP header size to 100kB. (#647)
248
+ - Add `retries` option to SOCKS proxy classes. (#643)
249
+
250
+ ## 0.16.3 (December 20th, 2022)
251
+
252
+ - Allow `ws` and `wss` schemes. Allows us to properly support websocket upgrade connections. (#625)
253
+ - Forwarding HTTP proxies use a connection-per-remote-host. Required by some proxy implementations. (#637)
254
+ - Don't raise `RuntimeError` when closing a connection pool with active connections. Removes some error cases when cancellations are used. (#631)
255
+ - Lazy import `anyio`, so that it's no longer a hard dependency, and isn't imported if unused. (#639)
256
+
257
+ ## 0.16.2 (November 25th, 2022)
258
+
259
+ - Revert 'Fix async cancellation behaviour', which introduced race conditions. (#627)
260
+ - Raise `RuntimeError` if attempting to use UNIX domain sockets on Windows. (#619)
261
+
262
+ ## 0.16.1 (November 17th, 2022)
263
+
264
+ - Fix HTTP/1.1 interim informational responses, such as "100 Continue". (#605)
265
+
266
+ ## 0.16.0 (October 11th, 2022)
267
+
268
+ - Support HTTP/1.1 informational responses. (#581)
269
+ - Fix async cancellation behaviour. (#580)
270
+ - Support `h11` 0.14. (#579)
271
+
272
+ ## 0.15.0 (May 17th, 2022)
273
+
274
+ - Drop Python 3.6 support (#535)
275
+ - Ensure HTTP proxy CONNECT requests include `timeout` configuration. (#506)
276
+ - Switch to explicit `typing.Optional` for type hints. (#513)
277
+ - For `trio` map OSError exceptions to `ConnectError`. (#543)
278
+
279
+ ## 0.14.7 (February 4th, 2022)
280
+
281
+ - Requests which raise a PoolTimeout need to be removed from the pool queue. (#502)
282
+ - Fix AttributeError that happened when Socks5Connection were terminated. (#501)
283
+
284
+ ## 0.14.6 (February 1st, 2022)
285
+
286
+ - Fix SOCKS support for `http://` URLs. (#492)
287
+ - Resolve race condition around exceptions during streaming a response. (#491)
288
+
289
+ ## 0.14.5 (January 18th, 2022)
290
+
291
+ - SOCKS proxy support. (#478)
292
+ - Add proxy_auth argument to HTTPProxy. (#481)
293
+ - Improve error message on 'RemoteProtocolError' exception when server disconnects without sending a response. (#479)
294
+
295
+ ## 0.14.4 (January 5th, 2022)
296
+
297
+ - Support HTTP/2 on HTTPS tunnelling proxies. (#468)
298
+ - Fix proxy headers missing on HTTP forwarding. (#456)
299
+ - Only instantiate SSL context if required. (#457)
300
+ - More robust HTTP/2 handling. (#253, #439, #440, #441)
301
+
302
+ ## 0.14.3 (November 17th, 2021)
303
+
304
+ - Fix race condition when removing closed connections from the pool. (#437)
305
+
306
+ ## 0.14.2 (November 16th, 2021)
307
+
308
+ - Failed connections no longer remain in the pool. (Pull #433)
309
+
310
+ ## 0.14.1 (November 12th, 2021)
311
+
312
+ - `max_connections` becomes optional. (Pull #429)
313
+ - `certifi` is now included in the install dependencies. (Pull #428)
314
+ - `h2` is now strictly optional. (Pull #428)
315
+
316
+ ## 0.14.0 (November 11th, 2021)
317
+
318
+ The 0.14 release is a complete reworking of `httpcore`, comprehensively addressing some underlying issues in the connection pooling, as well as substantially redesigning the API to be more user friendly.
319
+
320
+ Some of the lower-level API design also makes the components more easily testable in isolation, and the package now has 100% test coverage.
321
+
322
+ See [discussion #419](https://github.com/encode/httpcore/discussions/419) for a little more background.
323
+
324
+ There's some other neat bits in there too, such as the "trace" extension, which gives a hook into inspecting the internal events that occur during the request/response cycle. This extension is needed for the HTTPX cli, in order to...
325
+
326
+ * Log the point at which the connection is established, and the IP/port on which it is made.
327
+ * Determine if the outgoing request should log as HTTP/1.1 or HTTP/2, rather than having to assume it's HTTP/2 if the --http2 flag was passed. (Which may not actually be true.)
328
+ * Log SSL version info / certificate info.
329
+
330
+ Note that `curio` support is not currently available in 0.14.0. If you're using `httpcore` with `curio` please get in touch, so we can assess if we ought to prioritize it as a feature or not.
331
+
332
+ ## 0.13.7 (September 13th, 2021)
333
+
334
+ - Fix broken error messaging when URL scheme is missing, or a non HTTP(S) scheme is used. (Pull #403)
335
+
336
+ ## 0.13.6 (June 15th, 2021)
337
+
338
+ ### Fixed
339
+
340
+ - Close sockets when read or write timeouts occur. (Pull #365)
341
+
342
+ ## 0.13.5 (June 14th, 2021)
343
+
344
+ ### Fixed
345
+
346
+ - Resolved niggles with AnyIO EOF behaviours. (Pull #358, #362)
347
+
348
+ ## 0.13.4 (June 9th, 2021)
349
+
350
+ ### Added
351
+
352
+ - Improved error messaging when URL scheme is missing, or a non HTTP(S) scheme is used. (Pull #354)
353
+
354
+ ### Fixed
355
+
356
+ - Switched to `anyio` as the default backend implementation when running with `asyncio`. Resolves some awkward [TLS timeout issues](https://github.com/encode/httpx/discussions/1511).
357
+
358
+ ## 0.13.3 (May 6th, 2021)
359
+
360
+ ### Added
361
+
362
+ - Support HTTP/2 prior knowledge, using `httpcore.SyncConnectionPool(http1=False)`. (Pull #333)
363
+
364
+ ### Fixed
365
+
366
+ - Handle cases where environment does not provide `select.poll` support. (Pull #331)
367
+
368
+ ## 0.13.2 (April 29th, 2021)
369
+
370
+ ### Added
371
+
372
+ - Improve error message for specific case of `RemoteProtocolError` where server disconnects without sending a response. (Pull #313)
373
+
374
+ ## 0.13.1 (April 28th, 2021)
375
+
376
+ ### Fixed
377
+
378
+ - More resiliant testing for closed connections. (Pull #311)
379
+ - Don't raise exceptions on ungraceful connection closes. (Pull #310)
380
+
381
+ ## 0.13.0 (April 21st, 2021)
382
+
383
+ The 0.13 release updates the core API in order to match the HTTPX Transport API,
384
+ introduced in HTTPX 0.18 onwards.
385
+
386
+ An example of making requests with the new interface is:
387
+
388
+ ```python
389
+ with httpcore.SyncConnectionPool() as http:
390
+ status_code, headers, stream, extensions = http.handle_request(
391
+ method=b'GET',
392
+ url=(b'https', b'example.org', 443, b'/'),
393
+ headers=[(b'host', b'example.org'), (b'user-agent', b'httpcore')]
394
+ stream=httpcore.ByteStream(b''),
395
+ extensions={}
396
+ )
397
+ body = stream.read()
398
+ print(status_code, body)
399
+ ```
400
+
401
+ ### Changed
402
+
403
+ - The `.request()` method is now `handle_request()`. (Pull #296)
404
+ - The `.arequest()` method is now `.handle_async_request()`. (Pull #296)
405
+ - The `headers` argument is no longer optional. (Pull #296)
406
+ - The `stream` argument is no longer optional. (Pull #296)
407
+ - The `ext` argument is now named `extensions`, and is no longer optional. (Pull #296)
408
+ - The `"reason"` extension keyword is now named `"reason_phrase"`. (Pull #296)
409
+ - The `"reason_phrase"` and `"http_version"` extensions now use byte strings for their values. (Pull #296)
410
+ - The `httpcore.PlainByteStream()` class becomes `httpcore.ByteStream()`. (Pull #296)
411
+
412
+ ### Added
413
+
414
+ - Streams now support a `.read()` interface. (Pull #296)
415
+
416
+ ### Fixed
417
+
418
+ - Task cancellation no longer leaks connections from the connection pool. (Pull #305)
419
+
420
+ ## 0.12.3 (December 7th, 2020)
421
+
422
+ ### Fixed
423
+
424
+ - Abort SSL connections on close rather than waiting for remote EOF when using `asyncio`. (Pull #167)
425
+ - Fix exception raised in case of connect timeouts when using the `anyio` backend. (Pull #236)
426
+ - Fix `Host` header precedence for `:authority` in HTTP/2. (Pull #241, #243)
427
+ - Handle extra edge case when detecting for socket readability when using `asyncio`. (Pull #242, #244)
428
+ - Fix `asyncio` SSL warning when using proxy tunneling. (Pull #249)
429
+
430
+ ## 0.12.2 (November 20th, 2020)
431
+
432
+ ### Fixed
433
+
434
+ - Properly wrap connect errors on the asyncio backend. (Pull #235)
435
+ - Fix `ImportError` occurring on Python 3.9 when using the HTTP/1.1 sync client in a multithreaded context. (Pull #237)
436
+
437
+ ## 0.12.1 (November 7th, 2020)
438
+
439
+ ### Added
440
+
441
+ - Add connect retries. (Pull #221)
442
+
443
+ ### Fixed
444
+
445
+ - Tweak detection of dropped connections, resolving an issue with open files limits on Linux. (Pull #185)
446
+ - Avoid leaking connections when establishing an HTTP tunnel to a proxy has failed. (Pull #223)
447
+ - Properly wrap OS errors when using `trio`. (Pull #225)
448
+
449
+ ## 0.12.0 (October 6th, 2020)
450
+
451
+ ### Changed
452
+
453
+ - HTTP header casing is now preserved, rather than always sent in lowercase. (#216 and python-hyper/h11#104)
454
+
455
+ ### Added
456
+
457
+ - Add Python 3.9 to officially supported versions.
458
+
459
+ ### Fixed
460
+
461
+ - Gracefully handle a stdlib asyncio bug when a connection is closed while it is in a paused-for-reading state. (#201)
462
+
463
+ ## 0.11.1 (September 28nd, 2020)
464
+
465
+ ### Fixed
466
+
467
+ - Add await to async semaphore release() coroutine (#197)
468
+ - Drop incorrect curio classifier (#192)
469
+
470
+ ## 0.11.0 (September 22nd, 2020)
471
+
472
+ The Transport API with 0.11.0 has a couple of significant changes.
473
+
474
+ Firstly we've moved changed the request interface in order to allow extensions, which will later enable us to support features
475
+ such as trailing headers, HTTP/2 server push, and CONNECT/Upgrade connections.
476
+
477
+ The interface changes from:
478
+
479
+ ```python
480
+ def request(method, url, headers, stream, timeout):
481
+ return (http_version, status_code, reason, headers, stream)
482
+ ```
483
+
484
+ To instead including an optional dictionary of extensions on the request and response:
485
+
486
+ ```python
487
+ def request(method, url, headers, stream, ext):
488
+ return (status_code, headers, stream, ext)
489
+ ```
490
+
491
+ Having an open-ended extensions point will allow us to add later support for various optional features, that wouldn't otherwise be supported without these API changes.
492
+
493
+ In particular:
494
+
495
+ * Trailing headers support.
496
+ * HTTP/2 Server Push
497
+ * sendfile.
498
+ * Exposing raw connection on CONNECT, Upgrade, HTTP/2 bi-di streaming.
499
+ * Exposing debug information out of the API, including template name, template context.
500
+
501
+ Currently extensions are limited to:
502
+
503
+ * request: `timeout` - Optional. Timeout dictionary.
504
+ * response: `http_version` - Optional. Include the HTTP version used on the response.
505
+ * response: `reason` - Optional. Include the reason phrase used on the response. Only valid with HTTP/1.*.
506
+
507
+ See https://github.com/encode/httpx/issues/1274#issuecomment-694884553 for the history behind this.
508
+
509
+ Secondly, the async version of `request` is now namespaced as `arequest`.
510
+
511
+ This allows concrete transports to support both sync and async implementations on the same class.
512
+
513
+ ### Added
514
+
515
+ - Add curio support. (Pull #168)
516
+ - Add anyio support, with `backend="anyio"`. (Pull #169)
517
+
518
+ ### Changed
519
+
520
+ - Update the Transport API to use 'ext' for optional extensions. (Pull #190)
521
+ - Update the Transport API to use `.request` and `.arequest` so implementations can support both sync and async. (Pull #189)
522
+
523
+ ## 0.10.2 (August 20th, 2020)
524
+
525
+ ### Added
526
+
527
+ - Added Unix Domain Socket support. (Pull #139)
528
+
529
+ ### Fixed
530
+
531
+ - Always include the port on proxy CONNECT requests. (Pull #154)
532
+ - Fix `max_keepalive_connections` configuration. (Pull #153)
533
+ - Fixes behaviour in HTTP/1.1 where server disconnects can be used to signal the end of the response body. (Pull #164)
534
+
535
+ ## 0.10.1 (August 7th, 2020)
536
+
537
+ - Include `max_keepalive_connections` on `AsyncHTTPProxy`/`SyncHTTPProxy` classes.
538
+
539
+ ## 0.10.0 (August 7th, 2020)
540
+
541
+ The most notable change in the 0.10.0 release is that HTTP/2 support is now fully optional.
542
+
543
+ Use either `pip install httpcore` for HTTP/1.1 support only, or `pip install httpcore[http2]` for HTTP/1.1 and HTTP/2 support.
544
+
545
+ ### Added
546
+
547
+ - HTTP/2 support becomes optional. (Pull #121, #130)
548
+ - Add `local_address=...` support. (Pull #100, #134)
549
+ - Add `PlainByteStream`, `IteratorByteStream`, `AsyncIteratorByteStream`. The `AsyncByteSteam` and `SyncByteStream` classes are now pure interface classes. (#133)
550
+ - Add `LocalProtocolError`, `RemoteProtocolError` exceptions. (Pull #129)
551
+ - Add `UnsupportedProtocol` exception. (Pull #128)
552
+ - Add `.get_connection_info()` method. (Pull #102, #137)
553
+ - Add better TRACE logs. (Pull #101)
554
+
555
+ ### Changed
556
+
557
+ - `max_keepalive` is deprecated in favour of `max_keepalive_connections`. (Pull #140)
558
+
559
+ ### Fixed
560
+
561
+ - Improve handling of server disconnects. (Pull #112)
562
+
563
+ ## 0.9.1 (May 27th, 2020)
564
+
565
+ ### Fixed
566
+
567
+ - Proper host resolution for sync case, including IPv6 support. (Pull #97)
568
+ - Close outstanding connections when connection pool is closed. (Pull #98)
569
+
570
+ ## 0.9.0 (May 21th, 2020)
571
+
572
+ ### Changed
573
+
574
+ - URL port becomes an `Optional[int]` instead of `int`. (Pull #92)
575
+
576
+ ### Fixed
577
+
578
+ - Honor HTTP/2 max concurrent streams settings. (Pull #89, #90)
579
+ - Remove incorrect debug log. (Pull #83)
580
+
581
+ ## 0.8.4 (May 11th, 2020)
582
+
583
+ ### Added
584
+
585
+ - Logging via HTTPCORE_LOG_LEVEL and HTTPX_LOG_LEVEL environment variables
586
+ and TRACE level logging. (Pull #79)
587
+
588
+ ### Fixed
589
+
590
+ - Reuse of connections on HTTP/2 in close concurrency situations. (Pull #81)
591
+
592
+ ## 0.8.3 (May 6rd, 2020)
593
+
594
+ ### Fixed
595
+
596
+ - Include `Host` and `Accept` headers on proxy "CONNECT" requests.
597
+ - De-duplicate any headers also contained in proxy_headers.
598
+ - HTTP/2 flag not being passed down to proxy connections.
599
+
600
+ ## 0.8.2 (May 3rd, 2020)
601
+
602
+ ### Fixed
603
+
604
+ - Fix connections using proxy forwarding requests not being added to the
605
+ connection pool properly. (Pull #70)
606
+
607
+ ## 0.8.1 (April 30th, 2020)
608
+
609
+ ### Changed
610
+
611
+ - Allow inherintance of both `httpcore.AsyncByteStream`, `httpcore.SyncByteStream` without type conflicts.
612
+
613
+ ## 0.8.0 (April 30th, 2020)
614
+
615
+ ### Fixed
616
+
617
+ - Fixed tunnel proxy support.
618
+
619
+ ### Added
620
+
621
+ - New `TimeoutException` base class.
622
+
623
+ ## 0.7.0 (March 5th, 2020)
624
+
625
+ - First integration with HTTPX.
venv/lib/python3.10/site-packages/httpcore-1.0.9.dist-info/RECORD ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ httpcore-1.0.9.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ httpcore-1.0.9.dist-info/METADATA,sha256=_i1P2mGZEol4d54M8n88BFxTGGP83Zh-rMdPOhjUHCE,21529
3
+ httpcore-1.0.9.dist-info/RECORD,,
4
+ httpcore-1.0.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
5
+ httpcore-1.0.9.dist-info/licenses/LICENSE.md,sha256=_ctZFUx0y6uhahEkL3dAvqnyPW_rVUeRfYxflKgDkqU,1518
6
+ httpcore/__init__.py,sha256=9kT_kqChCCJUTHww24ZmR_ezcdbpRYWksD-gYNzkZP8,3445
7
+ httpcore/__pycache__/__init__.cpython-310.pyc,,
8
+ httpcore/__pycache__/_api.cpython-310.pyc,,
9
+ httpcore/__pycache__/_exceptions.cpython-310.pyc,,
10
+ httpcore/__pycache__/_models.cpython-310.pyc,,
11
+ httpcore/__pycache__/_ssl.cpython-310.pyc,,
12
+ httpcore/__pycache__/_synchronization.cpython-310.pyc,,
13
+ httpcore/__pycache__/_trace.cpython-310.pyc,,
14
+ httpcore/__pycache__/_utils.cpython-310.pyc,,
15
+ httpcore/_api.py,sha256=unZmeDschBWCGCPCwkS3Wot9euK6bg_kKxLtGTxw214,3146
16
+ httpcore/_async/__init__.py,sha256=EWdl2v4thnAHzJpqjU4h2a8DUiGAvNiWrkii9pfhTf0,1221
17
+ httpcore/_async/__pycache__/__init__.cpython-310.pyc,,
18
+ httpcore/_async/__pycache__/connection.cpython-310.pyc,,
19
+ httpcore/_async/__pycache__/connection_pool.cpython-310.pyc,,
20
+ httpcore/_async/__pycache__/http11.cpython-310.pyc,,
21
+ httpcore/_async/__pycache__/http2.cpython-310.pyc,,
22
+ httpcore/_async/__pycache__/http_proxy.cpython-310.pyc,,
23
+ httpcore/_async/__pycache__/interfaces.cpython-310.pyc,,
24
+ httpcore/_async/__pycache__/socks_proxy.cpython-310.pyc,,
25
+ httpcore/_async/connection.py,sha256=6OcPXqMEfc0BU38_-iHUNDd1vKSTc2UVT09XqNb_BOk,8449
26
+ httpcore/_async/connection_pool.py,sha256=DOIQ2s2ZCf9qfwxhzMprTPLqCL8OxGXiKF6qRHxvVyY,17307
27
+ httpcore/_async/http11.py,sha256=-qM9bV7PjSQF5vxs37-eUXOIFwbIjPcZbNliuX9TtBw,13880
28
+ httpcore/_async/http2.py,sha256=azX1fcmtXaIwjputFlZ4vd92J8xwjGOa9ax9QIv4394,23936
29
+ httpcore/_async/http_proxy.py,sha256=2zVkrlv-Ds-rWGaqaXlrhEJiAQFPo23BT3Gq_sWoBXU,14701
30
+ httpcore/_async/interfaces.py,sha256=jTiaWL83pgpGC9ziv90ZfwaKNMmHwmOalzaKiuTxATo,4455
31
+ httpcore/_async/socks_proxy.py,sha256=lLKgLlggPfhFlqi0ODeBkOWvt9CghBBUyqsnsU1tx6Q,13841
32
+ httpcore/_backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
+ httpcore/_backends/__pycache__/__init__.cpython-310.pyc,,
34
+ httpcore/_backends/__pycache__/anyio.cpython-310.pyc,,
35
+ httpcore/_backends/__pycache__/auto.cpython-310.pyc,,
36
+ httpcore/_backends/__pycache__/base.cpython-310.pyc,,
37
+ httpcore/_backends/__pycache__/mock.cpython-310.pyc,,
38
+ httpcore/_backends/__pycache__/sync.cpython-310.pyc,,
39
+ httpcore/_backends/__pycache__/trio.cpython-310.pyc,,
40
+ httpcore/_backends/anyio.py,sha256=x8PgEhXRC8bVqsdzk_YJx8Y6d9Tub06CuUSwnbmtqoY,5252
41
+ httpcore/_backends/auto.py,sha256=zO136PKZmsaTDK-HRk84eA-MUg8_2wJf4NvmK432Aio,1662
42
+ httpcore/_backends/base.py,sha256=aShgRdZnMmRhFWHetjumlM73f8Kz1YOAyCUP_4kHslA,3042
43
+ httpcore/_backends/mock.py,sha256=er9T436uSe7NLrfiLa4x6Nuqg5ivQ693CxWYCWsgbH4,4077
44
+ httpcore/_backends/sync.py,sha256=bhE4d9iK9Umxdsdsgm2EfKnXaBms2WggGYU-7jmUujU,7977
45
+ httpcore/_backends/trio.py,sha256=LHu4_Mr5MswQmmT3yE4oLgf9b_JJfeVS4BjDxeJc7Ro,5996
46
+ httpcore/_exceptions.py,sha256=looCKga3_YVYu3s-d3L9RMPRJyhsY7fiuuGxvkOD0c0,1184
47
+ httpcore/_models.py,sha256=IO2CcXcdpovRcLTdGFGB6RyBZdEm2h_TOmoCc4rEKho,17623
48
+ httpcore/_ssl.py,sha256=srqmSNU4iOUvWF-SrJvb8G_YEbHFELOXQOwdDIBTS9c,187
49
+ httpcore/_sync/__init__.py,sha256=JBDIgXt5la1LCJ1sLQeKhjKFpLnpNr8Svs6z2ni3fgg,1141
50
+ httpcore/_sync/__pycache__/__init__.cpython-310.pyc,,
51
+ httpcore/_sync/__pycache__/connection.cpython-310.pyc,,
52
+ httpcore/_sync/__pycache__/connection_pool.cpython-310.pyc,,
53
+ httpcore/_sync/__pycache__/http11.cpython-310.pyc,,
54
+ httpcore/_sync/__pycache__/http2.cpython-310.pyc,,
55
+ httpcore/_sync/__pycache__/http_proxy.cpython-310.pyc,,
56
+ httpcore/_sync/__pycache__/interfaces.cpython-310.pyc,,
57
+ httpcore/_sync/__pycache__/socks_proxy.cpython-310.pyc,,
58
+ httpcore/_sync/connection.py,sha256=9exGOb3PB-Mp2T1-sckSeL2t-tJ_9-NXomV8ihmWCgU,8238
59
+ httpcore/_sync/connection_pool.py,sha256=a-T8LTsUxc7r0Ww1atfHSDoWPjQ0fA8Ul7S3-F0Mj70,16955
60
+ httpcore/_sync/http11.py,sha256=IFobD1Md5JFlJGKWnh1_Q3epikUryI8qo09v8MiJIEA,13476
61
+ httpcore/_sync/http2.py,sha256=AxU4yhcq68Bn5vqdJYtiXKYUj7nvhYbxz3v4rT4xnvA,23400
62
+ httpcore/_sync/http_proxy.py,sha256=_al_6crKuEZu2wyvu493RZImJdBJnj5oGKNjLOJL2Zo,14463
63
+ httpcore/_sync/interfaces.py,sha256=snXON42vUDHO5JBJvo8D4VWk2Wat44z2OXXHDrjbl94,4344
64
+ httpcore/_sync/socks_proxy.py,sha256=zegZW9Snqj2_992DFJa8_CppOVBkVL4AgwduRkStakQ,13614
65
+ httpcore/_synchronization.py,sha256=zSi13mAColBnknjZBknUC6hKNDQT4C6ijnezZ-r0T2s,9434
66
+ httpcore/_trace.py,sha256=ck6ZoIzYTkdNAIfq5MGeKqBXDtqjOX-qfYwmZFbrGco,3952
67
+ httpcore/_utils.py,sha256=_RLgXYOAYC350ikALV59GZ68IJrdocRZxPs9PjmzdFY,1537
68
+ httpcore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
venv/lib/python3.10/site-packages/httpcore-1.0.9.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.27.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
venv/lib/python3.10/site-packages/httpcore-1.0.9.dist-info/licenses/LICENSE.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright © 2020, [Encode OSS Ltd](https://www.encode.io/).
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ * Redistributions of source code must retain the above copyright notice, this
8
+ list of conditions and the following disclaimer.
9
+
10
+ * Redistributions in binary form must reproduce the above copyright notice,
11
+ this list of conditions and the following disclaimer in the documentation
12
+ and/or other materials provided with the distribution.
13
+
14
+ * Neither the name of the copyright holder nor the names of its
15
+ contributors may be used to endorse or promote products derived from
16
+ this software without specific prior written permission.
17
+
18
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.