BryanW committed on
Commit
24c31ad
·
verified ·
1 Parent(s): 0eaabec

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/__init__.cpython-312.pyc +0 -0
  2. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/_version.cpython-312.pyc +0 -0
  3. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/archive.cpython-312.pyc +0 -0
  4. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/asyn.cpython-312.pyc +0 -0
  5. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/caching.cpython-312.pyc +0 -0
  6. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/callbacks.cpython-312.pyc +0 -0
  7. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/compression.cpython-312.pyc +0 -0
  8. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/config.cpython-312.pyc +0 -0
  9. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/conftest.cpython-312.pyc +0 -0
  10. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/dircache.cpython-312.pyc +0 -0
  11. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/exceptions.cpython-312.pyc +0 -0
  12. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/fuse.cpython-312.pyc +0 -0
  13. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/generic.cpython-312.pyc +0 -0
  14. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/gui.cpython-312.pyc +0 -0
  15. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/mapping.cpython-312.pyc +0 -0
  16. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/parquet.cpython-312.pyc +0 -0
  17. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/registry.cpython-312.pyc +0 -0
  18. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/spec.cpython-312.pyc +0 -0
  19. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/transaction.cpython-312.pyc +0 -0
  20. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/utils.cpython-312.pyc +0 -0
  21. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/__init__.py +0 -0
  22. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/arrow.py +304 -0
  23. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_mapper.py +75 -0
  24. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_metadata.py +232 -0
  25. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cached.py +939 -0
  26. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dask.py +152 -0
  27. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/data.py +58 -0
  28. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dbfs.py +467 -0
  29. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dirfs.py +364 -0
  30. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/ftp.py +385 -0
  31. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/git.py +127 -0
  32. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/github.py +227 -0
  33. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/http.py +871 -0
  34. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/jupyter.py +124 -0
  35. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/libarchive.py +213 -0
  36. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/local.py +467 -0
  37. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/memory.py +303 -0
  38. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/reference.py +1160 -0
  39. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/sftp.py +180 -0
  40. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/smb.py +333 -0
  41. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py +124 -0
  42. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/webhdfs.py +484 -0
  43. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/zip.py +134 -0
  44. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/__init__.py +0 -0
  45. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/conftest.py +188 -0
  46. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_api.py +498 -0
  47. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_async.py +230 -0
  48. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_caches.py +255 -0
  49. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_callbacks.py +89 -0
  50. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_compression.py +164 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.86 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/_version.cpython-312.pyc ADDED
Binary file (617 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/archive.cpython-312.pyc ADDED
Binary file (4.2 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/asyn.cpython-312.pyc ADDED
Binary file (45.5 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/caching.cpython-312.pyc ADDED
Binary file (37.8 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/callbacks.cpython-312.pyc ADDED
Binary file (13 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/compression.cpython-312.pyc ADDED
Binary file (7.26 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/config.cpython-312.pyc ADDED
Binary file (6.06 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/conftest.cpython-312.pyc ADDED
Binary file (3.16 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/dircache.cpython-312.pyc ADDED
Binary file (4.56 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/exceptions.cpython-312.pyc ADDED
Binary file (868 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/fuse.cpython-312.pyc ADDED
Binary file (15.6 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/generic.cpython-312.pyc ADDED
Binary file (19.4 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/gui.cpython-312.pyc ADDED
Binary file (21.6 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/mapping.cpython-312.pyc ADDED
Binary file (12 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/parquet.cpython-312.pyc ADDED
Binary file (15.8 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/registry.cpython-312.pyc ADDED
Binary file (10.7 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/spec.cpython-312.pyc ADDED
Binary file (79.6 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/transaction.cpython-312.pyc ADDED
Binary file (4.63 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/utils.cpython-312.pyc ADDED
Binary file (28.1 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/arrow.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import errno
2
+ import io
3
+ import os
4
+ import secrets
5
+ import shutil
6
+ from contextlib import suppress
7
+ from functools import cached_property, wraps
8
+ from urllib.parse import parse_qs
9
+
10
+ from fsspec.spec import AbstractFileSystem
11
+ from fsspec.utils import (
12
+ get_package_version_without_import,
13
+ infer_storage_options,
14
+ mirror_from,
15
+ tokenize,
16
+ )
17
+
18
+
19
def wrap_exceptions(func):
    """Decorate ``func`` so "does not exist" OSErrors become FileNotFoundError.

    An ``OSError`` raised by ``func`` whose first argument is a string
    containing ``"does not exist"`` is re-raised as
    ``FileNotFoundError(errno.ENOENT, message)`` chained to the original
    exception. Every other ``OSError`` (including one with no arguments)
    propagates unchanged.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except OSError as err:
            # Only the first positional argument of the error is inspected.
            first = err.args[0] if err.args else None
            if isinstance(first, str) and "does not exist" in first:
                raise FileNotFoundError(errno.ENOENT, first) from err
            raise

    return wrapper
35
+
36
+
37
+ PYARROW_VERSION = None
38
+
39
+
40
class ArrowFSWrapper(AbstractFileSystem):
    """FSSpec-compatible wrapper of pyarrow.fs.FileSystem.

    Parameters
    ----------
    fs : pyarrow.fs.FileSystem

    """

    root_marker = "/"

    def __init__(self, fs, **kwargs):
        """Wrap ``fs`` and record the installed pyarrow version.

        The version is stored in the module-level ``PYARROW_VERSION`` so that
        ``_open`` can decide which keyword arguments to pass to pyarrow.
        """
        global PYARROW_VERSION
        PYARROW_VERSION = get_package_version_without_import("pyarrow")
        self.fs = fs
        super().__init__(**kwargs)

    @property
    def protocol(self):
        """Protocol name, taken from the wrapped filesystem's ``type_name``."""
        return self.fs.type_name

    @cached_property
    def fsid(self):
        """Identifier built from the wrapped filesystem's host and port."""
        return "hdfs_" + tokenize(self.fs.host, self.fs.port)

    @classmethod
    def _strip_protocol(cls, path):
        """Return the path component of ``path`` as given by
        ``infer_storage_options``."""
        ops = infer_storage_options(path)
        path = ops["path"]
        if path.startswith("//"):
            # special case for "hdfs://path" (without the triple slash)
            path = path[1:]
        return path

    def ls(self, path, detail=False, **kwargs):
        """List the entries selected by ``FileSelector(path)``.

        Returns a list of entry dicts when ``detail`` is true, otherwise a
        list of entry names.
        """
        path = self._strip_protocol(path)
        from pyarrow.fs import FileSelector

        entries = [
            self._make_entry(entry)
            for entry in self.fs.get_file_info(FileSelector(path))
        ]
        if detail:
            return entries
        else:
            return [entry["name"] for entry in entries]

    def info(self, path, **kwargs):
        """Return the entry dict for ``path``.

        Raises ``FileNotFoundError`` (via ``_make_entry``) when pyarrow
        reports the path as not found.
        """
        path = self._strip_protocol(path)
        [info] = self.fs.get_file_info([path])
        return self._make_entry(info)

    def exists(self, path):
        """Return True unless ``info(path)`` raises ``FileNotFoundError``."""
        path = self._strip_protocol(path)
        try:
            self.info(path)
        except FileNotFoundError:
            return False
        else:
            return True

    def _make_entry(self, info):
        """Convert a pyarrow file-info object into an fsspec entry dict.

        Raises
        ------
        FileNotFoundError
            If ``info.type`` is ``FileType.NotFound``.
        """
        from pyarrow.fs import FileType

        if info.type is FileType.Directory:
            kind = "directory"
        elif info.type is FileType.File:
            kind = "file"
        elif info.type is FileType.NotFound:
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
        else:
            kind = "other"

        return {
            "name": info.path,
            "size": info.size,
            "type": kind,
            "mtime": info.mtime,
        }

    @wrap_exceptions
    def cp_file(self, path1, path2, **kwargs):
        """Copy ``path1`` to ``path2`` through a temporary sibling file.

        The data is first written to ``<path2>.tmp.<random hex>`` and then
        moved onto ``path2``; on any failure the temporary file is removed
        (if it exists) and the error is re-raised.
        """
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")

        with self._open(path1, "rb") as lstream:
            tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
            try:
                with self.open(tmp_fname, "wb") as rstream:
                    shutil.copyfileobj(lstream, rstream)
                self.fs.move(tmp_fname, path2)
            except BaseException:  # noqa
                with suppress(FileNotFoundError):
                    self.fs.delete_file(tmp_fname)
                raise

    @wrap_exceptions
    def mv(self, path1, path2, **kwargs):
        """Move ``path1`` to ``path2`` using the wrapped filesystem."""
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")
        self.fs.move(path1, path2)

    @wrap_exceptions
    def rm_file(self, path):
        """Delete the single file at ``path``."""
        path = self._strip_protocol(path)
        self.fs.delete_file(path)

    @wrap_exceptions
    def rm(self, path, recursive=False, maxdepth=None):
        """Delete a file, or a directory when ``recursive`` is true.

        ``maxdepth`` is accepted but not used by this implementation.

        Raises
        ------
        ValueError
            If ``path`` is a directory and ``recursive`` is false.
        """
        path = self._strip_protocol(path).rstrip("/")
        if self.isdir(path):
            if recursive:
                self.fs.delete_dir(path)
            else:
                # The previous message said "recursive=False", which is the
                # very condition that triggers this error.
                raise ValueError("Can't delete directories without recursive=True")
        else:
            self.fs.delete_file(path)

    @wrap_exceptions
    def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
        """Open ``path`` with the matching pyarrow method and wrap the stream.

        ``"rb"`` uses ``open_input_file`` when ``seekable`` is true and
        ``open_input_stream`` otherwise; ``"wb"`` and ``"ab"`` use the output
        and append streams.

        Raises
        ------
        ValueError
            For any other mode.
        """
        if mode == "rb":
            if seekable:
                method = self.fs.open_input_file
            else:
                method = self.fs.open_input_stream
        elif mode == "wb":
            method = self.fs.open_output_stream
        elif mode == "ab":
            method = self.fs.open_append_stream
        else:
            raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")

        _kwargs = {}
        if mode != "rb" or not seekable:
            if int(PYARROW_VERSION.split(".")[0]) >= 4:
                # disable compression auto-detection
                _kwargs["compression"] = None
        stream = method(path, **_kwargs)

        return ArrowFile(self, stream, path, mode, block_size, **kwargs)

    @wrap_exceptions
    def mkdir(self, path, create_parents=True, **kwargs):
        """Create directory ``path``, including parents when requested."""
        path = self._strip_protocol(path)
        if create_parents:
            self.makedirs(path, exist_ok=True)
        else:
            self.fs.create_dir(path, recursive=False)

    @wrap_exceptions
    def makedirs(self, path, exist_ok=False):
        """Create ``path`` and any missing parents.

        NOTE(review): ``exist_ok`` is accepted but not consulted here.
        """
        path = self._strip_protocol(path)
        self.fs.create_dir(path, recursive=True)

    @wrap_exceptions
    def rmdir(self, path):
        """Delete the directory at ``path`` using ``delete_dir``."""
        path = self._strip_protocol(path)
        self.fs.delete_dir(path)

    @wrap_exceptions
    def modified(self, path):
        """Return the ``mtime`` pyarrow reports for ``path``."""
        path = self._strip_protocol(path)
        return self.fs.get_file_info(path).mtime

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Return the bytes of ``path`` between ``start`` and ``end``.

        A non-seekable stream is requested whenever reading begins at the
        start of the file; otherwise a seekable file is opened so the generic
        implementation can seek to ``start``.

        Previously ``start`` and ``end`` were dropped before delegating, so a
        ranged read returned the whole file.
        """
        if (start is not None and start < 0) or (end is not None and end < 0):
            # Offsets counted from the end of the file are resolved here from
            # the file's reported size, so only non-negative offsets are
            # handed to the generic implementation.
            size = self.info(path)["size"]
            if start is not None and start < 0:
                start = max(0, size + start)
            if end is not None and end < 0:
                end = max(0, size + end)

        kwargs["seekable"] = start not in [None, 0]
        # A start of 0 is forwarded as None so that no seek is attempted on
        # the non-seekable stream.
        return super().cat_file(path, start=start or None, end=end, **kwargs)

    def get_file(self, rpath, lpath, **kwargs):
        """Download ``rpath`` to ``lpath`` reading from a non-seekable stream."""
        kwargs["seekable"] = False
        super().get_file(rpath, lpath, **kwargs)
211
+
212
+
213
@mirror_from(
    "stream",
    [
        "read",
        "seek",
        "tell",
        "write",
        "readable",
        "writable",
        "close",
        "size",
        "seekable",
    ],
)
class ArrowFile(io.IOBase):
    """File object pairing a pyarrow stream with fsspec bookkeeping attributes.

    The names listed in ``mirror_from`` are presumably forwarded to
    ``self.stream`` (the helper is defined elsewhere - confirm there).
    """

    def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
        """Store the owning filesystem, the underlying stream and open options."""
        self.fs = fs
        self.stream = stream

        self.path = path
        self.mode = mode

        # The block size is exposed under both spellings.
        self.blocksize = block_size
        self.block_size = block_size
        self.kwargs = kwargs

    def __enter__(self):
        """Return the file itself for use in a ``with`` statement."""
        return self

    def __exit__(self, *args):
        """Close the file when leaving the ``with`` block."""
        return self.close()
243
+
244
+
245
class HadoopFileSystem(ArrowFSWrapper):
    """A wrapper on top of the pyarrow.fs.HadoopFileSystem
    to connect its interface with fsspec"""

    protocol = "hdfs"

    def __init__(
        self,
        host="default",
        port=0,
        user=None,
        kerb_ticket=None,
        replication=3,
        extra_conf=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        replication: int
            set replication factor of file for write operations. default value is 3.
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        # Imported lazily under an alias so it is not confused with this class.
        from pyarrow.fs import HadoopFileSystem as ArrowHadoopFileSystem

        super().__init__(
            fs=ArrowHadoopFileSystem(
                host=host,
                port=port,
                user=user,
                kerb_ticket=kerb_ticket,
                replication=replication,
                extra_conf=extra_conf,
            ),
            **kwargs,
        )

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Extract constructor keyword arguments from a URL.

        Returns a dict that may contain ``host``, ``user``, ``port`` and
        ``replication``; an entry is only present when the URL supplies a
        truthy value for it.
        """
        options = infer_storage_options(path)
        found = {}

        # (constructor keyword, key in the inferred storage options)
        for keyword, option_key in (
            ("host", "host"),
            ("user", "username"),
            ("port", "port"),
        ):
            value = options.get(option_key)
            if value:
                found[keyword] = value

        query = options.get("url_query")
        if query:
            replication = parse_qs(query).get("replication")
            if replication:
                # parse_qs yields a list per key; the first value is used.
                found["replication"] = int(replication[0])
        return found
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_mapper.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import abc
4
+ import hashlib
5
+
6
+ from fsspec.implementations.local import make_path_posix
7
+
8
+
9
class AbstractCacheMapper(abc.ABC):
    """Abstract super-class for mappers from remote URLs to local cached
    basenames.
    """

    @abc.abstractmethod
    def __call__(self, path: str) -> str:
        """Map ``path`` to the basename used for its local cached copy."""

    def __eq__(self, other: object) -> bool:
        # Equality is decided by class alone; subclasses that carry
        # attributes have to extend the comparison themselves.
        own_class = type(self)
        return isinstance(other, own_class)

    def __hash__(self) -> int:
        # Hashing follows the same rule as equality: class only.
        own_class = type(self)
        return hash(own_class)
26
+
27
+
28
class BasenameCacheMapper(AbstractCacheMapper):
    """Cache mapper that uses the basename of the remote URL and a fixed number
    of directory levels above this.

    The default is zero directory levels, meaning different paths with the same
    basename will have the same cached basename.
    """

    def __init__(self, directory_levels: int = 0):
        """Store the number of parent directory levels kept in the name.

        Raises
        ------
        ValueError
            If ``directory_levels`` is negative.
        """
        if directory_levels < 0:
            raise ValueError(
                "BasenameCacheMapper requires zero or positive directory_levels"
            )
        self.directory_levels = directory_levels

        # Separator for directories when encoded as strings.
        self._separator = "_@_"

    def __call__(self, path: str) -> str:
        """Return the cached basename for ``path``.

        The trailing ``directory_levels + 1`` path components are joined with
        the separator; a path without any ``/`` is returned as is.
        """
        pieces = make_path_posix(path).rsplit("/", self.directory_levels + 1)
        if len(pieces) == 1:
            # No separator found, simple filename
            return pieces[0]
        # The first piece is the discarded leading part of the path.
        return self._separator.join(pieces[1:])

    def __eq__(self, other: object) -> bool:
        if not super().__eq__(other):
            return False
        return self.directory_levels == other.directory_levels

    def __hash__(self) -> int:
        class_hash = super().__hash__()
        return class_hash ^ hash(self.directory_levels)
59
+
60
+
61
class HashCacheMapper(AbstractCacheMapper):
    """Cache mapper that uses a hash of the remote URL."""

    def __call__(self, path: str) -> str:
        """Return the hexadecimal SHA-256 digest of the encoded ``path``."""
        digest = hashlib.sha256(path.encode())
        return digest.hexdigest()
66
+
67
+
68
def create_cache_mapper(same_names: bool) -> AbstractCacheMapper:
    """Factory method to create cache mapper for backward compatibility with
    ``CachingFileSystem`` constructor using ``same_names`` kwarg.

    A truthy ``same_names`` yields a default ``BasenameCacheMapper``; anything
    else yields a ``HashCacheMapper``.
    """
    return BasenameCacheMapper() if same_names else HashCacheMapper()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_metadata.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import pickle
5
+ import time
6
+ from typing import TYPE_CHECKING
7
+
8
+ from fsspec.utils import atomic_write
9
+
10
+ try:
11
+ import ujson as json
12
+ except ImportError:
13
+ if not TYPE_CHECKING:
14
+ import json
15
+
16
+ if TYPE_CHECKING:
17
+ from typing import Any, Dict, Iterator, Literal
18
+
19
+ from typing_extensions import TypeAlias
20
+
21
+ from .cached import CachingFileSystem
22
+
23
+ Detail: TypeAlias = Dict[str, Any]
24
+
25
+
26
class CacheMetadata:
    """Cache metadata.

    All reading and writing of cache metadata is performed by this class,
    accessing the cached files and blocks is not.

    Metadata is stored in a single file per storage directory in JSON format.
    For backward compatibility, also reads metadata stored in pickle format
    which is converted to JSON when next saved.
    """

    def __init__(self, storage: list[str]):
        """

        Parameters
        ----------
        storage: list[str]
            Directories containing cached files, must be at least one. Metadata
            is stored in the last of these directories by convention.

        Raises
        ------
        ValueError
            If ``storage`` is empty.
        """
        if not storage:
            raise ValueError("CacheMetadata expects at least one storage location")

        self._storage = storage
        self.cached_files: list[Detail] = [{}]

        # Private attribute to force saving of metadata in pickle format rather than
        # JSON for use in tests to confirm can read both pickle and JSON formats.
        self._force_save_pickle = False

    def _load(self, fn: str) -> Detail:
        """Low-level function to load metadata from specific file"""
        try:
            with open(fn, "r") as f:
                loaded = json.load(f)
        except ValueError:
            # NOTE(review): fallback for legacy pickle metadata. Unpickling
            # executes arbitrary code, so the cache directory must be trusted.
            with open(fn, "rb") as f:
                loaded = pickle.load(f)
        # JSON has no set type: block collections are stored as lists on disk
        # and turned back into sets in memory.
        for c in loaded.values():
            if isinstance(c.get("blocks"), list):
                c["blocks"] = set(c["blocks"])
        return loaded

    def _save(self, metadata_to_save: Detail, fn: str) -> None:
        """Low-level function to save metadata to specific file"""
        if self._force_save_pickle:
            with atomic_write(fn) as f:
                pickle.dump(metadata_to_save, f)
        else:
            with atomic_write(fn, mode="w") as f:
                json.dump(metadata_to_save, f)

    def _scan_locations(
        self, writable_only: bool = False
    ) -> Iterator[tuple[str, str, bool]]:
        """Yield locations (filenames) where metadata is stored, and whether
        writable or not.

        Parameters
        ----------
        writable_only: bool
            Set to True to only yield writable locations.

        Returns
        -------
        Yields (str, str, bool)
            Metadata filename, storage directory, and whether it is writable.
        """
        n = len(self._storage)
        for i, storage in enumerate(self._storage):
            # Only the last storage location is writable.
            writable = i == n - 1
            if writable_only and not writable:
                continue
            yield os.path.join(storage, "cache"), storage, writable

    def check_file(
        self, path: str, cfs: CachingFileSystem | None
    ) -> Literal[False] | tuple[Detail, str]:
        """If path is in cache return its details, otherwise return ``False``.

        If the optional CachingFileSystem is specified then it is used to
        perform extra checks to reject possible matches, such as if they are
        too old.
        """
        for (fn, base, _), cache in zip(self._scan_locations(), self.cached_files):
            if path not in cache:
                continue
            detail = cache[path].copy()

            if cfs is not None:
                if cfs.check_files and detail["uid"] != cfs.fs.ukey(path):
                    # Wrong file as determined by hash of file properties
                    continue
                if cfs.expiry and time.time() - detail["time"] > cfs.expiry:
                    # Cached file has expired
                    continue

            fn = os.path.join(base, detail["fn"])
            if os.path.exists(fn):
                return detail, fn
        return False

    def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]:
        """Remove expired metadata from the cache.

        Returns names of files corresponding to expired metadata and a boolean
        flag indicating whether the writable cache is empty. Caller is
        responsible for deleting the expired files.

        Raises
        ------
        RuntimeError
            If an expired entry has no ``"fn"`` value.
        """
        expired_files = []
        # Iterate over a copy because entries are popped while looping.
        for path, detail in self.cached_files[-1].copy().items():
            if time.time() - detail["time"] > expiry_time:
                fn = detail.get("fn", "")
                if not fn:
                    raise RuntimeError(
                        f"Cache metadata does not contain 'fn' for {path}"
                    )
                fn = os.path.join(self._storage[-1], fn)
                expired_files.append(fn)
                self.cached_files[-1].pop(path)

        # Metadata is only rewritten when entries remain; an emptied cache is
        # reported to the caller through the returned flag instead.
        if self.cached_files[-1]:
            cache_path = os.path.join(self._storage[-1], "cache")
            self._save(self.cached_files[-1], cache_path)

        writable_cache_empty = not self.cached_files[-1]
        return expired_files, writable_cache_empty

    def load(self) -> None:
        """Load all metadata from disk and store in ``self.cached_files``"""
        cached_files = []
        for fn, _, _ in self._scan_locations():
            if os.path.exists(fn):
                # TODO: consolidate blocks here
                cached_files.append(self._load(fn))
            else:
                cached_files.append({})
        self.cached_files = cached_files or [{}]

    def on_close_cached_file(self, f: Any, path: str) -> None:
        """Perform side-effect actions on closing a cached file.

        The actual closing of the file is the responsibility of the caller.
        """
        # File must be writable, so in self.cached_files[-1]
        c = self.cached_files[-1][path]
        # Once the recorded blocks cover the file size, mark it fully cached.
        if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size:
            c["blocks"] = True

    def pop_file(self, path: str) -> str | None:
        """Remove metadata of cached file.

        If path is in the cache, return the filename of the cached file,
        otherwise return ``None``. Caller is responsible for deleting the
        cached file.

        Raises
        ------
        PermissionError
            If the cached file is not in the last, writable cache location.
        """
        details = self.check_file(path, None)
        if not details:
            return None
        detail, fn = details
        # Compare against the exact filename the writable location would
        # produce. A plain string-prefix test also matches sibling directories
        # such as "/x/cache2" when the writable location is "/x/cache".
        if fn == os.path.join(self._storage[-1], detail["fn"]):
            self.cached_files[-1].pop(path)
            self.save()
        else:
            raise PermissionError(
                "Can only delete cached file in last, writable cache location"
            )
        return fn

    def save(self) -> None:
        """Save metadata to disk"""
        for (fn, _, writable), cache in zip(self._scan_locations(), self.cached_files):
            if not writable:
                continue

            if os.path.exists(fn):
                # Merge the in-memory view into what is already on disk.
                cached_files = self._load(fn)
                for k, c in cached_files.items():
                    if k in cache:
                        if c["blocks"] is True or cache[k]["blocks"] is True:
                            c["blocks"] = True
                        else:
                            # self.cached_files[*][*]["blocks"] must continue to
                            # point to the same set object so that updates
                            # performed by MMapCache are propagated back to
                            # self.cached_files.
                            blocks = cache[k]["blocks"]
                            blocks.update(c["blocks"])
                            c["blocks"] = blocks
                        c["time"] = max(c["time"], cache[k]["time"])
                        c["uid"] = cache[k]["uid"]

                # Files can be added to cache after it was written once
                for k, c in cache.items():
                    if k not in cached_files:
                        cached_files[k] = c
            else:
                cached_files = cache
            # Serialise a shallow copy so the in-memory sets are left intact
            # while the written form uses lists.
            cache = {k: v.copy() for k, v in cached_files.items()}
            for c in cache.values():
                if isinstance(c["blocks"], set):
                    c["blocks"] = list(c["blocks"])
            self._save(cache, fn)
            self.cached_files[-1] = cached_files

    def update_file(self, path: str, detail: Detail) -> None:
        """Update metadata for specific file in memory, do not save"""
        self.cached_files[-1][path] = detail
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cached.py ADDED
@@ -0,0 +1,939 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import inspect
4
+ import logging
5
+ import os
6
+ import tempfile
7
+ import time
8
+ import weakref
9
+ from shutil import rmtree
10
+ from typing import TYPE_CHECKING, Any, Callable, ClassVar
11
+
12
+ from fsspec import AbstractFileSystem, filesystem
13
+ from fsspec.callbacks import DEFAULT_CALLBACK
14
+ from fsspec.compression import compr
15
+ from fsspec.core import BaseCache, MMapCache
16
+ from fsspec.exceptions import BlocksizeMismatchError
17
+ from fsspec.implementations.cache_mapper import create_cache_mapper
18
+ from fsspec.implementations.cache_metadata import CacheMetadata
19
+ from fsspec.spec import AbstractBufferedFile
20
+ from fsspec.transaction import Transaction
21
+ from fsspec.utils import infer_compression
22
+
23
+ if TYPE_CHECKING:
24
+ from fsspec.implementations.cache_mapper import AbstractCacheMapper
25
+
26
+ logger = logging.getLogger("fsspec.cached")
27
+
28
+
29
class WriteCachedTransaction(Transaction):
    """Transaction that uploads locally written files when it completes."""

    def complete(self, commit=True):
        """Finish the transaction.

        When ``commit`` is true, each pending file's local ``fn`` is uploaded
        to its ``path`` with a single ``fs.put`` call. In either case the
        pending files are cleared and the transaction is detached from the
        filesystem.
        """
        local_paths = []
        remote_paths = []
        for pending in self.files:
            remote_paths.append(pending.path)
            local_paths.append(pending.fn)

        if commit:
            self.fs.put(local_paths, remote_paths)

        self.files.clear()
        target_fs = self.fs
        target_fs._intrans = False
        target_fs._transaction = None
        self.fs = None  # break cycle
39
+
40
+
41
class CachingFileSystem(AbstractFileSystem):
    """Locally caching filesystem, layered over any other filesystem.

    Remote files are stored chunk-wise in a local directory so repeated
    access is fast after the first download. Cached file names are produced
    by a cache mapper (hashes of the URL by default). When no directory is
    supplied a temporary one is created. Cache files are sparse (see
    ``MMapCache``), so only the blocks actually read take up space.

    Restrictions:

    - a given file must always be accessed with the same block size, unless
      every block of it has already been read
    - the target filesystem must produce files derived from
      ``fsspec.spec.AbstractBufferedFile``; ``LocalFileSystem`` is also
      allowed, for testing
    """

    protocol: ClassVar[str | tuple[str, ...]] = ("blockcache", "cached")

    def __init__(
        self,
        target_protocol=None,
        cache_storage="TMP",
        cache_check=10,
        check_files=False,
        expiry_time=604800,
        target_options=None,
        fs=None,
        same_names: bool | None = None,
        compression=None,
        cache_mapper: AbstractCacheMapper | None = None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        target_protocol: str (optional)
            Protocol of the target filesystem. Give either this or ``fs``.
        cache_storage: str or list(str)
            Where cached files live. ``"TMP"`` creates a temporary directory
            which is removed when this instance is finalized. If a list is
            given, the locations are tried in order and only the last one is
            treated as writable.
        cache_check: int
            Seconds between reloads of the cache metadata.
        check_files: bool
            Whether to compare the remote file's UID with the stored one
            before using a cached copy. Some filesystems (e.g. HTTP) cannot
            give a reliable unique hash, so leave this ``False`` for them.
        expiry_time: int
            Seconds after which a local copy is considered useless. Falsy
            disables expiry. The default equals one week.
        target_options: dict or None
            Passed to the target filesystem constructor when ``fs`` is None.
        fs: filesystem instance
            Target filesystem to wrap. Give either this or
            ``target_protocol``.
        same_names: bool (optional)
            ``True`` selects a ``BasenameCacheMapper`` instead of the default
            ``HashCacheMapper``. Mutually exclusive with ``cache_mapper``.
        compression: str (optional)
            Decompress on download: ``"infer"`` (guess from the URL name), a
            key of ``fsspec.compression.compr``, or None for none.
        cache_mapper: AbstractCacheMapper (optional)
            Object mapping original file names to cached file names.
            Mutually exclusive with ``same_names``.
        """
        super().__init__(**kwargs)
        no_fs = fs is None
        no_protocol = target_protocol is None
        if no_fs and no_protocol:
            raise ValueError(
                "Please provide filesystem instance(fs) or target_protocol"
            )
        if no_fs == no_protocol:
            # both missing was handled above, so here both were supplied
            raise ValueError(
                "Both filesystems (fs) and target_protocol may not be both given."
            )

        if cache_storage == "TMP":
            tempdir = tempfile.mkdtemp()
            storage = [tempdir]
            # remove the temporary directory once this instance is collected
            weakref.finalize(self, self._remove_tempdir, tempdir)
        elif isinstance(cache_storage, str):
            storage = [cache_storage]
        else:
            storage = cache_storage
        os.makedirs(storage[-1], exist_ok=True)

        self.storage = storage
        self.kwargs = target_options or {}
        self.cache_check = cache_check
        self.check_files = check_files
        self.expiry = expiry_time
        self.compression = compression

        # Size of cache in bytes. None means "unknown": it is recomputed by
        # the next cache_size() call and reset to None on cache writes.
        self._cache_size = None

        if same_names is not None and cache_mapper is not None:
            raise ValueError(
                "Cannot specify both same_names and cache_mapper in "
                "CachingFileSystem.__init__"
            )
        if cache_mapper is None:
            self._mapper = create_cache_mapper(
                False if same_names is None else same_names
            )
        else:
            self._mapper = cache_mapper

        if isinstance(target_protocol, str):
            self.target_protocol = target_protocol
        elif isinstance(fs.protocol, str):
            self.target_protocol = fs.protocol
        else:
            self.target_protocol = fs.protocol[0]

        self._metadata = CacheMetadata(self.storage)
        self.load_cache()
        # NOTE: ``fs`` is deliberately assigned late; __getattribute__ only
        # starts delegating to the target once this attribute exists.
        self.fs = fs if fs is not None else filesystem(target_protocol, **self.kwargs)

        def _strip_protocol(path):
            # acts as a method, since each instance has a different target
            return self.fs._strip_protocol(type(self)._strip_protocol(path))

        self._strip_protocol: Callable = _strip_protocol

    @staticmethod
    def _remove_tempdir(tempdir):
        """Best-effort removal of the temporary cache directory."""
        try:
            rmtree(tempdir)
        except Exception:
            pass

    def _mkcache(self):
        """Make sure the writable (last) cache directory exists."""
        os.makedirs(self.storage[-1], exist_ok=True)

    def cache_size(self):
        """Return size of cache in bytes.

        With several cache directories only the last (writable) one is
        measured. The value is memoized until the cache is next written.
        """
        if self._cache_size is None:
            writable_dir = self.storage[-1]
            self._cache_size = filesystem("file").du(writable_dir, withdirs=True)
        return self._cache_size

    def load_cache(self):
        """Load the cache metadata and record the load time."""
        self._metadata.load()
        self._mkcache()
        self.last_cache = time.time()

    def save_cache(self):
        """Persist the cache metadata and invalidate the memoized size."""
        self._mkcache()
        self._metadata.save()
        self.last_cache = time.time()
        self._cache_size = None

    def _check_cache(self):
        """Reload the metadata if it is stale or a cache directory vanished."""
        self._mkcache()
        if not self.cache_check:
            # explicitly told not to bother checking
            return
        is_stale = time.time() - self.last_cache > self.cache_check
        all_present = all(os.path.exists(location) for location in self.storage)
        if is_stale or not all_present:
            self.load_cache()

    def _check_file(self, path):
        """Return the metadata's answer on whether ``path`` is validly cached."""
        path = self._strip_protocol(path)
        self._check_cache()
        return self._metadata.check_file(path, self)

    def clear_cache(self):
        """Remove all files and metadata from the cache.

        With multiple cache locations only the last one is cleared; it is
        assumed to be the read/write one.
        """
        rmtree(self.storage[-1])
        self.load_cache()
        self._cache_size = None

    def clear_expired_cache(self, expiry_time=None):
        """Remove all expired files and metadata from the cache.

        With multiple cache locations only the last one is cleared; it is
        assumed to be the read/write one.

        Parameters
        ----------
        expiry_time: int
            Seconds after which a local copy is considered useless. A falsy
            value falls back to the ``expiry`` set at instantiation.
        """
        if not expiry_time:
            expiry_time = self.expiry

        self._check_cache()

        stale_files, nothing_left = self._metadata.clear_expired(expiry_time)
        for stale in stale_files:
            if os.path.exists(stale):
                os.remove(stale)

        if nothing_left:
            rmtree(self.storage[-1])
            self.load_cache()

        self._cache_size = None

    def pop_from_cache(self, path):
        """Remove the cached version of the given (remote) path.

        The original docstring states that a hit in a non-last (read-only)
        cache location raises PermissionError; that happens, if at all,
        inside ``self._metadata.pop_file``.
        """
        path = self._strip_protocol(path)
        local_name = self._metadata.pop_file(path)
        if local_name is not None:
            os.remove(local_name)
        self._cache_size = None

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Wrap the target filesystem's ``_open``.

        A fully cached file is opened locally and returned directly.
        Otherwise the file is opened on the target filesystem and given an
        ``MMapCache`` backed by our local cache file. The ``blocks`` set is
        shared with the metadata entry, so it updates as blocks are fetched.
        The file's ``close`` is monkey-patched to call ``close_and_update``,
        which saves the block state.
        """
        path = self._strip_protocol(path)

        path = self.fs._strip_protocol(path)
        if "r" not in mode:
            # writes are not cached: go straight to the target
            return self.fs._open(
                path,
                mode=mode,
                block_size=block_size,
                autocommit=autocommit,
                cache_options=cache_options,
                **kwargs,
            )

        cached = self._check_file(path)
        if cached:
            # file is in cache
            detail, fn = cached
            _, blocks = detail["fn"], detail["blocks"]
            if blocks is True:
                # stored file is complete
                logger.debug("Opening local copy of %s", path)
                return open(fn, mode)
            # TODO: action where partial file exists in read-only cache
            logger.debug("Opening partially cached copy of %s", path)
        else:
            mapped_name = self._mapper(path)
            fn = os.path.join(self.storage[-1], mapped_name)
            blocks = set()
            detail = {
                "original": path,
                "fn": mapped_name,
                "blocks": blocks,
                "time": time.time(),
                "uid": self.fs.ukey(path),
            }
            self._metadata.update_file(path, detail)
            logger.debug("Creating local sparse file for %s", path)

        # call target filesystems open
        self._mkcache()
        f = self.fs._open(
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_options=cache_options,
            cache_type="none",
            **kwargs,
        )
        if self.compression:
            if self.compression == "infer":
                comp = infer_compression(path)
            else:
                comp = self.compression
            f = compr[comp](f, mode="rb")

        if "blocksize" not in detail:
            detail["blocksize"] = f.blocksize
        elif detail["blocksize"] != f.blocksize:
            raise BlocksizeMismatchError(
                f"Cached file must be reopened with same block"
                f" size as original (old: {detail['blocksize']},"
                f" new {f.blocksize})"
            )

        f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks)
        close = f.close
        f.close = lambda: self.close_and_update(f, close)
        self.save_cache()
        return f

    def _parent(self, path):
        """Delegate parent-path computation to the target filesystem."""
        return self.fs._parent(path)

    def hash_name(self, path: str, *args: Any) -> str:
        """Return the cache file name for ``path``.

        Kept for backward compatibility with downstream libraries; extra
        positional arguments (previously a ``same_name`` boolean) are ignored.
        """
        return self._mapper(path)

    def close_and_update(self, f, close):
        """Called when a file is closing, so store the set of blocks."""
        if f.closed:
            return
        path = self._strip_protocol(f.path)
        self._metadata.on_close_cached_file(f, path)
        try:
            logger.debug("going to save")
            self.save_cache()
            logger.debug("saved")
        except OSError:
            logger.debug("Cache saving failed while closing file")
        except NameError:
            logger.debug("Cache save failed due to interpreter shutdown")
        close()
        f.closed = True

    def ls(self, path, detail=True):
        """List ``path`` on the target filesystem."""
        return self.fs.ls(path, detail)

    def __getattribute__(self, item):
        """Route attribute access between this class and the wrapped target.

        Names listed below are always served from this class. Instance
        attributes come from our own ``__dict__``. Everything else is looked
        up on the target filesystem once ``fs`` has been set.
        """
        if item in {
            # cache management
            "load_cache",
            "save_cache",
            "close_and_update",
            "_check_file",
            "_check_cache",
            "_mkcache",
            "clear_cache",
            "clear_expired_cache",
            "pop_from_cache",
            "cache_size",
            "hash_name",
            "_make_local_details",
            "local_file",
            # object protocol
            "__init__",
            "__getattribute__",
            "__reduce__",
            "__hash__",
            "__eq__",
            "to_json",
            # file access
            "_open",
            "open",
            "open_many",
            "commit_many",
            "cat",
            "cat_file",
            "cat_ranges",
            "get",
            "read_block",
            "tail",
            "head",
            "pipe_file",
            "pipe",
            "get_mapper",
            "_paths_from_path",
            # listing / metadata
            "info",
            "ls",
            "exists",
            "isfile",
            "isdir",
            # transactions
            "start_transaction",
            "end_transaction",
        }:
            # all the methods defined in this class. Note `open` here, since
            # it calls `_open`, but is actually in superclass
            return lambda *args, **kw: getattr(type(self), item).__get__(self)(
                *args, **kw
            )
        if item in ["__reduce_ex__"]:
            raise AttributeError
        if item in ["transaction"]:
            # property
            return type(self).transaction.__get__(self)
        if item in ["_cache", "transaction_type"]:
            # class attributes
            return getattr(type(self), item)
        if item == "__class__":
            return type(self)

        own = object.__getattribute__(self, "__dict__")
        target = own.get("fs", None)  # fs is not immediately defined
        if item in own:
            return own[item]
        if target is None:
            # attributes of the superclass, while target is being set up
            return super().__getattribute__(item)
        if item in target.__dict__:
            # attribute of instance
            return target.__dict__[item]
        # attribute belonging to the target filesystem
        target_cls = type(target)
        member = getattr(target_cls, item)
        bindable = inspect.isfunction(member) or inspect.isdatadescriptor(member)
        unbound = not hasattr(member, "__self__") or member.__self__ is None
        if bindable and unbound:
            # instance method
            return member.__get__(target, target_cls)
        return member  # class method or attribute

    def __eq__(self, other):
        """Test for equality."""
        if self is other:
            return True
        if not isinstance(other, type(self)):
            return False
        return (
            self.storage == other.storage
            and self.kwargs == other.kwargs
            and self.cache_check == other.cache_check
            and self.check_files == other.check_files
            and self.expiry == other.expiry
            and self.compression == other.compression
            and self._mapper == other._mapper
            and self.target_protocol == other.target_protocol
        )

    def __hash__(self):
        """Calculate hash."""
        return (
            hash(tuple(self.storage))
            ^ hash(str(self.kwargs))
            ^ hash(self.cache_check)
            ^ hash(self.check_files)
            ^ hash(self.expiry)
            ^ hash(self.compression)
            ^ hash(self._mapper)
            ^ hash(self.target_protocol)
        )

    def to_json(self):
        """Calculate JSON representation.

        Not implemented yet for CachingFileSystem.
        """
        raise NotImplementedError(
            "CachingFileSystem JSON representation not implemented"
        )
521
+
522
+
523
class WholeFileCacheFileSystem(CachingFileSystem):
    """Caches whole remote files on first access.

    Intended as a layer over any other filesystem: each accessed file is
    copied locally so later reads are local. Similar to
    ``CachingFileSystem`` but without block-wise functionality, so it also
    works where sparse files are not allowed. See ``CachingFileSystem`` for
    the init arguments.

    The remote store is still needed for listing files, and cached files may
    be refreshed.
    """

    protocol = "filecache"
    local_file = True

    def open_many(self, open_files, **kwargs):
        """Open every file in ``open_files``, downloading uncached ones first.

        Non-read modes return ``LocalTempFile`` objects. Read modes return
        plain local file objects; combining read mode with ``compression``
        raises ``NotImplementedError``.
        """
        mode = open_files.mode
        paths = [of.path for of in open_files]
        if "r" not in mode:
            return [
                LocalTempFile(
                    self.fs,
                    path,
                    mode=open_files.mode,
                    fn=os.path.join(self.storage[-1], self._mapper(path)),
                    **kwargs,
                )
                for path in paths
            ]
        self._mkcache()

        if self.compression:
            raise NotImplementedError
        details = [self._check_file(sp) for sp in paths]
        # local target name of every path; kept for opening later
        local_names = [os.path.join(self.storage[-1], self._mapper(p)) for p in paths]
        to_fetch = [p for p, found in zip(paths, details) if not found]
        fetch_targets = [
            target for target, found in zip(local_names, details) if not found
        ]
        if to_fetch:
            # skip if all files are already cached and up to date
            self.fs.get(to_fetch, fetch_targets)

            # update metadata - only happens when downloads are successful
            fresh_details = [
                {
                    "original": path,
                    "fn": self._mapper(path),
                    "blocks": True,
                    "time": time.time(),
                    "uid": self.fs.ukey(path),
                }
                for path in to_fetch
            ]
            for path, detail in zip(to_fetch, fresh_details):
                self._metadata.update_file(path, detail)
            self.save_cache()

        def firstpart(fn):
            # helper to adapt both whole-file and simple-cache
            if isinstance(fn, tuple):
                return fn[1]
            return fn

        opened = []
        for found, fallback in zip(details, local_names):
            name = firstpart(found) if found else fallback
            opened.append(open(name, mode=open_files.mode))
        return opened

    def commit_many(self, open_files):
        """Upload all files in one ``put``, then close and delete local copies."""
        local_sources = [f.fn for f in open_files]
        remote_targets = [f.path for f in open_files]
        self.fs.put(local_sources, remote_targets)
        for f in open_files:
            f.close()
        for f in open_files:
            # in case autocommit is off, and so close did not already delete
            try:
                os.remove(f.name)
            except FileNotFoundError:
                pass
        self._cache_size = None

    def _make_local_details(self, path):
        """Register ``path`` as fully cached and return its local file name."""
        mapped_name = self._mapper(path)
        fn = os.path.join(self.storage[-1], mapped_name)
        detail = {
            "original": path,
            "fn": mapped_name,
            "blocks": True,
            "time": time.time(),
            "uid": self.fs.ukey(path),
        }
        self._metadata.update_file(path, detail)
        logger.debug("Copying %s to local cache", path)
        return fn

    def cat(
        self,
        path,
        recursive=False,
        on_error="raise",
        callback=DEFAULT_CALLBACK,
        **kwargs,
    ):
        """Return file contents, fetching uncached files into the cache first.

        Returns bytes when ``path`` is a single string matching one file and
        ``recursive`` is False; otherwise a dict of path -> bytes. With
        ``on_error="return"`` the dict holds the exception for failed paths.
        """
        paths = self.expand_path(
            path, recursive=recursive, maxdepth=kwargs.get("maxdepth", None)
        )
        getpaths = []
        storepaths = []
        fns = []
        out = {}
        # iterate over a copy because failed paths are removed from ``paths``
        for p in paths.copy():
            try:
                found = self._check_file(p)
                if found:
                    # whole-file cache gives (detail, fn); simple cache gives fn
                    _, fn = found if isinstance(found, tuple) else (None, found)
                else:
                    fn = self._make_local_details(p)
                    getpaths.append(p)
                    storepaths.append(fn)
                fns.append(fn)
            except Exception as e:
                if on_error == "raise":
                    raise
                if on_error == "return":
                    out[p] = e
                paths.remove(p)

        if getpaths:
            self.fs.get(getpaths, storepaths)
            self.save_cache()

        callback.set_size(len(paths))
        for p, fn in zip(paths, fns):
            with open(fn, "rb") as f:
                out[p] = f.read()
            callback.relative_update(1)
        if isinstance(path, str) and len(paths) == 1 and recursive is False:
            out = out[paths[0]]
        return out

    def _open(self, path, mode="rb", **kwargs):
        """Open ``path`` from the local cache, downloading it whole if needed."""
        path = self._strip_protocol(path)
        if "r" not in mode:
            fn = self._make_local_details(path)
            # drop the kwargs that open() itself added; keep only the user's
            added_by_open = ["autocommit", "block_size", "cache_options"]
            user_specified_kwargs = {
                k: v for k, v in kwargs.items() if k not in added_by_open
            }
            return LocalTempFile(self, path, mode=mode, fn=fn, **user_specified_kwargs)

        found = self._check_file(path)
        if found:
            detail, fn = found
            _, blocks = detail["fn"], detail["blocks"]
            if blocks is not True:
                raise ValueError(
                    f"Attempt to open partially cached file {path}"
                    f" as a wholly cached file"
                )
            logger.debug("Opening local copy of %s", path)

            # Downstream filesystems such as ``TarFileSystem`` infer the
            # compression from the original file name, so expose it on the
            # returned file object as the extra attribute ``original``.
            f = open(fn, mode)
            f.original = detail.get("original")
            return f

        fn = self._make_local_details(path)
        kwargs["mode"] = mode

        # call target filesystems open
        self._mkcache()
        if self.compression:
            with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
                if isinstance(f, AbstractBufferedFile):
                    # want no type of caching if just downloading whole thing
                    f.cache = BaseCache(0, f.cache.fetcher, f.size)
                if self.compression == "infer":
                    comp = infer_compression(path)
                else:
                    comp = self.compression
                f = compr[comp](f, mode="rb")
                data = True
                while data:
                    block = getattr(f, "blocksize", 5 * 2**20)
                    data = f.read(block)
                    f2.write(data)
        else:
            self.fs.get_file(path, fn)
        self.save_cache()
        # the file is cached now; re-enter to take the cached branch
        return self._open(path, mode)
720
+
721
+
722
class SimpleCacheFileSystem(WholeFileCacheFileSystem):
    """Caches whole remote files on first access.

    Intended as a layer over any other filesystem: each accessed file is
    copied locally so later reads are local. Only whole files are copied and
    no metadata about download time or file details is kept, which makes it
    safer in multi-threaded/concurrent situations.

    This is the only caching filesystem that supports writing: you get a
    real local open file and, on close and commit, it is uploaded to the
    target filesystem. Writability of the target URL is not checked until
    then.
    """

    protocol = "simplecache"
    local_file = True
    transaction_type = WriteCachedTransaction

    def __init__(self, **kwargs):
        """Create the filesystem with cache checking, expiry and UID checks off."""
        kw = kwargs.copy()
        kw.update(cache_check=False, expiry_time=False, check_files=False)
        super().__init__(**kw)
        for location in self.storage:
            if not os.path.exists(location):
                os.makedirs(location, exist_ok=True)

    def _check_file(self, path):
        """Return the local cache file for ``path`` if one exists, else None."""
        self._check_cache()
        mapped_name = self._mapper(path)
        for location in self.storage:
            candidate = os.path.join(location, mapped_name)
            if os.path.exists(candidate):
                return candidate

    def save_cache(self):
        """No-op: this filesystem keeps no metadata."""
        pass

    def load_cache(self):
        """No-op: this filesystem keeps no metadata."""
        pass

    def pipe_file(self, path, value=None, **kwargs):
        """Write ``value`` to ``path``; inside a transaction, write via ``open``."""
        if not self._intrans:
            super().pipe_file(path, value)
            return
        with self.open(path, "wb") as f:
            f.write(value)

    def ls(self, path, detail=True, **kwargs):
        """List ``path`` on the target, adding files pending in a transaction."""
        path = self._strip_protocol(path)
        details = []
        ex = None
        try:
            # copy, so the target's own listing is not edited
            details = self.fs.ls(path, detail=True, **kwargs).copy()
        except FileNotFoundError as e:
            ex = e
        if self._intrans:
            prefix = path.rstrip("/") + "/"
            depth = prefix.count("/")
            for pending in self.transaction.files:
                if pending.path == path:
                    details.append(
                        {
                            "name": path,
                            "size": pending.size or pending.tell(),
                            "type": "file",
                        }
                    )
                elif pending.path.startswith(prefix):
                    if pending.path.count("/") == depth:
                        # direct child of the listed directory
                        details.append(
                            {
                                "name": pending.path,
                                "size": pending.size or pending.tell(),
                                "type": "file",
                            }
                        )
                    else:
                        # deeper file: report its first-level directory
                        dname = "/".join(pending.path.split("/")[: depth + 1])
                        details.append({"name": dname, "size": 0, "type": "directory"})
        if ex is not None and not details:
            raise ex
        if detail:
            return details
        return sorted(entry["name"] for entry in details)

    def info(self, path, **kwargs):
        """Describe ``path``, preferring files pending in the transaction."""
        path = self._strip_protocol(path)
        if self._intrans:
            matches = [
                pending for pending in self.transaction.files if pending.path == path
            ]
            if matches:
                first = matches[0]
                return {"name": path, "size": first.size or first.tell(), "type": "file"}
            has_children = any(
                pending.path.startswith(path + "/")
                for pending in self.transaction.files
            )
            if has_children:
                return {"name": path, "size": 0, "type": "directory"}
        return self.fs.info(path, **kwargs)

    def pipe(self, path, value=None, **kwargs):
        """Write one value to a path, or each value of a ``{path: value}`` dict."""
        if isinstance(path, str):
            self.pipe_file(self._strip_protocol(path), value, **kwargs)
        elif isinstance(path, dict):
            for target, payload in path.items():
                self.pipe_file(self._strip_protocol(target), payload, **kwargs)
        else:
            raise ValueError("path must be str or dict")

    def cat_ranges(
        self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
    ):
        """Read byte ranges, delegating the reads to the superclass."""
        # NOTE(review): this class's _check_file returns a path or None,
        # never False, so both filtered lists look always empty and the
        # get() call below looks like a no-op -- confirm before relying on
        # this as a bulk prefetch.
        checked = [self._check_file(p) for p in paths]
        rpaths = [p for l, p in zip(checked, paths) if l is False]
        lpaths = [l for l, p in zip(checked, paths) if l is False]
        self.fs.get(rpaths, lpaths)
        return super().cat_ranges(
            paths, starts, ends, max_gap=max_gap, on_error=on_error, **kwargs
        )

    def _open(self, path, mode="rb", **kwargs):
        """Open ``path`` from the local cache, downloading it whole if needed."""
        path = self._strip_protocol(path)
        mapped_name = self._mapper(path)

        if "r" not in mode:
            fn = os.path.join(self.storage[-1], mapped_name)
            # drop the kwargs that open() itself added
            added_by_open = ["autocommit", "block_size", "cache_options"]
            user_specified_kwargs = {
                k: v for k, v in kwargs.items() if k not in added_by_open
            }
            return LocalTempFile(
                self,
                path,
                mode=mode,
                autocommit=not self._intrans,
                fn=fn,
                **user_specified_kwargs,
            )

        fn = self._check_file(path)
        if fn:
            return open(fn, mode)

        fn = os.path.join(self.storage[-1], mapped_name)
        logger.debug("Copying %s to local cache", path)
        kwargs["mode"] = mode

        self._mkcache()
        self._cache_size = None
        if not self.compression:
            self.fs.get_file(path, fn)
        else:
            with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
                if isinstance(f, AbstractBufferedFile):
                    # want no type of caching if just downloading whole thing
                    f.cache = BaseCache(0, f.cache.fetcher, f.size)
                if self.compression == "infer":
                    comp = infer_compression(path)
                else:
                    comp = self.compression
                f = compr[comp](f, mode="rb")
                data = True
                while data:
                    block = getattr(f, "blocksize", 5 * 2**20)
                    data = f.read(block)
                    f2.write(data)
        # the file is cached now; re-enter to take the cached branch
        return self._open(path, mode)
883
+
884
+
885
class LocalTempFile:
    """A temporary local file, which will be uploaded on commit.

    Attributes not defined here are delegated to the underlying local file
    handle ``fh`` (for example ``write``, ``tell``, ``seek``).
    """

    def __init__(self, fs, path, fn, mode="wb", autocommit=True, seek=0, **kwargs):
        """
        Parameters
        ----------
        fs: filesystem
            Object whose ``put(local, remote, **kwargs)`` performs the upload.
        path: str
            Remote destination path.
        fn: str
            Local file name backing this object.
        mode: str
            Mode used to open the local file.
        autocommit: bool
            Whether ``close()`` uploads the file immediately.
        seek: int
            If truthy, initial position to seek to in the local file.
        **kwargs:
            Stored and passed on to ``fs.put`` at commit time.
        """
        self.fn = fn
        self.fh = open(fn, mode)
        self.mode = mode
        if seek:
            self.fh.seek(seek)
        self.path = path
        self.size = None
        self.fs = fs
        self.closed = False
        self.autocommit = autocommit
        self.kwargs = kwargs

    def __reduce__(self):
        # always open in r+b to allow continuing writing at a location
        return (
            LocalTempFile,
            (self.fs, self.path, self.fn, "r+b", self.autocommit, self.tell()),
        )

    def __enter__(self):
        return self.fh

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Close the local file and, if ``autocommit`` is set, upload it.

        Calling ``close`` again is a no-op. The closed check comes before
        ``tell()`` because ``tell()`` on a closed handle raises ValueError.
        """
        if self.closed:
            return
        self.size = self.fh.tell()
        self.fh.close()
        self.closed = True
        if self.autocommit:
            self.commit()

    def discard(self):
        """Close and delete the local file without uploading it."""
        self.fh.close()
        # mark as closed so a later close() neither fails nor commits
        self.closed = True
        os.remove(self.fn)

    def commit(self):
        """Upload the local file to ``path`` on the target filesystem."""
        self.fs.put(self.fn, self.path, **self.kwargs)
        # we do not delete local copy - it's still in the cache

    @property
    def name(self):
        return self.fn

    def __repr__(self) -> str:
        return f"LocalTempFile: {self.path}"

    def __getattr__(self, item):
        # Only reached when normal lookup fails. If ``fh`` itself is missing
        # (e.g. open() failed in __init__), delegating would recurse forever.
        if item == "fh":
            raise AttributeError(item)
        return getattr(self.fh, item)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dask.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dask
2
+ from distributed.client import Client, _get_global_client
3
+ from distributed.worker import Worker
4
+
5
+ from fsspec import filesystem
6
+ from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
7
+ from fsspec.utils import infer_storage_options
8
+
9
+
10
def _get_client(client):
    """Resolve ``client`` to a ``Client`` instance.

    ``None`` yields the global client, an existing ``Client`` is returned
    unchanged, and anything else (e.g. a connection string) is passed to the
    ``Client`` constructor.
    """
    if isinstance(client, Client):
        return client
    if client is None:
        return _get_global_client()
    # e.g., connection string
    return Client(client)
18
+
19
+
20
def _in_worker():
    """Return True when ``Worker._instances`` is non-empty."""
    return True if Worker._instances else False
22
+
23
+
24
class DaskWorkerFileSystem(AbstractFileSystem):
    """View files accessible to a worker as any other remote file-system.

    On a worker, operations go to the real filesystem. On the client, they
    are sent to a worker, which provides the information or data.

    **Warning** this implementation is experimental, and read-only for now.
    """

    def __init__(
        self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
    ):
        """Give exactly one of ``fs`` and ``target_protocol``.

        ``client`` may be None, a ``Client`` or a connection string; it is
        resolved only when not running inside a worker.
        """
        super().__init__(**kwargs)
        exactly_one_given = (fs is None) ^ (target_protocol is None)
        if not exactly_one_given:
            raise ValueError(
                "Please provide one of filesystem instance (fs) or"
                " target_protocol, not both"
            )
        self.target_protocol = target_protocol
        self.target_options = target_options
        self.worker = None
        self.client = client
        self.fs = fs
        self._determine_worker()

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Derive a ``client`` address from the URL's host and port, if present."""
        so = infer_storage_options(path)
        if "host" not in so or "port" not in so:
            return {}
        return {"client": f"{so['host']}:{so['port']}"}

    def _determine_worker(self):
        """Set ``self.worker`` and prepare the matching local or remote backend."""
        if not _in_worker():
            self.worker = False
            self.client = _get_client(self.client)
            # delayed proxy of this filesystem, computed remotely on demand
            self.rfs = dask.delayed(self)
            return
        self.worker = True
        if self.fs is None:
            self.fs = filesystem(
                self.target_protocol, **(self.target_options or {})
            )

    def mkdir(self, *args, **kwargs):
        """Create a directory, locally on a worker or remotely otherwise."""
        if not self.worker:
            self.rfs.mkdir(*args, **kwargs).compute()
        else:
            self.fs.mkdir(*args, **kwargs)

    def rm(self, *args, **kwargs):
        """Remove paths, locally on a worker or remotely otherwise."""
        if not self.worker:
            self.rfs.rm(*args, **kwargs).compute()
        else:
            self.fs.rm(*args, **kwargs)

    def copy(self, *args, **kwargs):
        """Copy paths, locally on a worker or remotely otherwise."""
        if not self.worker:
            self.rfs.copy(*args, **kwargs).compute()
        else:
            self.fs.copy(*args, **kwargs)

    def mv(self, *args, **kwargs):
        """Move paths, locally on a worker or remotely otherwise."""
        if not self.worker:
            self.rfs.mv(*args, **kwargs).compute()
        else:
            self.fs.mv(*args, **kwargs)

    def ls(self, *args, **kwargs):
        """List paths, locally on a worker or remotely otherwise."""
        if not self.worker:
            return self.rfs.ls(*args, **kwargs).compute()
        return self.fs.ls(*args, **kwargs)

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Open on the target filesystem (worker) or as a ``DaskFile`` (client)."""
        if not self.worker:
            return DaskFile(
                fs=self,
                path=path,
                mode=mode,
                block_size=block_size,
                autocommit=autocommit,
                cache_options=cache_options,
                **kwargs,
            )
        return self.fs._open(
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_options=cache_options,
            **kwargs,
        )

    def fetch_range(self, path, mode, start, end):
        """Return the bytes of ``path`` between offsets ``start`` and ``end``."""
        if not self.worker:
            return self.rfs.fetch_range(path, mode, start, end).compute()
        with self._open(path, mode) as f:
            f.seek(start)
            return f.read(end - start)
135
+
136
+
137
class DaskFile(AbstractBufferedFile):
    """Read-only buffered file whose byte ranges are fetched through the fs."""

    def __init__(self, mode="rb", **kwargs):
        """Accept only ``"rb"`` mode; other arguments go to the base class.

        Note that ``mode`` itself is not forwarded to the base constructor.
        """
        if mode != "rb":
            raise ValueError('Remote dask files can only be opened in "rb" mode')
        super().__init__(**kwargs)

    def _upload_chunk(self, final=False):
        """No-op: writing is not supported."""
        pass

    def _initiate_upload(self):
        """Create remote file/upload (no-op here)."""
        pass

    def _fetch_range(self, start, end):
        """Get the specified set of bytes from remote."""
        owner = self.fs
        return owner.fetch_range(self.path, self.mode, start, end)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/data.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ from typing import Optional
4
+ from urllib.parse import unquote
5
+
6
+ from fsspec import AbstractFileSystem
7
+
8
+
9
class DataFileSystem(AbstractFileSystem):
    """A handy decoder for data-URLs

    Example
    -------
    >>> with fsspec.open("data:,Hello%2C%20World%21") as f:
    ...     print(f.read())
    b"Hello, World!"

    See https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs
    """

    protocol = "data"

    def __init__(self, **kwargs):
        """No parameters for this filesystem"""
        super().__init__(**kwargs)

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Decode the payload of a data-URL and return it as bytes.

        Parameters
        ----------
        path: str
            The data-URL; everything after the first comma is the payload.
        start, end: int or None
            Optional slice bounds applied to the decoded bytes.

        Returns
        -------
        bytes
            The base64-decoded payload when the part before the comma ends
            with ``base64``, otherwise the percent-decoded payload.
        """
        # Local import: the module header only imports ``unquote``.
        from urllib.parse import unquote_to_bytes

        pref, data = path.split(",", 1)
        if pref.endswith("base64"):
            return base64.b64decode(data)[start:end]
        # Decode percent-escapes directly to bytes. Going through ``str``
        # (unquote + encode) replaces escapes that are not valid UTF-8,
        # e.g. "%FF", with U+FFFD and so corrupts binary payloads.
        return unquote_to_bytes(data)[start:end]

    def info(self, path, **kwargs):
        """Return name, size, type and mimetype for a data-URL.

        ``name`` is the raw payload text after the first comma, ``size`` the
        length of the decoded payload, and ``mimetype`` the text between the
        first ``:`` and the first ``;`` of the prefix.
        """
        pref, name = path.split(",", 1)
        data = self.cat_file(path)
        mime = pref.split(":", 1)[1].split(";", 1)[0]
        return {"name": name, "size": len(data), "type": "file", "mimetype": mime}

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Return an in-memory binary stream over the decoded payload.

        Raises
        ------
        ValueError
            If ``mode`` does not contain ``"r"``; the filesystem is read-only.
        """
        if "r" not in mode:
            raise ValueError("Read only filesystem")
        return io.BytesIO(self.cat_file(path))

    @staticmethod
    def encode(data: bytes, mime: Optional[str] = None):
        """Format the given data into data-URL syntax

        This version always base64 encodes, even when the data is ascii/url-safe.
        """
        return f"data:{mime or ''};base64,{base64.b64encode(data).decode()}"
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dbfs.py ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import urllib
3
+
4
+ import requests
5
+ import requests.exceptions
6
+ from requests.adapters import HTTPAdapter, Retry
7
+
8
+ from fsspec import AbstractFileSystem
9
+ from fsspec.spec import AbstractBufferedFile
10
+
11
+
12
class DatabricksException(Exception):
    """
    Helper class for exceptions raised in this module.

    Carries the ``error_code`` and ``message`` of an error response so
    callers can translate specific codes into built-in exceptions.
    """

    def __init__(self, error_code, message):
        """Create a new DatabricksException

        Parameters
        ----------
        error_code: str
            Error identifier, compared against by the callers in this module.
        message: str
            Human readable description; also used as the exception text.
        """
        super().__init__(message)

        self.error_code = error_code
        self.message = message

    def __reduce__(self):
        """Rebuild from both constructor arguments when unpickled.

        ``Exception`` pickles ``self.args``, which holds only ``message``, so
        the default reconstruction would call ``__init__`` with one argument
        and fail with ``TypeError``.
        """
        return (type(self), (self.error_code, self.message))
23
+
24
+
25
class DatabricksFileSystem(AbstractFileSystem):
    """
    Get access to the Databricks filesystem implementation over HTTP.
    Can be used inside and outside of a databricks cluster.
    """

    def __init__(self, instance, token, **kwargs):
        """
        Create a new DatabricksFileSystem.

        Parameters
        ----------
        instance: str
            The instance URL of the databricks cluster.
            For example for an Azure databricks cluster, this
            has the form adb-<some-number>.<two digits>.azuredatabricks.net.
        token: str
            Your personal token. Find out more
            here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
        """
        self.instance = instance
        self.token = token
        self.session = requests.Session()
        # Retry transient HTTP failures with a short exponential backoff.
        self.retries = Retry(
            total=10,
            backoff_factor=0.05,
            status_forcelist=[408, 429, 500, 502, 503, 504],
        )

        self.session.mount("https://", HTTPAdapter(max_retries=self.retries))
        self.session.headers.update({"Authorization": f"Bearer {self.token}"})

        super().__init__(**kwargs)

    def ls(self, path, detail=True, **kwargs):
        """
        List the contents of the given path.

        Parameters
        ----------
        path: str
            Absolute path
        detail: bool
            Return not only the list of filenames,
            but also additional information on file sizes
            and types.

        Raises
        ------
        FileNotFoundError
            If the API reports ``RESOURCE_DOES_NOT_EXIST``.
        """
        out = self._ls_from_cache(path)
        if not out:
            try:
                r = self._send_to_api(
                    method="get", endpoint="list", json={"path": path}
                )
            except DatabricksException as e:
                if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                    raise FileNotFoundError(e.message) from e

                raise
            # Tolerate a response without a "files" key instead of failing
            # with KeyError. NOTE(review): presumably this is what the API
            # returns for an empty directory -- confirm against the DBFS docs.
            files = r.get("files", [])
            out = [
                {
                    "name": o["path"],
                    "type": "directory" if o["is_dir"] else "file",
                    "size": o["file_size"],
                }
                for o in files
            ]
            self.dircache[path] = out

        if detail:
            return out
        return [o["name"] for o in out]

    def makedirs(self, path, exist_ok=True):
        """
        Create a given absolute path and all of its parents.

        Parameters
        ----------
        path: str
            Absolute path to create
        exist_ok: bool
            If false, checks if the folder
            exists before creating it (and raises an
            Exception if this is the case)

        Raises
        ------
        FileExistsError
            If the path already exists and ``exist_ok`` is false, or the API
            reports ``RESOURCE_ALREADY_EXISTS``.
        """
        if not exist_ok:
            try:
                # If the following succeeds, the path is already present
                self._send_to_api(
                    method="get", endpoint="get-status", json={"path": path}
                )
                raise FileExistsError(f"Path {path} already exists")
            except DatabricksException as e:
                # Any API error from the status probe is ignored here, as in
                # the original code; the mkdirs call below reports failures.
                if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                    pass

        try:
            self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
        except DatabricksException as e:
            if e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
        self.invalidate_cache(self._parent(path))

    def mkdir(self, path, create_parents=True, **kwargs):
        """
        Create a given absolute path and all of its parents.

        Parameters
        ----------
        path: str
            Absolute path to create
        create_parents: bool
            Whether to create all parents or not.
            "False" is not implemented so far.
        """
        if not create_parents:
            raise NotImplementedError

        self.mkdirs(path, **kwargs)

    def rm(self, path, recursive=False, **kwargs):
        """
        Remove the file or folder at the given absolute path.

        Parameters
        ----------
        path: str
            Absolute path what to remove
        recursive: bool
            Recursively delete all files in a folder.

        Raises
        ------
        OSError
            If the API reports ``IO_ERROR``.
        """
        while True:
            try:
                self._send_to_api(
                    method="post",
                    endpoint="delete",
                    json={"path": path, "recursive": recursive},
                )
            except DatabricksException as e:
                # This is not really an exception, it just means
                # not everything was deleted so far: issue the request again.
                # (Previously the retry was followed by an unconditional
                # re-raise, so a successful retry still ended in an error.)
                if e.error_code == "PARTIAL_DELETE":
                    continue
                if e.error_code == "IO_ERROR":
                    # Using the same exception as the os module would use here
                    raise OSError(e.message) from e

                raise
            break
        self.invalidate_cache(self._parent(path))

    def mv(
        self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs
    ):
        """
        Move a source to a destination path.

        A note from the original [databricks API manual]
        (https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move).

        When moving a large number of files the API call will time out after
        approximately 60s, potentially resulting in partially moved data.
        Therefore, for operations that move more than 10k files, we strongly
        discourage using the DBFS REST API.

        Parameters
        ----------
        source_path: str
            From where to move (absolute path)
        destination_path: str
            To where to move (absolute path)
        recursive: bool
            Not implemented so far.
        maxdepth:
            Not implemented so far.

        Raises
        ------
        FileNotFoundError
            If the API reports ``RESOURCE_DOES_NOT_EXIST``.
        FileExistsError
            If the API reports ``RESOURCE_ALREADY_EXISTS``.
        """
        if recursive:
            raise NotImplementedError
        if maxdepth:
            raise NotImplementedError

        try:
            self._send_to_api(
                method="post",
                endpoint="move",
                json={"source_path": source_path, "destination_path": destination_path},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
        self.invalidate_cache(self._parent(source_path))
        self.invalidate_cache(self._parent(destination_path))

    def _open(self, path, mode="rb", block_size="default", **kwargs):
        """
        Overwrite the base class method to make sure to create a DBFile.
        All arguments are copied from the base method.

        Only the default blocksize is allowed.
        """
        return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)

    def _send_to_api(self, method, endpoint, json):
        """
        Send the given json to the DBFS API
        using a get or post request (specified by the argument `method`).

        Parameters
        ----------
        method: str
            Which http method to use for communication; "get" or "post".
        endpoint: str
            Where to send the request to (last part of the API URL)
        json: dict
            Dictionary of information to send

        Returns
        -------
        The decoded JSON body of a successful response.

        Raises
        ------
        ValueError
            If ``method`` is neither "get" nor "post".
        DatabricksException
            If the error response is JSON carrying ``error_code`` and
            ``message``.
        requests.HTTPError
            For any other unsuccessful response.
        """
        if method == "post":
            session_call = self.session.post
        elif method == "get":
            session_call = self.session.get
        else:
            raise ValueError(f"Do not understand method {method}")

        url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)

        r = session_call(url, json=json)

        # The DBFS API will return a json, also in case of an exception.
        # We want to preserve this information as good as possible.
        try:
            r.raise_for_status()
        except requests.HTTPError as e:
            # try to extract json error message
            # if that fails, fall back to the original exception
            try:
                exception_json = e.response.json()
            except Exception:
                raise e

            # Pick the two expected fields explicitly: unpacking the whole
            # payload as keyword arguments raised TypeError on any extra or
            # missing key and hid the real HTTP error.
            if (
                isinstance(exception_json, dict)
                and "error_code" in exception_json
                and "message" in exception_json
            ):
                raise DatabricksException(
                    exception_json["error_code"], exception_json["message"]
                ) from e
            raise

        return r.json()

    def _create_handle(self, path, overwrite=True):
        """
        Internal function to create a handle, which can be used to
        write blocks of a file to DBFS.
        A handle has a unique identifier which needs to be passed
        whenever written during this transaction.
        The handle is active for 10 minutes - after that a new
        write transaction needs to be created.
        Make sure to close the handle after you are finished.

        Parameters
        ----------
        path: str
            Absolute path for this file.
        overwrite: bool
            If a file already exist at this location, either overwrite
            it or raise an exception.

        Raises
        ------
        FileExistsError
            If the API reports ``RESOURCE_ALREADY_EXISTS``.
        """
        try:
            r = self._send_to_api(
                method="post",
                endpoint="create",
                json={"path": path, "overwrite": overwrite},
            )
            return r["handle"]
        except DatabricksException as e:
            if e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise

    def _close_handle(self, handle):
        """
        Close a handle, which was opened by :func:`_create_handle`.

        Parameters
        ----------
        handle: str
            Which handle to close.

        Raises
        ------
        FileNotFoundError
            If the API reports ``RESOURCE_DOES_NOT_EXIST``.
        """
        try:
            self._send_to_api(method="post", endpoint="close", json={"handle": handle})
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e

            raise

    def _add_data(self, handle, data):
        """
        Upload data to an already opened file handle
        (opened by :func:`_create_handle`).
        The maximal allowed data size is 1MB after
        conversion to base64.
        Remember to close the handle when you are finished.

        Parameters
        ----------
        handle: str
            Which handle to upload data to.
        data: bytes
            Block of data to add to the handle.

        Raises
        ------
        FileNotFoundError
            If the API reports ``RESOURCE_DOES_NOT_EXIST``.
        ValueError
            If the API reports ``MAX_BLOCK_SIZE_EXCEEDED``.
        """
        data = base64.b64encode(data).decode()
        try:
            self._send_to_api(
                method="post",
                endpoint="add-block",
                json={"handle": handle, "data": data},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
                raise ValueError(e.message) from e

            raise

    def _get_data(self, path, start, end):
        """
        Download data in bytes from a given absolute path in a block
        from [start, start+length].
        The maximum number of allowed bytes to read is 1MB.

        Parameters
        ----------
        path: str
            Absolute path to download data from
        start: int
            Start position of the block
        end: int
            End position of the block

        Raises
        ------
        FileNotFoundError
            If the API reports ``RESOURCE_DOES_NOT_EXIST``.
        ValueError
            If the API reports ``INVALID_PARAMETER_VALUE`` or
            ``MAX_READ_SIZE_EXCEEDED``.
        """
        try:
            r = self._send_to_api(
                method="get",
                endpoint="read",
                json={"path": path, "offset": start, "length": end - start},
            )
            return base64.b64decode(r["data"])
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
                raise ValueError(e.message) from e

            raise

    def invalidate_cache(self, path=None):
        """Drop the cached listing for ``path``, or every listing when None."""
        if path is None:
            self.dircache.clear()
        else:
            self.dircache.pop(path, None)
        super().invalidate_cache(path)
387
+
388
+
389
class DatabricksFile(AbstractBufferedFile):
    """
    Helper class for files referenced in the DatabricksFileSystem.
    """

    DEFAULT_BLOCK_SIZE = 1 * 2**20  # only allowed block size

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        **kwargs,
    ):
        """
        Create a new instance of the DatabricksFile.

        The blocksize needs to be the default one.
        """
        if block_size is None or block_size == "default":
            block_size = self.DEFAULT_BLOCK_SIZE

        assert (
            block_size == self.DEFAULT_BLOCK_SIZE
        ), f"Only the default block size is allowed, not {block_size}"

        super().__init__(
            fs,
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_type=cache_type,
            cache_options=cache_options or {},
            **kwargs,
        )

    def _initiate_upload(self):
        """Internal function to start a file upload"""
        self.handle = self.fs._create_handle(self.path)

    def _upload_chunk(self, final=False):
        """Internal function to add a chunk of data to a started upload

        The buffered data is sent in pieces of at most ``self.blocksize``
        bytes; when ``final`` is true the handle is closed afterwards.
        Always returns True.
        """
        self.buffer.seek(0)
        data = self.buffer.getvalue()

        # Slice one piece at a time rather than materialising a list holding
        # a second full copy of the buffer before the first request is sent.
        for start, end in self._to_sized_blocks(len(data)):
            self.fs._add_data(handle=self.handle, data=data[start:end])

        if final:
            self.fs._close_handle(handle=self.handle)
            return True

    def _fetch_range(self, start, end):
        """Internal function to download a block of data

        The range is requested in pieces of at most ``self.blocksize`` bytes
        and the pieces are concatenated in order.
        """
        length = end - start
        # Join once at the end; repeated ``bytes +=`` re-copies the
        # accumulated data on every iteration.
        return b"".join(
            self.fs._get_data(path=self.path, start=chunk_start, end=chunk_end)
            for chunk_start, chunk_end in self._to_sized_blocks(length, start)
        )

    def _to_sized_blocks(self, length, start=0):
        """Helper function to split a range of ``length`` bytes beginning at
        ``start`` into (start, end) pairs of at most ``self.blocksize`` bytes"""
        end = start + length
        for data_chunk in range(start, end, self.blocksize):
            data_start = data_chunk
            data_end = min(end, data_chunk + self.blocksize)
            yield data_start, data_end
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dirfs.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .. import filesystem
2
+ from ..asyn import AsyncFileSystem
3
+
4
+
5
class DirFileSystem(AsyncFileSystem):
    """Directory prefix filesystem

    A wrapper that treats every path it receives as relative to a fixed
    directory ``path``. Incoming paths are prefixed, the call is delegated to
    the wrapped filesystem, and returned paths have the prefix removed again.
    """

    protocol = "dir"

    def __init__(
        self,
        path=None,
        fs=None,
        fo=None,
        target_protocol=None,
        target_options=None,
        **storage_options,
    ):
        """
        Parameters
        ----------
        path: str
            Path to the directory.
        fs: AbstractFileSystem
            An instantiated filesystem to wrap.
        target_protocol, target_options:
            if fs is none, construct it from these
        fo: str
            Alternate for path; do not provide both
        """
        super().__init__(**storage_options)
        if fs is None:
            options = target_options or {}
            fs = filesystem(protocol=target_protocol, **options)
        # Exactly one of ``path`` / ``fo`` has to be supplied.
        if (path is None) == (fo is None):
            raise ValueError("Provide path or fo, not both")
        path = path or fo

        if self.asynchronous and not fs.async_impl:
            raise ValueError("can't use asynchronous with non-async fs")

        if fs.async_impl and self.asynchronous != fs.asynchronous:
            raise ValueError("both dirfs and fs should be in the same sync/async mode")

        self.path = fs._strip_protocol(path)
        self.fs = fs

    def _join(self, path):
        """Prefix ``path`` (a string or an iterable of strings) with ``self.path``."""
        if not isinstance(path, str):
            return [self._join(item) for item in path]
        if not self.path:
            return path
        if not path:
            return self.path
        parts = (self.path, self._strip_protocol(path))
        return self.fs.sep.join(parts)

    def _relpath(self, path):
        """Remove the ``self.path`` prefix from ``path`` (string or iterable)."""
        if not isinstance(path, str):
            return [self._relpath(item) for item in path]
        if not self.path:
            return path
        if path == self.path:
            return ""
        prefix = self.path + self.fs.sep
        assert path.startswith(prefix)
        return path[len(prefix) :]

    # Wrappers below

    @property
    def sep(self):
        """Path separator of the wrapped filesystem."""
        return self.fs.sep

    async def set_session(self, *args, **kwargs):
        return await self.fs.set_session(*args, **kwargs)

    async def _rm_file(self, path, **kwargs):
        target = self._join(path)
        return await self.fs._rm_file(target, **kwargs)

    def rm_file(self, path, **kwargs):
        target = self._join(path)
        return self.fs.rm_file(target, **kwargs)

    async def _rm(self, path, *args, **kwargs):
        target = self._join(path)
        return await self.fs._rm(target, *args, **kwargs)

    def rm(self, path, *args, **kwargs):
        target = self._join(path)
        return self.fs.rm(target, *args, **kwargs)

    async def _cp_file(self, path1, path2, **kwargs):
        src = self._join(path1)
        dst = self._join(path2)
        return await self.fs._cp_file(src, dst, **kwargs)

    def cp_file(self, path1, path2, **kwargs):
        src = self._join(path1)
        dst = self._join(path2)
        return self.fs.cp_file(src, dst, **kwargs)

    async def _copy(
        self,
        path1,
        path2,
        *args,
        **kwargs,
    ):
        src = self._join(path1)
        dst = self._join(path2)
        return await self.fs._copy(src, dst, *args, **kwargs)

    def copy(self, path1, path2, *args, **kwargs):
        src = self._join(path1)
        dst = self._join(path2)
        return self.fs.copy(src, dst, *args, **kwargs)

    async def _pipe(self, path, *args, **kwargs):
        target = self._join(path)
        return await self.fs._pipe(target, *args, **kwargs)

    def pipe(self, path, *args, **kwargs):
        target = self._join(path)
        return self.fs.pipe(target, *args, **kwargs)

    async def _pipe_file(self, path, *args, **kwargs):
        target = self._join(path)
        return await self.fs._pipe_file(target, *args, **kwargs)

    def pipe_file(self, path, *args, **kwargs):
        target = self._join(path)
        return self.fs.pipe_file(target, *args, **kwargs)

    async def _cat_file(self, path, *args, **kwargs):
        target = self._join(path)
        return await self.fs._cat_file(target, *args, **kwargs)

    def cat_file(self, path, *args, **kwargs):
        target = self._join(path)
        return self.fs.cat_file(target, *args, **kwargs)

    async def _cat(self, path, *args, **kwargs):
        """Fetch contents; dict results are re-keyed by relative path."""
        result = await self.fs._cat(self._join(path), *args, **kwargs)
        if not isinstance(result, dict):
            return result
        return {self._relpath(key): value for key, value in result.items()}

    def cat(self, path, *args, **kwargs):
        """Fetch contents; dict results are re-keyed by relative path."""
        result = self.fs.cat(self._join(path), *args, **kwargs)
        if not isinstance(result, dict):
            return result
        return {self._relpath(key): value for key, value in result.items()}

    async def _put_file(self, lpath, rpath, **kwargs):
        remote = self._join(rpath)
        return await self.fs._put_file(lpath, remote, **kwargs)

    def put_file(self, lpath, rpath, **kwargs):
        remote = self._join(rpath)
        return self.fs.put_file(lpath, remote, **kwargs)

    async def _put(
        self,
        lpath,
        rpath,
        *args,
        **kwargs,
    ):
        remote = self._join(rpath)
        return await self.fs._put(lpath, remote, *args, **kwargs)

    def put(self, lpath, rpath, *args, **kwargs):
        remote = self._join(rpath)
        return self.fs.put(lpath, remote, *args, **kwargs)

    async def _get_file(self, rpath, lpath, **kwargs):
        remote = self._join(rpath)
        return await self.fs._get_file(remote, lpath, **kwargs)

    def get_file(self, rpath, lpath, **kwargs):
        remote = self._join(rpath)
        return self.fs.get_file(remote, lpath, **kwargs)

    async def _get(self, rpath, *args, **kwargs):
        remote = self._join(rpath)
        return await self.fs._get(remote, *args, **kwargs)

    def get(self, rpath, *args, **kwargs):
        remote = self._join(rpath)
        return self.fs.get(remote, *args, **kwargs)

    async def _isfile(self, path):
        target = self._join(path)
        return await self.fs._isfile(target)

    def isfile(self, path):
        target = self._join(path)
        return self.fs.isfile(target)

    async def _isdir(self, path):
        target = self._join(path)
        return await self.fs._isdir(target)

    def isdir(self, path):
        target = self._join(path)
        return self.fs.isdir(target)

    async def _size(self, path):
        target = self._join(path)
        return await self.fs._size(target)

    def size(self, path):
        target = self._join(path)
        return self.fs.size(target)

    async def _exists(self, path):
        target = self._join(path)
        return await self.fs._exists(target)

    def exists(self, path):
        target = self._join(path)
        return self.fs.exists(target)

    async def _info(self, path, **kwargs):
        target = self._join(path)
        return await self.fs._info(target, **kwargs)

    def info(self, path, **kwargs):
        target = self._join(path)
        return self.fs.info(target, **kwargs)

    async def _ls(self, path, detail=True, **kwargs):
        """List ``path``; returned names are relative to ``self.path``."""
        listing = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
        if not detail:
            return self._relpath(listing)

        relative = []
        for info in listing:
            # Copy each entry so the wrapped filesystem's objects stay untouched.
            info = info.copy()
            info["name"] = self._relpath(info["name"])
            relative.append(info)
        return relative

    def ls(self, path, detail=True, **kwargs):
        """List ``path``; returned names are relative to ``self.path``."""
        listing = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
        if not detail:
            return self._relpath(listing)

        relative = []
        for info in listing:
            # Copy each entry so the wrapped filesystem's objects stay untouched.
            info = info.copy()
            info["name"] = self._relpath(info["name"])
            relative.append(info)
        return relative

    async def _walk(self, path, *args, **kwargs):
        """Walk below ``path``, yielding roots relative to ``self.path``."""
        top = self._join(path)
        async for root, dirs, files in self.fs._walk(top, *args, **kwargs):
            yield self._relpath(root), dirs, files

    def walk(self, path, *args, **kwargs):
        """Walk below ``path``, yielding roots relative to ``self.path``."""
        top = self._join(path)
        for root, dirs, files in self.fs.walk(top, *args, **kwargs):
            yield self._relpath(root), dirs, files

    async def _glob(self, path, **kwargs):
        """Glob under the prefix; result paths are made relative."""
        want_detail = kwargs.get("detail", False)
        matches = await self.fs._glob(self._join(path), **kwargs)
        if not want_detail:
            return self._relpath(matches)
        return {self._relpath(name): info for name, info in matches.items()}

    def glob(self, path, **kwargs):
        """Glob under the prefix; result paths are made relative."""
        want_detail = kwargs.get("detail", False)
        matches = self.fs.glob(self._join(path), **kwargs)
        if not want_detail:
            return self._relpath(matches)
        return {self._relpath(name): info for name, info in matches.items()}

    async def _du(self, path, *args, **kwargs):
        """Disk usage; the per-path mapping is re-keyed by relative path."""
        want_total = kwargs.get("total", True)
        usage = await self.fs._du(self._join(path), *args, **kwargs)
        if want_total:
            return usage
        return {self._relpath(name): size for name, size in usage.items()}

    def du(self, path, *args, **kwargs):
        """Disk usage; the per-path mapping is re-keyed by relative path."""
        want_total = kwargs.get("total", True)
        usage = self.fs.du(self._join(path), *args, **kwargs)
        if want_total:
            return usage
        return {self._relpath(name): size for name, size in usage.items()}

    async def _find(self, path, *args, **kwargs):
        """Find under the prefix; result paths are made relative."""
        want_detail = kwargs.get("detail", False)
        found = await self.fs._find(self._join(path), *args, **kwargs)
        if not want_detail:
            return self._relpath(found)
        return {self._relpath(name): info for name, info in found.items()}

    def find(self, path, *args, **kwargs):
        """Find under the prefix; result paths are made relative."""
        want_detail = kwargs.get("detail", False)
        found = self.fs.find(self._join(path), *args, **kwargs)
        if not want_detail:
            return self._relpath(found)
        return {self._relpath(name): info for name, info in found.items()}

    async def _expand_path(self, path, *args, **kwargs):
        expanded = await self.fs._expand_path(self._join(path), *args, **kwargs)
        return self._relpath(expanded)

    def expand_path(self, path, *args, **kwargs):
        expanded = self.fs.expand_path(self._join(path), *args, **kwargs)
        return self._relpath(expanded)

    async def _mkdir(self, path, *args, **kwargs):
        target = self._join(path)
        return await self.fs._mkdir(target, *args, **kwargs)

    def mkdir(self, path, *args, **kwargs):
        target = self._join(path)
        return self.fs.mkdir(target, *args, **kwargs)

    async def _makedirs(self, path, *args, **kwargs):
        target = self._join(path)
        return await self.fs._makedirs(target, *args, **kwargs)

    def makedirs(self, path, *args, **kwargs):
        target = self._join(path)
        return self.fs.makedirs(target, *args, **kwargs)

    def rmdir(self, path):
        target = self._join(path)
        return self.fs.rmdir(target)

    def mv(self, path1, path2, **kwargs):
        src = self._join(path1)
        dst = self._join(path2)
        return self.fs.mv(src, dst, **kwargs)

    def touch(self, path, **kwargs):
        target = self._join(path)
        return self.fs.touch(target, **kwargs)

    def created(self, path):
        target = self._join(path)
        return self.fs.created(target)

    def modified(self, path):
        target = self._join(path)
        return self.fs.modified(target)

    def sign(self, path, *args, **kwargs):
        target = self._join(path)
        return self.fs.sign(target, *args, **kwargs)

    def __repr__(self):
        return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"

    def open(
        self,
        path,
        *args,
        **kwargs,
    ):
        """Open the prefixed path through the wrapped filesystem."""
        target = self._join(path)
        return self.fs.open(target, *args, **kwargs)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/ftp.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import uuid
4
+ import warnings
5
+ from ftplib import FTP, Error, error_perm
6
+ from typing import Any
7
+
8
+ from ..spec import AbstractBufferedFile, AbstractFileSystem
9
+ from ..utils import infer_storage_options, isfilelike
10
+
11
+
12
class FTPFileSystem(AbstractFileSystem):
    """A filesystem over classic FTP"""

    root_marker = "/"
    cachable = False
    protocol = "ftp"

    def __init__(
        self,
        host,
        port=21,
        username=None,
        password=None,
        acct=None,
        block_size=None,
        tempdir=None,
        timeout=30,
        encoding="utf-8",
        **kwargs,
    ):
        """
        You can use _get_kwargs_from_urls to get some kwargs from
        a reasonable FTP url.

        Authentication will be anonymous if username/password are not
        given.

        Parameters
        ----------
        host: str
            The remote server name/ip to connect to
        port: int
            Port to connect with
        username: str or None
            If authenticating, the user's identifier
        password: str or None
            User's password on the server, if using
        acct: str or None
            Some servers also need an "account" string for auth
        block_size: int or None
            If given, the read-ahead or write buffer size.
        tempdir: str
            Directory on remote to put temporary files when in a transaction
        timeout: int
            Timeout of the ftp connection in seconds
        encoding: str
            Encoding to use for directories and filenames in FTP connection
        """
        super().__init__(**kwargs)
        self.host = host
        self.port = port
        self.tempdir = tempdir or "/tmp"
        self.cred = username, password, acct
        self.timeout = timeout
        self.encoding = encoding
        if block_size is not None:
            self.blocksize = block_size
        else:
            self.blocksize = 2**16
        self._connect()

    def _connect(self):
        """Create ``self.ftp``, connect to ``host:port`` and log in with ``self.cred``."""
        if sys.version_info >= (3, 9):
            self.ftp = FTP(timeout=self.timeout, encoding=self.encoding)
        elif self.encoding:
            warnings.warn("`encoding` not supported for python<3.9, ignoring")
            self.ftp = FTP(timeout=self.timeout)
        else:
            self.ftp = FTP(timeout=self.timeout)
        self.ftp.connect(self.host, self.port)
        self.ftp.login(*self.cred)

    @classmethod
    def _strip_protocol(cls, path):
        """Return the path component of ``path`` with exactly one leading "/"."""
        return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")

    @staticmethod
    def _get_kwargs_from_urls(urlpath):
        """Return the options inferred from ``urlpath`` minus "path" and "protocol"."""
        out = infer_storage_options(urlpath)
        out.pop("path", None)
        out.pop("protocol", None)
        return out

    def ls(self, path, detail=True, **kwargs):
        """List the entries of ``path``.

        Uses ``MLSD`` and falls back to parsing ``dir`` output when the
        server rejects it. Returns a list of info dicts when ``detail`` is
        true, otherwise the sorted entry names as reported by the server.

        Raises
        ------
        FileNotFoundError
            If listing fails and ``path`` cannot be resolved as a file.
        """
        path = self._strip_protocol(path)
        out = []
        if path not in self.dircache:
            try:
                try:
                    out = [
                        (fn, details)
                        for (fn, details) in self.ftp.mlsd(path)
                        if fn not in [".", ".."]
                        and details["type"] not in ["pdir", "cdir"]
                    ]
                except error_perm:
                    out = _mlsd2(self.ftp, path)  # Not platform independent
                # Separate prefix for building names: rebinding ``path`` here
                # would store the root listing under "" instead of "/", so the
                # cached root listing could never be found again.
                prefix = "" if path == "/" else path
                for fn, details in out:
                    details["name"] = "/".join([prefix, fn.lstrip("/")])
                    if details["type"] == "file":
                        details["size"] = int(details["size"])
                    else:
                        details["size"] = 0
                    if details["type"] == "dir":
                        details["type"] = "directory"
                self.dircache[path] = out
            except Error:
                # Listing failed: ``path`` may be a plain file rather than a directory
                try:
                    info = self.info(path)
                    if info["type"] == "file":
                        out = [(path, info)]
                except (Error, IndexError):
                    raise FileNotFoundError(path)
        files = self.dircache.get(path, out)
        if not detail:
            return sorted([fn for fn, details in files])
        return [details for fn, details in files]

    def info(self, path, **kwargs):
        """Return the info dict for ``path`` by searching its parent's listing.

        Raises
        ------
        FileNotFoundError
            If the parent listing has no entry named ``path``.
        """
        # implement with direct method
        path = self._strip_protocol(path)
        if path == "/":
            # special case, since this dir has no real entry
            return {"name": "/", "size": 0, "type": "directory"}
        files = self.ls(self._parent(path).lstrip("/"), True)
        try:
            out = [f for f in files if f["name"] == path][0]
        except IndexError:
            raise FileNotFoundError(path)
        return out

    def get_file(self, rpath, lpath, **kwargs):
        """Download ``rpath`` into ``lpath`` (a local path or a file-like object).

        If ``rpath`` is a directory, only the local directory is created.
        A file opened here is always closed, even when the transfer fails;
        a caller-supplied file-like object is left open.
        """
        if self.isdir(rpath):
            if not os.path.exists(lpath):
                os.mkdir(lpath)
            return
        owns_file = not isfilelike(lpath)
        outfile = open(lpath, "wb") if owns_file else lpath
        try:
            self.ftp.retrbinary(
                f"RETR {rpath}",
                blocksize=self.blocksize,
                callback=outfile.write,
            )
        finally:
            if owns_file:
                outfile.close()

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Return the bytes of ``path``, optionally starting at offset ``start``.

        When ``end`` is given the generic implementation of the base class
        is used instead of a direct ``RETR``.

        Raises
        ------
        FileNotFoundError
            If the server reports an error for the retrieval.
        """
        if end is not None:
            return super().cat_file(path, start, end, **kwargs)
        out = []

        try:
            self.ftp.retrbinary(
                f"RETR {path}",
                blocksize=self.blocksize,
                rest=start,
                callback=out.append,
            )
        except (Error, error_perm) as orig_exc:
            raise FileNotFoundError(path) from orig_exc
        return b"".join(out)

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        cache_options=None,
        autocommit=True,
        **kwargs,
    ):
        """Return an ``FTPFile`` for ``path``; ``block_size`` defaults to ``self.blocksize``."""
        path = self._strip_protocol(path)
        block_size = block_size or self.blocksize
        return FTPFile(
            self,
            path,
            mode=mode,
            block_size=block_size,
            tempdir=self.tempdir,
            autocommit=autocommit,
            cache_options=cache_options,
        )

    def _rm(self, path):
        """Delete the single file ``path`` and drop its parent's cached listing."""
        path = self._strip_protocol(path)
        self.ftp.delete(path)
        self.invalidate_cache(self._parent(path))

    def rm(self, path, recursive=False, maxdepth=None):
        """Remove the expanded paths in reverse order: files via ``rm_file``, others via ``rmdir``."""
        paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
        for p in reversed(paths):
            if self.isfile(p):
                self.rm_file(p)
            else:
                self.rmdir(p)

    def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None:
        """Create directory ``path``, first creating missing parents if requested."""
        path = self._strip_protocol(path)
        parent = self._parent(path)
        if parent != self.root_marker and not self.exists(parent) and create_parents:
            self.mkdir(parent, create_parents=create_parents)

        self.ftp.mkd(path)
        self.invalidate_cache(self._parent(path))

    def makedirs(self, path: str, exist_ok: bool = False) -> None:
        """Create ``path`` and any missing parents.

        Raises
        ------
        FileExistsError
            If ``path`` already exists and ``exist_ok`` is false.
        """
        path = self._strip_protocol(path)
        if self.exists(path):
            # NB: "/" does not "exist" as it has no directory entry
            if not exist_ok:
                raise FileExistsError(f"{path} exists without `exist_ok`")
            # exists_ok=True -> no-op
        else:
            self.mkdir(path, create_parents=True)

    def rmdir(self, path):
        """Remove directory ``path`` and drop its parent's cached listing."""
        path = self._strip_protocol(path)
        self.ftp.rmd(path)
        self.invalidate_cache(self._parent(path))

    def mv(self, path1, path2, **kwargs):
        """Rename ``path1`` to ``path2`` and drop both parents' cached listings."""
        path1 = self._strip_protocol(path1)
        path2 = self._strip_protocol(path2)
        self.ftp.rename(path1, path2)
        self.invalidate_cache(self._parent(path1))
        self.invalidate_cache(self._parent(path2))

    def __del__(self):
        # ``self.ftp`` is missing when construction failed before ``_connect``
        # assigned it; there is nothing to close in that case.
        ftp = getattr(self, "ftp", None)
        if ftp is not None:
            ftp.close()

    def invalidate_cache(self, path=None):
        """Drop the cached listing for ``path``, or every listing when ``path`` is None."""
        if path is None:
            self.dircache.clear()
        else:
            self.dircache.pop(path, None)
        super().invalidate_cache(path)
259
+
260
+
261
class TransferDone(Exception):
    """Internal exception to break out of transfer"""
265
+
266
+
267
class FTPFile(AbstractBufferedFile):
    """Interact with a remote FTP file with read/write buffering"""

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        **kwargs,
    ):
        """Set up the buffered file.

        When ``autocommit`` is false, the requested path is remembered as
        ``self.target`` and ``self.path`` is replaced by a uuid-named file
        under ``kwargs["tempdir"]``.
        """
        super().__init__(
            fs,
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_type=cache_type,
            cache_options=cache_options,
            **kwargs,
        )
        if autocommit:
            return
        self.target = self.path
        self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())])

    def commit(self):
        """Move the temporary file at ``self.path`` to ``self.target``."""
        self.fs.mv(self.path, self.target)

    def discard(self):
        """Remove the file at ``self.path``."""
        self.fs.rm(self.path)

    def _fetch_range(self, start, end):
        """Get bytes between given byte limits

        Implemented by raising an exception in the fetch callback when the
        number of bytes received reaches the requested amount.

        Will fail if the server does not respect the REST command on
        retrieve requests.
        """
        pieces = []
        received = 0

        def on_data(data):
            nonlocal received
            received += len(data)
            wanted = end - start
            if received > wanted:
                # negative slice bound trims the bytes beyond the requested range
                pieces.append(data[: wanted - received])
                if end < self.size:
                    raise TransferDone
            else:
                pieces.append(data)

            if received == wanted and end < self.size:
                raise TransferDone

        command = f"RETR {self.path}"
        try:
            self.fs.ftp.retrbinary(
                command,
                blocksize=self.blocksize,
                rest=start,
                callback=on_data,
            )
        except TransferDone:
            try:
                # stop transfer, we got enough bytes for this block
                self.fs.ftp.abort()
                self.fs.ftp.getmultiline()
            except Error:
                # reconnect if the abort itself failed
                self.fs._connect()

        return b"".join(pieces)

    def _upload_chunk(self, final=False):
        """Send the current buffer with ``STOR``, resuming at ``self.offset``; returns True."""
        self.buffer.seek(0)
        command = f"STOR {self.path}"
        self.fs.ftp.storbinary(
            command, self.buffer, blocksize=self.blocksize, rest=self.offset
        )
        return True
348
+
349
+
350
+ def _mlsd2(ftp, path="."):
351
+ """
352
+ Fall back to using `dir` instead of `mlsd` if not supported.
353
+
354
+ This parses a Linux style `ls -l` response to `dir`, but the response may
355
+ be platform dependent.
356
+
357
+ Parameters
358
+ ----------
359
+ ftp: ftplib.FTP
360
+ path: str
361
+ Expects to be given path, but defaults to ".".
362
+ """
363
+ lines = []
364
+ minfo = []
365
+ ftp.dir(path, lines.append)
366
+ for line in lines:
367
+ split_line = line.split()
368
+ if len(split_line) < 9:
369
+ continue
370
+ this = (
371
+ split_line[-1],
372
+ {
373
+ "modify": " ".join(split_line[5:8]),
374
+ "unix.owner": split_line[2],
375
+ "unix.group": split_line[3],
376
+ "unix.mode": split_line[0],
377
+ "size": split_line[4],
378
+ },
379
+ )
380
+ if "d" == this[1]["unix.mode"][0]:
381
+ this[1]["type"] = "dir"
382
+ else:
383
+ this[1]["type"] = "file"
384
+ minfo.append(this)
385
+ return minfo
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/git.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import pygit2
4
+
5
+ from fsspec.spec import AbstractFileSystem
6
+
7
+ from .memory import MemoryFile
8
+
9
+
10
class GitFileSystem(AbstractFileSystem):
    """Browse the files of a local git repo at any hash/tag/branch

    (experimental backend)
    """

    root_marker = ""
    cachable = True

    def __init__(self, path=None, fo=None, ref=None, **kwargs):
        """

        Parameters
        ----------
        path: str (optional)
            Local location of the repo (uses current directory if not given).
            May be deprecated in favour of ``fo``. When used with a higher
            level function such as fsspec.open(), may be of the form
            "git://[path-to-repo[:]][ref@]path/to/file" (but the actual
            file path should not contain "@" or ":").
        fo: str (optional)
            Same as ``path``, but passed as part of a chained URL. This one
            takes precedence if both are given.
        ref: str (optional)
            Reference to work with, could be a hash, tag or branch name.
            Falls back to "master" when not given. Note that ``ls`` and
            ``open`` also take a ref, so this becomes the default for those
            operations
        kwargs
        """
        super().__init__(**kwargs)
        location = fo or path or os.getcwd()
        self.repo = pygit2.Repository(location)
        self.ref = ref or "master"

    @classmethod
    def _strip_protocol(cls, path):
        """Drop the protocol plus any "repo:" and "ref@" prefixes from ``path``."""
        stripped = super()._strip_protocol(path).lstrip("/")
        _, colon, after_colon = stripped.partition(":")
        if colon:
            stripped = after_colon
        _, at, after_at = stripped.partition("@")
        if at:
            stripped = after_at
        return stripped.lstrip("/")

    def _path_to_object(self, path, ref):
        """Walk the tree of ``ref`` (or ``self.ref``) along ``path`` and return the object reached."""
        commit, _ = self.repo.resolve_refish(ref or self.ref)
        node = commit.tree
        for segment in path.split("/"):
            # empty segments are skipped; descent stops once a non-tree is reached
            if not segment or not isinstance(node, pygit2.Tree):
                continue
            node = node[segment]
        return node

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Extract the "path" (before ":") and "ref" (before "@") parts of a git URL."""
        if path.startswith("git://"):
            path = path[6:]
        found = {}
        head, colon, tail = path.partition(":")
        if colon:
            found["path"], path = head, tail
        head, at, tail = path.partition("@")
        if at:
            found["ref"] = head
        return found

    def ls(self, path, detail=True, ref=None, **kwargs):
        """List ``path`` at ``ref``.

        A tree yields one entry per child; any other object yields a single
        "file" entry. Returns info dicts when ``detail`` is true, else names.
        """
        path = self._strip_protocol(path)
        target = self._path_to_object(path, ref)
        if isinstance(target, pygit2.Tree):
            entries = []
            for child in target:
                is_dir = isinstance(child, pygit2.Tree)
                entries.append(
                    {
                        "type": "directory" if is_dir else "file",
                        "name": "/".join([path, child.name]).lstrip("/"),
                        "hex": child.hex,
                        "mode": f"{child.filemode:o}",
                        "size": 0 if is_dir else child.size,
                    }
                )
        else:
            entries = [
                {
                    "type": "file",
                    "name": target.name,
                    "hex": target.hex,
                    "mode": f"{target.filemode:o}",
                    "size": target.size,
                }
            ]
        if not detail:
            return [entry["name"] for entry in entries]
        return entries

    def ukey(self, path, ref=None):
        """Return the "hex" field of the info for ``path`` at ``ref``."""
        details = self.info(path, ref=ref)
        return details["hex"]

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        ref=None,
        **kwargs,
    ):
        """Return a ``MemoryFile`` holding the data of the object at ``path``."""
        blob = self._path_to_object(path, ref or self.ref)
        return MemoryFile(data=blob.data)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/github.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+
3
+ from ..spec import AbstractFileSystem
4
+ from ..utils import infer_storage_options
5
+ from .memory import MemoryFile
6
+
7
+ # TODO: add GIST backend, would be very similar
8
+
9
+
10
class GithubFileSystem(AbstractFileSystem):
    """Interface to files in github

    An instance of this class provides the files residing within a remote github
    repository. You may specify a point in the repos history, by SHA, branch
    or tag (default is current master).

    Given that code files tend to be small, and that github does not support
    retrieving partial content, we always fetch whole files.

    When using fsspec.open, allows URIs of the form:

    - "github://path/file", in which case you must specify org, repo and
      may specify sha in the extra args
    - 'github://org:repo@/precip/catalog.yml', where the org and repo are
      part of the URI
    - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included

    ``sha`` can be the full or abbreviated hex of the commit you want to fetch
    from, or a branch or tag name (so long as it doesn't contain special characters
    like "/", "?", which would have to be HTTP-encoded).

    For authorised access, you must provide username and token, which can be made
    at https://github.com/settings/tokens
    """

    url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
    rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
    protocol = "github"
    timeout = (60, 60)  # connect, read timeouts

    def __init__(
        self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
    ):
        """Store the repository coordinates and list the repository root once.

        When ``sha`` is not given, the repository's default branch is looked
        up through the GitHub API.

        Raises
        ------
        ValueError
            If only one of ``username`` and ``token`` is supplied.
        """
        super().__init__(**kwargs)
        self.org = org
        self.repo = repo
        if (username is None) ^ (token is None):
            raise ValueError("Auth required both username and token")
        self.username = username
        self.token = token
        if timeout is not None:
            self.timeout = timeout
        if sha is None:
            # look up default branch (not necessarily "master")
            u = "https://api.github.com/repos/{org}/{repo}"
            r = requests.get(
                u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
            )
            r.raise_for_status()
            sha = r.json()["default_branch"]

        self.root = sha
        self.ls("")

    @property
    def kw(self):
        """Extra ``requests`` keyword arguments: basic auth when a username is set."""
        if self.username:
            return {"auth": (self.username, self.token)}
        return {}

    @classmethod
    def repos(cls, org_or_user, is_org=True):
        """List repo names for given org or user

        This may become the top level of the FS

        Parameters
        ----------
        org_or_user: str
            Name of the github org or user to query
        is_org: bool (default True)
            Whether the name is an organisation (True) or user (False)

        Returns
        -------
        List of string
        """
        r = requests.get(
            f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos",
            timeout=cls.timeout,
        )
        r.raise_for_status()
        return [repo["name"] for repo in r.json()]

    @property
    def tags(self):
        """Names of tags in the repo"""
        r = requests.get(
            f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
            timeout=self.timeout,
            **self.kw,
        )
        r.raise_for_status()
        return [t["name"] for t in r.json()]

    @property
    def branches(self):
        """Names of branches in the repo"""
        r = requests.get(
            f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
            timeout=self.timeout,
            **self.kw,
        )
        r.raise_for_status()
        return [t["name"] for t in r.json()]

    @property
    def refs(self):
        """Named references, tags and branches"""
        return {"tags": self.tags, "branches": self.branches}

    def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
        """List files at given path

        Parameters
        ----------
        path: str
            Location to list, relative to repo root
        detail: bool
            If True, returns list of dicts, one per file; if False, returns
            list of full filenames only
        sha: str (optional)
            List at the given point in the repo history, branch or tag name or commit
            SHA
        _sha: str (optional)
            List this specific tree object (used internally to descend into trees)

        Raises
        ------
        FileNotFoundError
            If a path component is missing or the tree request returns 404.
        """
        path = self._strip_protocol(path)
        if path == "":
            _sha = sha or self.root
        if _sha is None:
            # Resolve the tree sha by descending one path component at a time
            parts = path.rstrip("/").split("/")
            so_far = ""
            _sha = sha or self.root
            for part in parts:
                out = self.ls(so_far, True, sha=sha, _sha=_sha)
                so_far += "/" + part if so_far else part
                out = [o for o in out if o["name"] == so_far]
                if not out:
                    raise FileNotFoundError(path)
                out = out[0]
                if out["type"] == "file":
                    if detail:
                        return [out]
                    else:
                        # always a list, matching the documented return type
                        return [path]
                _sha = out["sha"]
        if path not in self.dircache or sha not in [self.root, None]:
            r = requests.get(
                self.url.format(org=self.org, repo=self.repo, sha=_sha),
                timeout=self.timeout,
                **self.kw,
            )
            if r.status_code == 404:
                raise FileNotFoundError(path)
            r.raise_for_status()
            types = {"blob": "file", "tree": "directory"}
            out = [
                {
                    "name": path + "/" + f["path"] if path else f["path"],
                    "mode": f["mode"],
                    "type": types[f["type"]],
                    "size": f.get("size", 0),
                    "sha": f["sha"],
                }
                for f in r.json()["tree"]
                if f["type"] in types
            ]
            # only listings at the default point in history are cached
            if sha in [self.root, None]:
                self.dircache[path] = out
        else:
            out = self.dircache[path]
        if detail:
            return out
        else:
            return sorted([f["name"] for f in out])

    def invalidate_cache(self, path=None):
        """Clear the whole listing cache; ``path`` is ignored."""
        self.dircache.clear()

    @classmethod
    def _strip_protocol(cls, path):
        """Return the in-repo path, dropping protocol and any "org:repo@sha" part."""
        opts = infer_storage_options(path)
        if "username" not in opts:
            return super()._strip_protocol(path)
        return opts["path"].lstrip("/")

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Map the username/password/host of a github URL to org/repo/sha."""
        opts = infer_storage_options(path)
        if "username" not in opts:
            return {}
        out = {"org": opts["username"], "repo": opts["password"]}
        if opts["host"]:
            out["sha"] = opts["host"]
        return out

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        sha=None,
        **kwargs,
    ):
        """Fetch the whole file at ``path`` and return it as a ``MemoryFile``.

        Raises
        ------
        NotImplementedError
            For any mode other than "rb".
        FileNotFoundError
            If the raw-content request returns 404.
        """
        if mode != "rb":
            raise NotImplementedError
        url = self.rurl.format(
            org=self.org, repo=self.repo, path=path, sha=sha or self.root
        )
        r = requests.get(url, timeout=self.timeout, **self.kw)
        if r.status_code == 404:
            raise FileNotFoundError(path)
        r.raise_for_status()
        return MemoryFile(None, None, r.content)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/http.py ADDED
@@ -0,0 +1,871 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import io
3
+ import logging
4
+ import re
5
+ import weakref
6
+ from copy import copy
7
+ from urllib.parse import urlparse
8
+
9
+ import aiohttp
10
+ import yarl
11
+
12
+ from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper
13
+ from fsspec.callbacks import DEFAULT_CALLBACK
14
+ from fsspec.exceptions import FSTimeoutError
15
+ from fsspec.spec import AbstractBufferedFile
16
+ from fsspec.utils import (
17
+ DEFAULT_BLOCK_SIZE,
18
+ glob_translate,
19
+ isfilelike,
20
+ nullcontext,
21
+ tokenize,
22
+ )
23
+
24
+ from ..caching import AllBytes
25
+
26
+ # https://stackoverflow.com/a/15926317/3821154
27
+ ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
28
+ ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
29
+ logger = logging.getLogger("fsspec.http")
30
+
31
+
32
async def get_client(**kwargs):
    """Construct and return an ``aiohttp.ClientSession`` from ``kwargs``."""
    session = aiohttp.ClientSession(**kwargs)
    return session
34
+
35
+
36
+ class HTTPFileSystem(AsyncFileSystem):
37
+ """
38
+ Simple File-System for fetching data via HTTP(S)
39
+
40
+ ``ls()`` is implemented by loading the parent page and doing a regex
41
+ match on the result. If simple_link=True, anything of the form
42
+ "http(s)://server.com/stuff?thing=other"; otherwise only links within
43
+ HTML href tags will be used.
44
+ """
45
+
46
+ sep = "/"
47
+
48
def __init__(
    self,
    simple_links=True,
    block_size=None,
    same_scheme=True,
    size_policy=None,
    cache_type="bytes",
    cache_options=None,
    asynchronous=False,
    loop=None,
    client_kwargs=None,
    get_client=get_client,
    encoded=False,
    **storage_options,
):
    """
    NB: if this is called async, you must await set_client

    Parameters
    ----------
    block_size: int
        Blocks to read bytes; if 0, will default to raw requests file-like
        objects instead of HTTPFile instances
    simple_links: bool
        If True, will consider both HTML <a> tags and anything that looks
        like a URL; if False, will consider only the former.
    same_scheme: True
        When doing ls/glob, if this is True, only consider paths that have
        http/https matching the input URLs.
    size_policy: this argument is deprecated
    client_kwargs: dict
        Passed to aiohttp.ClientSession, see
        https://docs.aiohttp.org/en/stable/client_reference.html
        For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
    get_client: Callable[..., aiohttp.ClientSession]
        A callable which takes keyword arguments and constructs
        an aiohttp.ClientSession. Its state will be managed by
        the HTTPFileSystem class.
    storage_options: key-value
        Any other parameters passed on to requests
    cache_type, cache_options: defaults used in open
    """
    # NOTE(review): ``self`` is also passed positionally here, exactly as the
    # original call did — confirm whether the base class relies on it.
    super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
    self.block_size = DEFAULT_BLOCK_SIZE if block_size is None else block_size
    self.simple_links = simple_links
    self.same_schema = same_scheme
    self.cache_type = cache_type
    self.cache_options = cache_options
    self.client_kwargs = client_kwargs or {}
    self.get_client = get_client
    self.encoded = encoded
    self._session = None

    # Clean caching-related parameters from `storage_options`
    # before propagating them as `request_options` through `self.kwargs`.
    # TODO: Maybe rename `self.kwargs` to `self.request_options` to make
    # it clearer.
    request_options = copy(storage_options)
    self.use_listings_cache = request_options.pop("use_listings_cache", False)
    for cache_key in ("listings_expiry_time", "max_paths", "skip_instance_cache"):
        request_options.pop(cache_key, None)
    self.kwargs = request_options
112
+
113
@property
def fsid(self):
    """Filesystem identifier; always the constant string ``"http"``."""
    identifier = "http"
    return identifier
116
+
117
def encode_url(self, url):
    """Wrap ``url`` in a ``yarl.URL``, passing ``self.encoded`` as its ``encoded`` flag."""
    already_encoded = self.encoded
    return yarl.URL(url, encoded=already_encoded)
119
+
120
@staticmethod
def close_session(loop, session):
    """Close ``session``, via the loop if it is running, else via its connector.

    If ``loop`` is running, ``session.close`` is run on it with a 0.1 s
    timeout. When that is not possible or fails with one of the handled
    errors, the session's ``_connector`` (if any) is closed directly.
    """
    loop_running = loop is not None and loop.is_running()
    if loop_running:
        try:
            sync(loop, session.close, timeout=0.1)
        except (TimeoutError, FSTimeoutError, NotImplementedError):
            pass
        else:
            return
    connector = getattr(session, "_connector", None)
    if connector is None:
        return
    # close after loop is dead
    connector._close()
132
+
133
async def set_session(self):
    """Return the client session, creating it on first use.

    A new session is built with ``self.get_client(loop=self.loop,
    **self.client_kwargs)``. Unless the instance is asynchronous, a
    ``weakref.finalize`` hook is registered that calls
    ``self.close_session`` with the loop and the session.
    """
    if self._session is not None:
        return self._session
    self._session = await self.get_client(loop=self.loop, **self.client_kwargs)
    if not self.asynchronous:
        weakref.finalize(self, self.close_session, self.loop, self._session)
    return self._session
139
+
140
+ @classmethod
141
+ def _strip_protocol(cls, path):
142
+ """For HTTP, we always want to keep the full URL"""
143
+ return path
144
+
145
@classmethod
def _parent(cls, path):
    """Return the parent URL from the base class, or "" if it is 7 characters or shorter."""
    # override, since _strip_protocol is different for URLs
    candidate = super()._parent(path)
    # anything no longer than "http://" carries no location
    return candidate if len(candidate) > 7 else ""
152
+
153
async def _ls_real(self, url, detail=True, **kwargs):
    """Fetch ``url`` and return the links found in the response text.

    Only the instance-level request options in ``self.kwargs`` are sent
    with the request; per-call ``kwargs`` are accepted but not forwarded.

    Links are taken from href attributes and, when ``self.simple_links``
    is set, from anything matching the plain-URL pattern. Absolute links
    are kept only if they lie below ``url``; relative links are joined
    onto ``url``.

    Returns a list of ``{"name", "size", "type"}`` dicts when ``detail``
    is true, otherwise the sorted list of URLs.

    Raises
    ------
    FileNotFoundError
        If the server answers 404 for ``url``.
    """
    # ignoring URL-encoded arguments
    logger.debug(url)
    session = await self.set_session()
    async with session.get(self.encode_url(url), **self.kwargs) as r:
        self._raise_not_found_for_status(r, url)
        try:
            text = await r.text()
            if self.simple_links:
                links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
            else:
                links = [u[2] for u in ex.findall(text)]
        except UnicodeDecodeError:
            links = []  # binary, not HTML
    out = set()
    parts = urlparse(url)
    for link in links:
        if isinstance(link, tuple):
            link = link[1]
        if link.startswith("/") and len(link) > 1:
            # absolute URL on this server
            link = f"{parts.scheme}://{parts.netloc}{link}"
        if link.startswith("http"):
            if self.same_schema and link.startswith(url.rstrip("/") + "/"):
                out.add(link)
            elif link.replace("https", "http").startswith(
                url.replace("https", "http").rstrip("/") + "/"
            ):
                # allowed to cross http <-> https
                out.add(link)
        else:
            if link not in ["..", "../"]:
                # Ignore FTP-like "parent"
                out.add("/".join([url.rstrip("/"), link.lstrip("/")]))
    if not out and url.endswith("/"):
        # nothing found: retry the same URL without its trailing slash
        out = await self._ls_real(url.rstrip("/"), detail=False)
    if detail:
        return [
            {
                "name": u,
                "size": None,
                "type": "directory" if u.endswith("/") else "file",
            }
            for u in out
        ]
    else:
        return sorted(out)
202
+
203
+ async def _ls(self, url, detail=True, **kwargs):
204
+ if self.use_listings_cache and url in self.dircache:
205
+ out = self.dircache[url]
206
+ else:
207
+ out = await self._ls_real(url, detail=detail, **kwargs)
208
+ self.dircache[url] = out
209
+ return out
210
+
211
+ ls = sync_wrapper(_ls)
212
+
213
+ def _raise_not_found_for_status(self, response, url):
214
+ """
215
+ Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
216
+ """
217
+ if response.status == 404:
218
+ raise FileNotFoundError(url)
219
+ response.raise_for_status()
220
+
221
async def _cat_file(self, url, start=None, end=None, **kwargs):
    """Return the bytes at ``url``, optionally restricted to a byte range.

    ``self.kwargs`` updated with the per-call ``kwargs`` is passed to the
    request. When ``start`` or ``end`` is given, a ``Range`` header built
    by ``self._process_limits`` is added; an empty range (``start == end``)
    returns ``b""`` without issuing a request.

    Raises
    ------
    FileNotFoundError
        If the server answers 404 for ``url``.
    """
    request_kw = self.kwargs.copy()
    request_kw.update(kwargs)
    logger.debug(url)

    ranged = start is not None or end is not None
    if ranged:
        if start == end:
            return b""
        # copy so the caller's / instance's headers dict is not mutated
        range_headers = request_kw.pop("headers", {}).copy()
        range_headers["Range"] = await self._process_limits(url, start, end)
        request_kw["headers"] = range_headers

    session = await self.set_session()
    async with session.get(self.encode_url(url), **request_kw) as response:
        body = await response.read()
        self._raise_not_found_for_status(response, url)
    return body
238
+
239
async def _get_file(
    self, rpath, lpath, chunk_size=5 * 2**20, callback=DEFAULT_CALLBACK, **kwargs
):
    """Stream the remote ``rpath`` into ``lpath``.

    ``lpath`` may be a path, which is opened for binary writing and closed
    here, or an already open file-like object, which is left open. The
    callback is told the size from the ``content-length`` header (``None``
    if absent or unparsable) and is updated after every chunk.
    """
    request_kwargs = {**self.kwargs, **kwargs}
    logger.debug(rpath)
    session = await self.set_session()
    async with session.get(self.encode_url(rpath), **request_kwargs) as response:
        try:
            total = int(response.headers["content-length"])
        except (ValueError, KeyError):
            total = None

        callback.set_size(total)
        self._raise_not_found_for_status(response, rpath)

        caller_owns_target = isfilelike(lpath)
        if caller_owns_target:
            target = lpath
        else:
            target = open(lpath, "wb")  # noqa: ASYNC101

        try:
            while True:
                chunk = await response.content.read(chunk_size)
                target.write(chunk)
                callback.relative_update(len(chunk))
                if not chunk:
                    # the final, empty read marks the end of the stream
                    break
        finally:
            if not caller_owns_target:
                target.close()
268
+
269
async def _put_file(
    self,
    lpath,
    rpath,
    chunk_size=5 * 2**20,
    callback=DEFAULT_CALLBACK,
    method="post",
    **kwargs,
):
    """Upload ``lpath`` to ``rpath`` with a chunked POST or PUT request.

    ``lpath`` is either a path, opened here for binary reading, or an
    ``io.IOBase`` instance that is read from its current position.
    ``method`` is case-insensitive and must be "post" or "put", otherwise
    ``ValueError`` is raised.
    """

    async def stream_body():
        # An already open file object is used as-is; it may not be seekable,
        # so its size is taken from a ``size`` attribute when present.
        if isinstance(lpath, io.IOBase):
            source, seekable = nullcontext(lpath), False
        else:
            source, seekable = open(lpath, "rb"), True  # noqa: ASYNC101

        with source as f:
            if seekable:
                callback.set_size(f.seek(0, 2))
                f.seek(0)
            else:
                callback.set_size(getattr(f, "size", None))

            while chunk := f.read(chunk_size):
                yield chunk
                callback.relative_update(len(chunk))

    request_kwargs = {**self.kwargs, **kwargs}
    session = await self.set_session()

    method = method.lower()
    if method not in ("post", "put"):
        raise ValueError(
            f"method has to be either 'post' or 'put', not: {method!r}"
        )

    send = getattr(session, method)
    async with send(
        self.encode_url(rpath), data=stream_body(), **request_kwargs
    ) as resp:
        self._raise_not_found_for_status(resp, rpath)
314
+
315
async def _exists(self, path, **kwargs):
    """Return True if a GET on ``path`` answers with a status below 400.

    Any ``aiohttp.ClientError`` raised while making the request is treated
    as "does not exist".
    """
    request_kwargs = {**self.kwargs, **kwargs}
    try:
        logger.debug(path)
        session = await self.set_session()
        response = await session.get(self.encode_url(path), **request_kwargs)
        async with response:
            return response.status < 400
    except aiohttp.ClientError:
        return False
326
+
327
+ async def _isfile(self, path, **kwargs):
328
+ return await self._exists(path, **kwargs)
329
+
330
def _open(
    self,
    path,
    mode="rb",
    block_size=None,
    autocommit=None,  # XXX: This differs from the base class.
    cache_type=None,
    cache_options=None,
    size=None,
    **kwargs,
):
    """Make a file-like object

    Parameters
    ----------
    path: str
        Full URL with protocol
    mode: string
        must be "rb"
    block_size: int or None
        Bytes to download in one request; use instance value if None. If
        zero, a streaming file-like instance is returned.
    cache_type, cache_options:
        Passed to the buffered file; the instance values are used when
        these are falsy.
    size: int or None
        Size of the target, if known; otherwise looked up via ``info``.
    kwargs: key-value
        Any other parameters, passed on to the file object

    Returns
    -------
    ``HTTPFile`` when both the block size and the size are truthy,
    otherwise ``HTTPStreamFile``.
    """
    if mode != "rb":
        raise NotImplementedError
    if block_size is None:
        block_size = self.block_size
    file_kwargs = self.kwargs.copy()
    file_kwargs["asynchronous"] = self.asynchronous
    file_kwargs.update(kwargs)
    size = size or self.info(path, **kwargs)["size"]
    session = sync(self.loop, self.set_session)
    if not (block_size and size):
        # no random access possible or wanted: plain streaming
        return HTTPStreamFile(
            self,
            path,
            mode=mode,
            loop=self.loop,
            session=session,
            **file_kwargs,
        )
    return HTTPFile(
        self,
        path,
        session=session,
        block_size=block_size,
        mode=mode,
        size=size,
        cache_type=cache_type or self.cache_type,
        cache_options=cache_options or self.cache_options,
        loop=self.loop,
        **file_kwargs,
    )
385
+
386
async def open_async(self, path, mode="rb", size=None, **kwargs):
    """Create an ``AsyncStreamFile`` for ``path``.

    When ``size`` is not supplied it is looked up with ``_info``; a
    ``FileNotFoundError`` from that lookup is ignored and the size stays
    ``None``.
    """
    session = await self.set_session()
    if size is None:
        try:
            details = await self._info(path, **kwargs)
            size = details["size"]
        except FileNotFoundError:
            pass
    stream = AsyncStreamFile(
        self,
        path,
        loop=self.loop,
        session=session,
        size=size,
        **kwargs,
    )
    return stream
401
+
402
def ukey(self, url):
    """Unique identifier; assume HTTP files are static, unchanging"""
    token_parts = (url, self.kwargs, self.protocol)
    return tokenize(*token_parts)
405
+
406
async def _info(self, url, **kwargs):
    """Get info of URL

    Queries the location with HEAD and, if that fails or yields no size,
    with GET, without fetching the data. A failing GET raises
    ``FileNotFoundError``; a failing HEAD is only logged.

    The server may not supply any size information, in which case the
    size is reported as None (and certain operations on the corresponding
    file will not work).
    """
    collected = {}
    session = await self.set_session()

    for policy in ("head", "get"):
        try:
            found = await _file_info(
                self.encode_url(url),
                size_policy=policy,
                session=session,
                **self.kwargs,
                **kwargs,
            )
            collected.update(found)
            if collected.get("size") is not None:
                break
        except Exception as exc:
            if policy == "get":
                # If get failed, then raise a FileNotFoundError
                raise FileNotFoundError(url) from exc
            logger.debug("", exc_info=exc)

    result = {"name": url, "size": None}
    result.update(collected)
    result["type"] = "file"
    return result
439
+
440
async def _glob(self, path, maxdepth=None, **kwargs):
    """
    Find files by glob-matching.

    Only "*" and "[" are treated as glob characters here; "?" is not,
    because it is so common in URLs, often identifying the "query" part.

    Returns a list of matching paths, or a dict of path -> info when
    ``detail=True`` is passed in ``kwargs``.
    """
    if maxdepth is not None and maxdepth < 1:
        raise ValueError("maxdepth must be at least 1")
    import re

    ends_with_slash = path.endswith("/")  # _strip_protocol strips trailing slash
    path = self._strip_protocol(path)
    append_slash_to_dirname = ends_with_slash or path.endswith(("/**", "/*"))

    def first_index(char):
        # position of the first ``char``, or the path length if absent
        found = path.find(char)
        return found if found >= 0 else len(path)

    first_magic = min(first_index("*"), first_index("["))

    detail = kwargs.pop("detail", False)

    if not has_magic(path):
        if not await self._exists(path, **kwargs):
            # glob of non-existent returns empty
            return {} if detail else []
        if detail:
            return {path: await self._info(path, **kwargs)}
        return [path]

    # Walk from the deepest directory that precedes any glob character.
    literal_prefix = path[:first_magic]
    if "/" in literal_prefix:
        first_magic = literal_prefix.rindex("/")
        root = path[: first_magic + 1]
    else:
        root = ""
    depth = path[first_magic + 1 :].count("/") + 1

    if "**" in path:
        if maxdepth is None:
            depth = None
        else:
            idx_double_stars = path.find("**")
            depth_double_stars = path[idx_double_stars:].count("/") + 1
            depth = depth - depth_double_stars + maxdepth

    allpaths = await self._find(
        root, maxdepth=depth, withdirs=True, detail=True, **kwargs
    )

    pattern = re.compile(glob_translate(path + ("/" if ends_with_slash else "")))

    matches = {}
    for name, info in sorted(allpaths.items()):
        bare = name.rstrip("/")
        if not pattern.match(bare):
            continue
        drop_slash = (
            not append_slash_to_dirname
            and info["type"] == "directory"
            and name.endswith("/")
        )
        matches[bare if drop_slash else name] = info

    return matches if detail else list(matches)
512
+
513
+ async def _isdir(self, path):
514
+ # override, since all URLs are (also) files
515
+ try:
516
+ return bool(await self._ls(path))
517
+ except (FileNotFoundError, ValueError):
518
+ return False
519
+
520
+
521
class HTTPFile(AbstractBufferedFile):
    """
    A file-like object pointing to a remote HTTP(S) resource

    Supports only reading, with read-ahead of a predetermined block-size.

    In the case that the server does not supply the filesize, only reading of
    the complete file in one go is supported.

    Parameters
    ----------
    url: str
        Full URL of the remote resource, including the protocol
    session: aiohttp.ClientSession or None
        All calls will be made within this session, to avoid restarting
        connections where the server allows this
    block_size: int or None
        The amount of read-ahead to do, in bytes. Default is 5MB, or the value
        configured for the FileSystem creating this file
    size: None or int
        If given, this is the size of the file in bytes, and we don't attempt
        to call the server to find the value.
    kwargs: all other key-values are passed to requests calls.
    """

    def __init__(
        self,
        fs,
        url,
        session=None,
        block_size=None,
        mode="rb",
        cache_type="bytes",
        cache_options=None,
        size=None,
        loop=None,
        asynchronous=False,
        **kwargs,
    ):
        if mode != "rb":
            raise NotImplementedError("File mode not supported")
        self.asynchronous = asynchronous
        self.url = url
        self.session = session
        # supplied up front so the base class has name/size/type available
        self.details = {"name": url, "size": size, "type": "file"}
        super().__init__(
            fs=fs,
            path=url,
            mode=mode,
            block_size=block_size,
            cache_type=cache_type,
            cache_options=cache_options,
            **kwargs,
        )
        self.loop = loop

    def read(self, length=-1):
        """Read bytes from file

        Parameters
        ----------
        length: int
            Read up to this many bytes. If negative, read all content to end of
            file. If the server has not supplied the filesize, attempting to
            read only part of the data will raise a ValueError.
        """
        if length < 0 and self.loc == 0:
            # explicit read all, but not when the size is known and fits
            # into a block anyways
            if self.size is None or self.size > self.blocksize:
                self._fetch_all()
        if self.size is not None:
            length = min(self.size - self.loc, length)
        elif length < 0:
            self._fetch_all()
        return super().read(length)

    async def async_fetch_all(self):
        """Read whole file in one shot, without caching

        This is only called when position is still at zero,
        and read() is called without a byte-count.
        """
        logger.debug(f"Fetch all for {self}")
        if isinstance(self.cache, AllBytes):
            # everything has been downloaded already
            return
        response = await self.session.get(
            self.fs.encode_url(self.url), **self.kwargs
        )
        async with response:
            response.raise_for_status()
            payload = await response.read()
            self.cache = AllBytes(
                size=len(payload), fetcher=None, blocksize=None, data=payload
            )
            self.size = len(payload)

    _fetch_all = sync_wrapper(async_fetch_all)

    def _parse_content_range(self, headers):
        """Parse the Content-Range header

        Returns ``(start, end, total)``; each item is None where the header
        gives "*", and all three are None when the header is missing or
        does not match.
        """
        header = headers.get("Content-Range", "")
        matched = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", header)
        if not matched:
            return None, None, None

        span, complete = matched[1], matched[2]
        if span == "*":
            start = end = None
        else:
            start, end = (int(part) for part in span.split("-"))
        total = None if complete == "*" else int(complete)
        return start, end, total

    async def async_fetch_range(self, start, end):
        """Download a block of data

        The expectation is that the server returns only the requested bytes,
        with HTTP code 206. If this is not the case, we first check the headers,
        and then stream the output - if the data size is bigger than we
        requested, an exception is raised.
        """
        logger.debug(f"Fetch range for {self}: {start}-{end}")
        kwargs = self.kwargs.copy()
        headers = kwargs.pop("headers", {}).copy()
        headers["Range"] = f"bytes={start}-{end - 1}"
        logger.debug(f"{self.url} : {headers['Range']}")
        response = await self.session.get(
            self.fs.encode_url(self.url), headers=headers, **kwargs
        )
        async with response:
            if response.status == 416:
                # range request outside file
                return b""
            response.raise_for_status()

            # If the server has handled the range request, it should reply
            # with status 206 (partial content). But we'll guess that a suitable
            # Content-Range header or a Content-Length no more than the
            # requested range also mean we have got the desired range.
            response_is_range = (
                response.status == 206
                or self._parse_content_range(response.headers)[0] == start
                or int(response.headers.get("Content-Length", end + 1))
                <= end - start
            )

            if response_is_range:
                # partial content, as expected
                return await response.read()
            if start > 0:
                raise ValueError(
                    "The HTTP server doesn't appear to support range requests. "
                    "Only reading this file from the beginning is supported. "
                    "Open with block_size=0 for a streaming file interface."
                )

            # Response is not a range, but we want the start of the file,
            # so we can read the required amount anyway.
            wanted = end - start
            received = 0
            pieces = []
            # data size unknown, let's read until we have enough
            while chunk := await response.content.read(2**20):
                pieces.append(chunk)
                received += len(chunk)
                if received > wanted:
                    break
            return b"".join(pieces)[:wanted]

    _fetch_range = sync_wrapper(async_fetch_range)

    def __reduce__(self):
        cache_name = self.cache.name if self.cache else "none"
        state = (
            self.fs,
            self.url,
            self.mode,
            self.blocksize,
            cache_name,
            self.size,
        )
        return reopen, state
706
+
707
+
708
def reopen(fs, url, mode, blocksize, cache_type, size=None):
    """Re-create a file object by opening ``url`` on ``fs`` again.

    Used as the reconstruction callable returned from ``__reduce__``.
    """
    open_options = {
        "mode": mode,
        "block_size": blocksize,
        "cache_type": cache_type,
        "size": size,
    }
    return fs.open(url, **open_options)
712
+
713
+
714
# Only "*" and "[" count as glob characters for HTTP paths.
magic_check = re.compile("([*[])")


def has_magic(s):
    """Return True if ``s`` contains a "*" or "[" glob character."""
    return magic_check.search(s) is not None
720
+
721
+
722
class HTTPStreamFile(AbstractBufferedFile):
    """Read-only, forward-only file over a single streaming GET request.

    The request is issued in the constructor; data is then read from the
    response body on demand. Seeking anywhere but the current position
    raises ``ValueError``.
    """

    def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs):
        self.asynchronous = kwargs.pop("asynchronous", False)
        self.url = url
        self.loop = loop
        self.session = session
        if mode != "rb":
            raise ValueError
        self.details = {"name": url, "size": None}
        super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs)

        async def start_request():
            # enter the response context by hand; it is released in close()
            response = await self.session.get(
                self.fs.encode_url(url), **kwargs
            ).__aenter__()
            self.fs._raise_not_found_for_status(response, url)
            return response

        self.r = sync(self.loop, start_request)

    def seek(self, loc, whence=0):
        """Accept only seeks that leave the position unchanged."""
        stays_put = (whence == 1 and loc == 0) or (whence == 0 and loc == self.loc)
        if not stays_put:
            raise ValueError("Cannot seek streaming HTTP file")

    async def _read(self, num=-1):
        data = await self.r.content.read(num)
        self.loc += len(data)
        return data

    read = sync_wrapper(_read)

    async def _close(self):
        self.r.close()

    def close(self):
        # the response lives on the filesystem's loop, so close it there
        asyncio.run_coroutine_threadsafe(self._close(), self.loop)
        super().close()

    def __reduce__(self):
        state = (self.fs, self.url, self.mode, self.blocksize, self.cache.name)
        return reopen, state
763
+
764
+
765
class AsyncStreamFile(AbstractAsyncStreamedFile):
    """Asynchronous, read-only streaming file over one GET request.

    The request is only made on the first ``read``; ``close`` releases
    the response if one was opened.
    """

    def __init__(
        self, fs, url, mode="rb", loop=None, session=None, size=None, **kwargs
    ):
        self.url = url
        self.session = session
        self.r = None  # response, created lazily by read()
        if mode != "rb":
            raise ValueError
        self.details = {"name": url, "size": None}
        self.kwargs = kwargs
        super().__init__(fs=fs, path=url, mode=mode, cache_type="none")
        self.size = size

    async def read(self, num=-1):
        """Read up to ``num`` bytes from the response body."""
        if self.r is None:
            request = self.session.get(self.fs.encode_url(self.url), **self.kwargs)
            response = await request.__aenter__()
            self.fs._raise_not_found_for_status(response, self.url)
            self.r = response
        data = await self.r.content.read(num)
        self.loc += len(data)
        return data

    async def close(self):
        """Release the response, if any, then close the file."""
        if self.r is not None:
            self.r.close()
            self.r = None
        await super().close()
795
+
796
+
797
async def get_range(session, url, start, end, file=None, **kwargs):
    """Fetch bytes ``[start, end)`` of ``url`` with an explicit Range request.

    Only for use when a range request is known to be safe for the server.

    Parameters
    ----------
    session:
        Client session whose ``get`` coroutine issues the request.
    url: str
        Location to read from.
    start, end: int
        Byte offsets; ``end`` is exclusive.
    file: str or None
        If given, the bytes are written into this existing file at offset
        ``start`` and nothing is returned.
    kwargs: key-value
        Passed to ``session.get``; any ``headers`` entry is copied before
        the ``Range`` header is added.

    Returns
    -------
    The downloaded bytes, or None when ``file`` is given.
    """
    kwargs = kwargs.copy()
    headers = kwargs.pop("headers", {}).copy()
    headers["Range"] = f"bytes={start}-{end - 1}"
    r = await session.get(url, headers=headers, **kwargs)
    async with r:
        # check the status inside the context so that an error response is
        # released as well, instead of being left open
        r.raise_for_status()
        out = await r.read()
    if file:
        with open(file, "r+b") as f:  # noqa: ASYNC101
            f.seek(start)
            f.write(out)
    else:
        return out
812
+
813
+
814
async def _file_info(url, session, size_policy="head", **kwargs):
    """Call HEAD on the server to get details about the file (size/checksum etc.)

    Default operation is to explicitly allow redirects and use encoding
    'identity' (no compression) to get the true size of the target.

    Parameters
    ----------
    url: str
        Location to query.
    session:
        Client session providing ``head`` and ``get`` coroutines.
    size_policy: "head" or "get"
        Which request method to use; anything else raises ``TypeError``.
    kwargs: key-value
        Passed to the request. ``allow_redirects`` defaults to True and
        ``headers`` is copied before ``Accept-Encoding`` is set.

    Returns
    -------
    dict with "url", and where the response provides them "size",
    "mimetype" and the checksum headers "ETag", "Content-MD5", "Digest".
    """
    logger.debug("Retrieve file size for %s", url)
    kwargs = kwargs.copy()
    ar = kwargs.pop("allow_redirects", True)
    head = kwargs.get("headers", {}).copy()
    head["Accept-Encoding"] = "identity"
    kwargs["headers"] = head

    info = {}
    if size_policy == "head":
        r = await session.head(url, allow_redirects=ar, **kwargs)
    elif size_policy == "get":
        r = await session.get(url, allow_redirects=ar, **kwargs)
    else:
        raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
    async with r:
        r.raise_for_status()

        # TODO:
        #  recognise lack of 'Accept-Ranges',
        #                 or 'Accept-Ranges': 'none' (not 'bytes')
        #  to mean streaming only, no random access => return None
        if "Content-Length" in r.headers:
            # Some servers may choose to ignore Accept-Encoding and return
            # compressed content, in which case the returned size is unreliable.
            if "Content-Encoding" not in r.headers or r.headers["Content-Encoding"] in [
                "identity",
                "",
            ]:
                info["size"] = int(r.headers["Content-Length"])
        elif "Content-Range" in r.headers:
            # The complete length may be "*" (unknown), or the header may be
            # malformed; in both cases leave the size unset rather than
            # failing the whole lookup.
            total = r.headers["Content-Range"].rpartition("/")[2].strip()
            if total.isdigit():
                info["size"] = int(total)

        if "Content-Type" in r.headers:
            info["mimetype"] = r.headers["Content-Type"].partition(";")[0]

        info["url"] = str(r.url)

        for checksum_field in ["ETag", "Content-MD5", "Digest"]:
            if r.headers.get(checksum_field):
                info[checksum_field] = r.headers[checksum_field]

    return info
862
+
863
+
864
async def _file_size(url, session=None, *args, **kwargs):
    """Return the size reported for ``url``, or None if it is unknown.

    A client from ``get_client`` is created when no session is passed.
    """
    active_session = session if session is not None else await get_client()
    details = await _file_info(url, session=active_session, *args, **kwargs)
    return details.get("size")
869
+
870
+
871
+ file_size = sync_wrapper(_file_size)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/jupyter.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import re
4
+
5
+ import requests
6
+
7
+ import fsspec
8
+
9
+
10
class JupyterFileSystem(fsspec.AbstractFileSystem):
    """View of the files as seen by a Jupyter server (notebook or lab)"""

    protocol = ("jupyter", "jlab")

    def __init__(self, url, tok=None, **kwargs):
        """

        Parameters
        ----------
        url : str
            Base URL of the server, like "http://127.0.0.1:8888". May include
            token in the string, which is given by the process when starting up
        tok : str
            If the token is obtained separately, can be given here
        kwargs

        Raises
        ------
        ValueError
            If ``url`` has a query string, no ``tok`` is given and no token
            can be found in the URL.
        """
        if "?" in url:
            if tok is None:
                try:
                    tok = re.findall("token=([a-z0-9]+)", url)[0]
                except IndexError as e:
                    raise ValueError("Could not determine token") from e
            url = url.split("?", 1)[0]
        self.url = url.rstrip("/") + "/api/contents"
        self.session = requests.Session()
        if tok:
            self.session.headers["Authorization"] = f"token {tok}"

        super().__init__(**kwargs)

    def ls(self, path, detail=True, **kwargs):
        """List ``path``; raises ``FileNotFoundError`` on a 404 response."""
        path = self._strip_protocol(path)
        r = self.session.get(f"{self.url}/{path}")
        if r.status_code == 404:
            # was ``return FileNotFoundError(path)``, which handed the
            # exception object back to the caller as if it were the listing
            raise FileNotFoundError(path)
        r.raise_for_status()
        out = r.json()

        if out["type"] == "directory":
            out = out["content"]
        else:
            out = [out]
        for o in out:
            o["name"] = o.pop("path")
            # tolerate entries that carry no "content" key
            o.pop("content", None)
            if o["type"] == "notebook":
                o["type"] = "file"
        if detail:
            return out
        return [o["name"] for o in out]

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Return the bytes of ``path``, sliced to ``[start:end]``.

        Raises ``FileNotFoundError`` on a 404 response.
        """
        path = self._strip_protocol(path)
        r = self.session.get(f"{self.url}/{path}")
        if r.status_code == 404:
            # raise, do not return, the exception (see ``ls``)
            raise FileNotFoundError(path)
        r.raise_for_status()
        out = r.json()
        if out["format"] == "text":
            # data should be binary
            b = out["content"].encode()
        else:
            b = base64.b64decode(out["content"])
        return b[start:end]

    def pipe_file(self, path, value, **_):
        """Write the bytes ``value`` to ``path`` as a base64-encoded file."""
        path = self._strip_protocol(path)
        json = {
            "name": path.rsplit("/", 1)[-1],
            "path": path,
            "size": len(value),
            "content": base64.b64encode(value).decode(),
            "format": "base64",
            "type": "file",
        }
        self.session.put(f"{self.url}/{path}", json=json)

    def mkdir(self, path, create_parents=True, **kwargs):
        """Create directory ``path``, parents first when ``create_parents``."""
        path = self._strip_protocol(path)
        if create_parents and "/" in path:
            self.mkdir(path.rsplit("/", 1)[0], True)
        json = {
            "name": path.rsplit("/", 1)[-1],
            "path": path,
            "size": None,
            "content": None,
            "type": "directory",
        }
        self.session.put(f"{self.url}/{path}", json=json)

    def _rm(self, path):
        path = self._strip_protocol(path)
        self.session.delete(f"{self.url}/{path}")

    def _open(self, path, mode="rb", **kwargs):
        """Open ``path``: an in-memory copy for "rb", else a buffered writer."""
        path = self._strip_protocol(path)
        if mode == "rb":
            data = self.cat_file(path)
            return io.BytesIO(data)
        else:
            return SimpleFileWriter(self, path, mode="wb")
112
+
113
+
114
class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
    """Writer that sends the whole buffer in one ``pipe_file`` call."""

    def _upload_chunk(self, final=False):
        """Never uploads a chunk until file is done

        Not suitable for large files
        """
        if final is not False:
            self.buffer.seek(0)
            payload = self.buffer.read()
            self.fs.pipe_file(self.path, payload)
            return None
        return False
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/libarchive.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import contextmanager
2
+ from ctypes import (
3
+ CFUNCTYPE,
4
+ POINTER,
5
+ c_int,
6
+ c_longlong,
7
+ c_void_p,
8
+ cast,
9
+ create_string_buffer,
10
+ )
11
+
12
+ import libarchive
13
+ import libarchive.ffi as ffi
14
+
15
+ from fsspec import open_files
16
+ from fsspec.archive import AbstractArchiveFileSystem
17
+ from fsspec.implementations.memory import MemoryFile
18
+ from fsspec.utils import DEFAULT_BLOCK_SIZE
19
+
20
+ # Libarchive requires seekable files or memory only for certain archive
21
+ # types. However, since we read the directory first to cache the contents
22
+ # and also allow random access to any file, the file-like object needs
23
+ # to be seekable no matter what.
24
+
25
+ # Seek call-backs (not provided in the libarchive python wrapper)
26
+ SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int)
27
+ read_set_seek_callback = ffi.ffi(
28
+ "read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int
29
+ )
30
+ new_api = hasattr(ffi, "NO_OPEN_CB")
31
+
32
+
33
@contextmanager
def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size):
    """Read an archive from a seekable file-like object.

    The `file` object must support the standard `readinto` and 'seek' methods.
    Yields a ``libarchive.read.ArchiveRead`` bound to ``file``.
    """
    scratch = create_string_buffer(block_size)
    scratch_address = cast(scratch, c_void_p)

    def on_read(archive_p, context, ptrptr):
        # readinto the buffer, returns number of bytes read
        count = file.readinto(scratch)
        # write the address of the buffer into the pointer
        target = cast(ptrptr, POINTER(c_void_p))
        target[0] = scratch_address
        # tell libarchive how much data was written into the buffer
        return count

    def on_seek(archive_p, context, offset, whence):
        file.seek(offset, whence)
        # tell libarchive the current position
        return file.tell()

    read_cb = ffi.READ_CALLBACK(on_read)
    seek_cb = SEEK_CALLBACK(on_seek)

    if not new_api:
        open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB)
        close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB)
    else:
        open_cb = ffi.NO_OPEN_CB
        close_cb = ffi.NO_CLOSE_CB

    with libarchive.read.new_archive_read(format_name, filter_name) as archive_p:
        read_set_seek_callback(archive_p, seek_cb)
        ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
        yield libarchive.read.ArchiveRead(archive_p)
70
+
71
+
72
class LibArchiveFileSystem(AbstractArchiveFileSystem):
    """Compressed archives as a file-system (read-only)

    Supports the following formats:
    tar, pax , cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar
    Microsoft CAB, 7-Zip, WARC

    See the libarchive documentation for further restrictions.
    https://www.libarchive.org/

    Keeps file object open while instance lives. It only works in seekable
    file-like objects. In case the filesystem does not support this kind of
    file object, it is recommended to cache locally.

    This class is pickleable, but not necessarily thread-safe (depends on the
    platform). See libarchive documentation for details.
    """

    root_marker = ""
    protocol = "libarchive"
    cachable = False

    def __init__(
        self,
        fo="",
        mode="r",
        target_protocol=None,
        target_options=None,
        block_size=DEFAULT_BLOCK_SIZE,
        **kwargs,
    ):
        """
        Parameters
        ----------
        fo: str or file-like
            Contains the archive, and must exist. If a str, will fetch file
            using :meth:`~fsspec.open_files`, which must return one file
            exactly.
        mode: str
            Currently, only 'r' accepted
        target_protocol: str (optional)
            If ``fo`` is a string, this value can be used to override the
            FS protocol inferred from a URL
        target_options: dict (optional)
            Kwargs passed when instantiating the target FS, if ``fo`` is
            a string.
        block_size: int
            Size of the buffer used when reading from ``fo``.
        """
        # ``self`` is already bound by super(); it used to be passed a second
        # time as a stray positional argument
        super().__init__(**kwargs)
        if mode != "r":
            raise ValueError("Only read from archive files accepted")
        if isinstance(fo, str):
            files = open_files(fo, protocol=target_protocol, **(target_options or {}))
            if len(files) != 1:
                raise ValueError(
                    f'Path "{fo}" did not resolve to exactly one file: "{files}"'
                )
            fo = files[0]
        self.of = fo
        self.fo = fo.__enter__()  # the whole instance is a context
        self.block_size = block_size
        self.dir_cache = None

    @contextmanager
    def _open_archive(self):
        """Yield an archive reader positioned at the start of the file."""
        self.fo.seek(0)
        with custom_reader(self.fo, block_size=self.block_size) as arc:
            yield arc

    @classmethod
    def _strip_protocol(cls, path):
        # file paths are always relative to the archive root
        return super()._strip_protocol(path).lstrip("/")

    def _get_dirs(self):
        """Populate ``self.dir_cache`` from the archive, once."""
        fields = {
            "name": "pathname",
            "size": "size",
            "created": "ctime",
            "mode": "mode",
            "uid": "uid",
            "gid": "gid",
            "mtime": "mtime",
        }

        if self.dir_cache is not None:
            return

        self.dir_cache = {}
        list_names = []
        with self._open_archive() as arc:
            for entry in arc:
                if not entry.isdir and not entry.isfile:
                    # Skip symbolic links, fifo entries, etc.
                    continue
                # NOTE: a per-entry ``_all_dirnames(set(entry.name))`` call
                # used to sit here; it iterated the characters of the name.
                # Directory names are collected once, after the loop.
                f = {key: getattr(entry, attr) for key, attr in fields.items()}
                f["type"] = "directory" if entry.isdir else "file"
                list_names.append(entry.name)

                self.dir_cache[f["name"]] = f
        # libarchive does not seem to return an entry for the directories (at least
        # not in all formats), so get the directories names from the files names
        self.dir_cache.update(
            {
                dirname: {"name": dirname, "size": 0, "type": "directory"}
                for dirname in self._all_dirnames(list_names)
            }
        )

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Return an in-memory file holding the content of member ``path``.

        Raises
        ------
        NotImplementedError
            If ``mode`` is not "rb".
        FileNotFoundError
            If the archive has no member called ``path``.
        ValueError
            If the member reports a non-zero size but yields no data.
        """
        path = self._strip_protocol(path)
        if mode != "rb":
            raise NotImplementedError

        data = bytes()
        with self._open_archive() as arc:
            for entry in arc:
                if entry.pathname != path:
                    continue

                if entry.size != 0:
                    # gather every block instead of trusting the first one
                    # to hold the whole member
                    data = b"".join(entry.get_blocks(entry.size))
                    if not data:
                        raise ValueError(
                            f"No data could be read for {path!r} from the archive"
                        )
                # an empty member has no blocks; either way we are done
                break
            else:
                raise FileNotFoundError(path)
        return MemoryFile(fs=self, path=path, data=data)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/local.py ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import io
3
+ import logging
4
+ import os
5
+ import os.path as osp
6
+ import shutil
7
+ import stat
8
+ import tempfile
9
+
10
+ from fsspec import AbstractFileSystem
11
+ from fsspec.compression import compr
12
+ from fsspec.core import get_compression
13
+ from fsspec.utils import isfilelike, stringify_path
14
+
15
+ logger = logging.getLogger("fsspec.local")
16
+
17
+
18
class LocalFileSystem(AbstractFileSystem):
    """Interface to files on local storage

    Parameters
    ----------
    auto_mkdir: bool
        Whether, when opening a file, the directory containing it should
        be created (if it doesn't already exist). This is assumed by pyarrow
        code.
    """

    root_marker = "/"
    protocol = "file", "local"
    local_file = True

    def __init__(self, auto_mkdir=False, **kwargs):
        super().__init__(**kwargs)
        self.auto_mkdir = auto_mkdir

    @property
    def fsid(self):
        """Identifier of this filesystem; always ``"local"``."""
        return "local"

    def mkdir(self, path, create_parents=True, **kwargs):
        """Create directory ``path``.

        Raises ``FileExistsError`` if the path already exists.  With
        ``create_parents`` missing ancestors are created too; otherwise
        ``kwargs`` are forwarded to ``os.mkdir``.
        """
        path = self._strip_protocol(path)
        if self.exists(path):
            raise FileExistsError(path)
        if create_parents:
            self.makedirs(path, exist_ok=True)
        else:
            os.mkdir(path, **kwargs)

    def makedirs(self, path, exist_ok=False):
        """Create ``path`` and any missing ancestors via ``os.makedirs``."""
        path = self._strip_protocol(path)
        os.makedirs(path, exist_ok=exist_ok)

    def rmdir(self, path):
        """Remove the directory ``path`` via ``os.rmdir``."""
        path = self._strip_protocol(path)
        os.rmdir(path)

    def ls(self, path, detail=False, **kwargs):
        """List ``path``.

        For a directory, one entry per ``os.scandir`` result; for anything
        else, a single entry describing ``path`` itself.  Returns info dicts
        when ``detail`` is true, otherwise just the names.
        """
        path = self._strip_protocol(path)
        info = self.info(path)
        if info["type"] == "directory":
            with os.scandir(path) as it:
                infos = [self.info(f) for f in it]
        else:
            infos = [info]

        if not detail:
            return [i["name"] for i in infos]
        return infos

    def info(self, path, **kwargs):
        """Return an info dict for ``path`` (a str/path-like or ``os.DirEntry``).

        The dict holds ``name``, ``size``, ``type`` (``"directory"``,
        ``"file"`` or ``"other"``), ``created``, ``islink`` and the stat
        fields ``mode``, ``uid``, ``gid``, ``mtime``, ``ino``, ``nlink``.
        For symbolic links ``destination`` is added and ``size`` is that of
        the link target (0 if the target cannot be stat'ed).
        """
        if isinstance(path, os.DirEntry):
            # scandir DirEntry: type is taken without following symlinks
            out = path.stat(follow_symlinks=False)
            link = path.is_symlink()
            if path.is_dir(follow_symlinks=False):
                t = "directory"
            elif path.is_file(follow_symlinks=False):
                t = "file"
            else:
                t = "other"
            path = self._strip_protocol(path.path)
        else:
            # str or path-like: for links the stat of the target is used
            path = self._strip_protocol(path)
            out = os.stat(path, follow_symlinks=False)
            link = stat.S_ISLNK(out.st_mode)
            if link:
                out = os.stat(path, follow_symlinks=True)
            if stat.S_ISDIR(out.st_mode):
                t = "directory"
            elif stat.S_ISREG(out.st_mode):
                t = "file"
            else:
                t = "other"
        result = {
            "name": path,
            "size": out.st_size,
            "type": t,
            "created": out.st_ctime,
            "islink": link,
        }
        for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
            result[field] = getattr(out, f"st_{field}")
        if result["islink"]:
            result["destination"] = os.readlink(path)
            try:
                out2 = os.stat(path, follow_symlinks=True)
                result["size"] = out2.st_size
            except OSError:
                result["size"] = 0
        return result

    def lexists(self, path, **kwargs):
        """Return True if ``path`` exists, including broken symbolic links."""
        # Strip the protocol like every other method so that "file://" URLs
        # are accepted here too.
        return osp.lexists(self._strip_protocol(path))

    def cp_file(self, path1, path2, **kwargs):
        """Copy file ``path1`` to ``path2``; for a directory, create ``path2``.

        Raises ``FileNotFoundError`` if ``path1`` is neither a file nor a
        directory.
        """
        path1 = self._strip_protocol(path1)
        path2 = self._strip_protocol(path2)
        if self.auto_mkdir:
            self.makedirs(self._parent(path2), exist_ok=True)
        if self.isfile(path1):
            shutil.copyfile(path1, path2)
        elif self.isdir(path1):
            self.makedirs(path2, exist_ok=True)
        else:
            raise FileNotFoundError(path1)

    def isfile(self, path):
        path = self._strip_protocol(path)
        return os.path.isfile(path)

    def isdir(self, path):
        path = self._strip_protocol(path)
        return os.path.isdir(path)

    def get_file(self, path1, path2, callback=None, **kwargs):
        """Copy ``path1`` into ``path2``, which may be a path or file-like."""
        if isfilelike(path2):
            # path1 may carry a "file://" prefix; open() needs a plain path
            with open(self._strip_protocol(path1), "rb") as f:
                shutil.copyfileobj(f, path2)
        else:
            return self.cp_file(path1, path2, **kwargs)

    def put_file(self, path1, path2, callback=None, **kwargs):
        """Copy ``path1`` to ``path2``; identical to ``cp_file`` locally."""
        return self.cp_file(path1, path2, **kwargs)

    def mv(self, path1, path2, **kwargs):
        """Move ``path1`` to ``path2`` using ``shutil.move``."""
        path1 = self._strip_protocol(path1)
        path2 = self._strip_protocol(path2)
        shutil.move(path1, path2)

    def link(self, src, dst, **kwargs):
        """Create a hard link ``dst`` pointing at ``src``."""
        src = self._strip_protocol(src)
        dst = self._strip_protocol(dst)
        os.link(src, dst, **kwargs)

    def symlink(self, src, dst, **kwargs):
        """Create a symbolic link ``dst`` pointing at ``src``."""
        src = self._strip_protocol(src)
        dst = self._strip_protocol(dst)
        os.symlink(src, dst, **kwargs)

    def islink(self, path) -> bool:
        return os.path.islink(self._strip_protocol(path))

    def rm_file(self, path):
        os.remove(self._strip_protocol(path))

    def rm(self, path, recursive=False, maxdepth=None):
        """Remove one path or a list of paths.

        Directories require ``recursive=True`` and are removed with
        ``shutil.rmtree``; the current working directory is refused.
        ``maxdepth`` is accepted but not used here.
        """
        if not isinstance(path, list):
            path = [path]

        for p in path:
            p = self._strip_protocol(p)
            if self.isdir(p):
                if not recursive:
                    raise ValueError("Cannot delete directory, set recursive=True")
                if osp.abspath(p) == os.getcwd():
                    raise ValueError("Cannot delete current working directory")
                shutil.rmtree(p)
            else:
                os.remove(p)

    def unstrip_protocol(self, name):
        name = self._strip_protocol(name)  # normalise for local/win/...
        return f"file://{name}"

    def _open(self, path, mode="rb", block_size=None, **kwargs):
        """Open ``path`` as a ``LocalFileOpener``.

        With ``auto_mkdir`` and a mode containing ``"w"``, the parent
        directory is created first.
        """
        path = self._strip_protocol(path)
        if self.auto_mkdir and "w" in mode:
            self.makedirs(self._parent(path), exist_ok=True)
        return LocalFileOpener(path, mode, fs=self, **kwargs)

    def touch(self, path, truncate=True, **kwargs):
        """Create ``path`` if missing, else update its times; optionally truncate."""
        path = self._strip_protocol(path)
        if self.auto_mkdir:
            self.makedirs(self._parent(path), exist_ok=True)
        if self.exists(path):
            os.utime(path, None)
        else:
            open(path, "a").close()
        if truncate:
            os.truncate(path, 0)

    def created(self, path):
        """Return the ``created`` info field as a UTC-aware datetime."""
        info = self.info(path=path)
        return datetime.datetime.fromtimestamp(
            info["created"], tz=datetime.timezone.utc
        )

    def modified(self, path):
        """Return the ``mtime`` info field as a UTC-aware datetime."""
        info = self.info(path=path)
        return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)

    @classmethod
    def _parent(cls, path):
        """Return the parent of ``path`` (posix-style separators)."""
        path = cls._strip_protocol(path)
        if os.sep == "/":
            # posix native
            return path.rsplit("/", 1)[0] or "/"
        else:
            # NT
            path_ = path.rsplit("/", 1)[0]
            if len(path_) <= 3:
                if path_[1:2] == ":":
                    # nt root (something like c:/)
                    return path_[0] + ":/"
            # More cases may be required here
            return path_

    @classmethod
    def _strip_protocol(cls, path):
        """Drop a ``file:``/``local:`` prefix and normalise via ``make_path_posix``.

        Trailing slashes are removed; an empty remainder becomes
        ``root_marker``.
        """
        path = stringify_path(path)
        if path.startswith("file://"):
            path = path[7:]
        elif path.startswith("file:"):
            path = path[5:]
        elif path.startswith("local://"):
            path = path[8:]
        elif path.startswith("local:"):
            path = path[6:]

        path = make_path_posix(path)
        if os.sep != "/":
            # This code-path is a stripped down version of
            # > drive, path = ntpath.splitdrive(path)
            if path[1:2] == ":":
                # Absolute drive-letter path, e.g. X:\Windows
                # Relative path with drive, e.g. X:Windows
                drive, path = path[:2], path[2:]
            elif path[:2] == "//":
                # UNC drives, e.g. \\server\share or \\?\UNC\server\share
                # Device drives, e.g. \\.\device or \\?\device
                if (index1 := path.find("/", 2)) == -1 or (
                    index2 := path.find("/", index1 + 1)
                ) == -1:
                    drive, path = path, ""
                else:
                    drive, path = path[:index2], path[index2:]
            else:
                # Relative path, e.g. Windows
                drive = ""

            path = path.rstrip("/") or cls.root_marker
            return drive + path

        else:
            return path.rstrip("/") or cls.root_marker

    def _isfilestore(self):
        # Inheriting from DaskFileSystem makes this False (S3, etc. were)
        # the original motivation. But we are a posix-like file system.
        # See https://github.com/dask/dask/issues/5526
        return True

    def chmod(self, path, mode):
        """Change the permission bits of ``path`` via ``os.chmod``."""
        # Use the same normalisation as the other methods so that
        # protocol-prefixed paths work.
        path = self._strip_protocol(path)
        return os.chmod(path, mode)
278
+
279
+
280
def make_path_posix(path):
    """Make path generic and absolute for current OS

    Lists, sets and tuples are converted element-wise and returned in the
    same container type.  Other non-string inputs go through
    ``stringify_path``; a ``TypeError`` is raised if that does not yield a
    string.
    """
    if not isinstance(path, str):
        if isinstance(path, (list, set, tuple)):
            return type(path)(make_path_posix(item) for item in path)
        path = stringify_path(path)
        if not isinstance(path, str):
            raise TypeError(f"could not convert {path!r} to string")

    if os.sep == "/":
        # Native posix
        if path.startswith("/"):
            # most common fast case for posix
            return path
        if path.startswith("~"):
            return osp.expanduser(path)
        if path == ".":
            relative = ""
        elif path.startswith("./"):
            relative = path[2:]
        else:
            relative = path
        return f"{os.getcwd()}/{relative}"

    # NT handling
    if path[0:1] == "/" and path[2:3] == ":":
        # path is like "/c:/local/path"
        path = path[1:]
    if path[1:2] == ":":
        # windows full path like "C:\\local\\path"
        if len(path) <= 3:
            # nt root (something like c:/)
            return path[0] + ":/"
        return path.replace("\\", "/")
    if path[0:1] == "~":
        return make_path_posix(osp.expanduser(path))
    if path.startswith(("\\\\", "//")):
        # windows UNC/DFS-style paths
        return "//" + path[2:].replace("\\", "/")

    forward = path.replace("\\", "/")
    if path.startswith(("\\", "/")):
        # windows relative path with root
        return f"{osp.splitdrive(os.getcwd())[0]}{forward}"
    if forward == ".":
        forward = ""
    elif forward.startswith("./"):
        forward = forward[2:]
    return f"{make_path_posix(os.getcwd())}/{forward}"
329
+
330
+
331
def trailing_sep(path):
    """Return True if the path ends with a path separator.

    Both ``os.sep`` and, where the platform defines one, ``os.altsep`` are
    treated as separators, so a forward slash counts even on Operating
    Systems that normally use a backslash.
    """
    # TODO: if all incoming paths were posix-compliant then separator would
    # always be a forward slash, simplifying this function.
    # See https://github.com/fsspec/filesystem_spec/pull/1250
    if os.altsep is None:
        separators = (os.sep,)
    else:
        separators = (os.sep, os.altsep)
    return path.endswith(separators)
341
+
342
+
343
class LocalFileOpener(io.IOBase):
    """File-like wrapper around a local file opened with the builtin ``open``.

    With ``autocommit=False`` and a mode containing ``"w"``, data is written
    to a temporary file which ``commit()`` moves to ``path`` and
    ``discard()`` deletes.  Attribute access not defined here is delegated
    to the underlying file object ``self.f``.
    """

    def __init__(
        self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
    ):
        logger.debug("open file: %s", path)
        self.path = path
        self.mode = mode
        self.fs = fs
        self.f = None
        self.autocommit = autocommit
        self.compression = get_compression(path, compression)
        self.blocksize = io.DEFAULT_BUFFER_SIZE
        self._open()

    def _open(self):
        """(Re)open the underlying file if it is not currently open."""
        if self.f is None or self.f.closed:
            if self.autocommit or "w" not in self.mode:
                self.f = open(self.path, mode=self.mode)
                if self.compression:
                    compress = compr[self.compression]
                    self.f = compress(self.f, mode=self.mode)
            else:
                # TODO: check if path is writable?
                i, name = tempfile.mkstemp()
                os.close(i)  # we want normal open and normal buffered file
                self.temp = name
                self.f = open(name, mode=self.mode)
            if "w" not in self.mode:
                # record the file length by seeking to the end and back
                self.size = self.f.seek(0, 2)
                self.f.seek(0)
                self.f.size = self.size

    def _fetch_range(self, start, end):
        # probably only used by cached FS
        if "r" not in self.mode:
            raise ValueError
        self._open()
        self.f.seek(start)
        return self.f.read(end - start)

    def __setstate__(self, state):
        # Restore attributes; read-mode files are reopened at the saved offset.
        self.f = None
        loc = state.pop("loc", None)
        self.__dict__.update(state)
        if "r" in state["mode"]:
            self.f = None
            self._open()
            self.f.seek(loc)

    def __getstate__(self):
        # The file object itself is not serialised; read-mode files record
        # their current offset, open write-mode files cannot be serialised.
        d = self.__dict__.copy()
        d.pop("f")
        if "r" in self.mode:
            d["loc"] = self.f.tell()
        else:
            if not self.f.closed:
                raise ValueError("Cannot serialise open write-mode local file")
        return d

    def commit(self):
        """Move the temporary file to ``path``; only valid without autocommit."""
        if self.autocommit:
            raise RuntimeError("Can only commit if not already set to autocommit")
        shutil.move(self.temp, self.path)

    def discard(self):
        """Delete the temporary file; only valid without autocommit."""
        if self.autocommit:
            raise RuntimeError("Cannot discard if set to autocommit")
        os.remove(self.temp)

    def readable(self) -> bool:
        return True

    def writable(self) -> bool:
        # Plain read modes are not writable, but update modes such as "r+b"
        # are: the "+" makes the underlying file accept writes.
        return "r" not in self.mode or "+" in self.mode

    def read(self, *args, **kwargs):
        return self.f.read(*args, **kwargs)

    def write(self, *args, **kwargs):
        return self.f.write(*args, **kwargs)

    def tell(self, *args, **kwargs):
        return self.f.tell(*args, **kwargs)

    def seek(self, *args, **kwargs):
        return self.f.seek(*args, **kwargs)

    def seekable(self, *args, **kwargs):
        return self.f.seekable(*args, **kwargs)

    def readline(self, *args, **kwargs):
        return self.f.readline(*args, **kwargs)

    def readlines(self, *args, **kwargs):
        return self.f.readlines(*args, **kwargs)

    def close(self):
        return self.f.close()

    def truncate(self, size=None) -> int:
        return self.f.truncate(size)

    @property
    def closed(self):
        return self.f.closed

    def fileno(self):
        # ``raw`` is resolved on the underlying file through __getattr__
        return self.raw.fileno()

    def flush(self) -> None:
        self.f.flush()

    def __iter__(self):
        return self.f.__iter__()

    def __getattr__(self, item):
        # anything not defined on this wrapper is looked up on the real file
        return getattr(self.f, item)

    def __enter__(self):
        self._incontext = True
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._incontext = False
        self.f.__exit__(exc_type, exc_value, traceback)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/memory.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from datetime import datetime, timezone
5
+ from errno import ENOTEMPTY
6
+ from io import BytesIO
7
+ from pathlib import PurePath, PureWindowsPath
8
+ from typing import Any, ClassVar
9
+
10
+ from fsspec import AbstractFileSystem
11
+ from fsspec.implementations.local import LocalFileSystem
12
+ from fsspec.utils import stringify_path
13
+
14
+ logger = logging.getLogger("fsspec.memoryfs")
15
+
16
+
17
class MemoryFileSystem(AbstractFileSystem):
    """A filesystem based on a dict of BytesIO objects

    This is a global filesystem so instances of this class all point to the same
    in memory filesystem.
    """

    store: ClassVar[dict[str, Any]] = {}  # global, do not overwrite!
    pseudo_dirs = [""]  # global, do not overwrite!
    protocol = "memory"
    root_marker = "/"

    @classmethod
    def _strip_protocol(cls, path):
        """Normalise ``path`` to the key form used in ``store``.

        A leading ``memory://`` is removed and surrounding slashes are
        collapsed to a single leading ``/`` (the root becomes ``""``).
        Paths still containing ``::`` or ``://`` only lose trailing slashes.
        ``PureWindowsPath`` inputs are delegated to ``LocalFileSystem``.
        """
        if isinstance(path, PurePath):
            if isinstance(path, PureWindowsPath):
                return LocalFileSystem._strip_protocol(path)
            else:
                path = stringify_path(path)

        if path.startswith("memory://"):
            path = path[len("memory://") :]
        if "::" in path or "://" in path:
            return path.rstrip("/")
        path = path.lstrip("/").rstrip("/")
        return "/" + path if path else ""

    def ls(self, path, detail=True, **kwargs):
        """List ``path``: the file itself, or the direct children of a directory.

        Children come from stored files, from directories implied by deeper
        stored files, and from ``pseudo_dirs``.  Raises
        ``FileNotFoundError`` when nothing matches and ``path`` is not a
        known (empty) directory.  Without ``detail`` a sorted list of names
        is returned.
        """
        path = self._strip_protocol(path)
        if path in self.store:
            # there is a key with this exact name
            if not detail:
                return [path]
            return [
                {
                    "name": path,
                    "size": self.store[path].size,
                    "type": "file",
                    "created": self.store[path].created.timestamp(),
                }
            ]
        paths = set()  # directory names already emitted
        starter = path + "/"
        out = []
        for p2 in tuple(self.store):
            if p2.startswith(starter):
                if "/" not in p2[len(starter) :]:
                    # exact child
                    out.append(
                        {
                            "name": p2,
                            "size": self.store[p2].size,
                            "type": "file",
                            "created": self.store[p2].created.timestamp(),
                        }
                    )
                elif len(p2) > len(starter):
                    # implied child directory
                    ppath = starter + p2[len(starter) :].split("/", 1)[0]
                    if ppath not in paths:
                        out.append(
                            {
                                "name": ppath,
                                "size": 0,
                                "type": "directory",
                            }
                        )
                        paths.add(ppath)
        for p2 in self.pseudo_dirs:
            if p2.startswith(starter):
                if "/" not in p2[len(starter) :]:
                    # exact child pdir
                    if p2 not in paths:
                        out.append({"name": p2, "size": 0, "type": "directory"})
                        paths.add(p2)
                else:
                    # directory implied by deeper pdir
                    ppath = starter + p2[len(starter) :].split("/", 1)[0]
                    if ppath not in paths:
                        out.append({"name": ppath, "size": 0, "type": "directory"})
                        paths.add(ppath)
        if not out:
            if path in self.pseudo_dirs:
                # empty dir
                return []
            raise FileNotFoundError(path)
        if detail:
            return out
        return sorted([f["name"] for f in out])

    def mkdir(self, path, create_parents=True, **kwargs):
        """Register ``path`` as a directory.

        Raises ``FileExistsError`` if a file or directory of that name
        exists and ``NotADirectoryError`` if the parent is a file.  With
        ``create_parents`` the ancestors are registered as well.
        """
        path = self._strip_protocol(path)
        if path in self.store or path in self.pseudo_dirs:
            raise FileExistsError(path)
        parent = self._parent(path)
        has_parent = bool(parent.strip("/"))
        if has_parent and self.isfile(parent):
            raise NotADirectoryError(parent)
        if create_parents and has_parent:
            try:
                self.mkdir(parent, create_parents, **kwargs)
            except FileExistsError:
                pass
        if path and path not in self.pseudo_dirs:
            self.pseudo_dirs.append(path)

    def makedirs(self, path, exist_ok=False):
        """Like ``mkdir`` with parents; an existing path is an error unless ``exist_ok``."""
        try:
            self.mkdir(path, create_parents=True)
        except FileExistsError:
            if not exist_ok:
                raise

    def pipe_file(self, path, value, **kwargs):
        """Set the bytes of given file

        Avoids copies of the data if possible
        """
        self.open(path, "wb", data=value)

    def rmdir(self, path):
        """Remove an empty registered directory.

        The root is silently ignored.  Raises ``OSError(ENOTEMPTY)`` for a
        non-empty directory and ``FileNotFoundError`` for an unregistered one.
        """
        path = self._strip_protocol(path)
        if path == "":
            # silently avoid deleting FS root
            return
        if path in self.pseudo_dirs:
            if not self.ls(path):
                self.pseudo_dirs.remove(path)
            else:
                raise OSError(ENOTEMPTY, "Directory not empty", path)
        else:
            raise FileNotFoundError(path)

    def info(self, path, **kwargs):
        """Return an info dict for a directory or stored file at ``path``."""
        logger.debug("info: %s", path)
        path = self._strip_protocol(path)
        if path in self.pseudo_dirs or any(
            p.startswith(path + "/") for p in list(self.store) + self.pseudo_dirs
        ):
            return {
                "name": path,
                "size": 0,
                "type": "directory",
            }
        elif path in self.store:
            filelike = self.store[path]
            return {
                "name": path,
                "size": filelike.size,
                "type": "file",
                # NOTE(review): ls() reports "created" via .timestamp(),
                # whereas this is the raw attribute value -- confirm which
                # form callers expect before unifying.
                "created": getattr(filelike, "created", None),
            }
        else:
            raise FileNotFoundError(path)

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Open the stored file at ``path``.

        ``"rb"``, ``"ab"`` and ``"r+b"`` return the existing stored object
        (positioned at the end for ``"ab"``, at the start otherwise) or
        raise ``FileNotFoundError``.  ``"wb"`` creates a new ``MemoryFile``
        (initialised from the ``data`` keyword, if given) and commits it
        immediately unless inside a transaction.  Other modes raise
        ``ValueError``.
        """
        path = self._strip_protocol(path)
        if path in self.pseudo_dirs:
            raise IsADirectoryError(path)
        # no ancestor of the path may be an existing file
        parent = path
        while len(parent) > 1:
            parent = self._parent(parent)
            if self.isfile(parent):
                raise FileExistsError(parent)
        if mode in ["rb", "ab", "r+b"]:
            if path in self.store:
                f = self.store[path]
                if mode == "ab":
                    # position at the end of file
                    f.seek(0, 2)
                else:
                    # position at the beginning of file
                    f.seek(0)
                return f
            else:
                raise FileNotFoundError(path)
        elif mode == "wb":
            m = MemoryFile(self, path, kwargs.get("data"))
            if not self._intrans:
                m.commit()
            return m
        else:
            name = self.__class__.__name__
            raise ValueError(f"unsupported file mode for {name}: {mode!r}")

    def cp_file(self, path1, path2, **kwargs):
        """Copy a stored file, or register ``path2`` when ``path1`` is a directory."""
        path1 = self._strip_protocol(path1)
        path2 = self._strip_protocol(path2)
        if self.isfile(path1):
            self.store[path2] = MemoryFile(
                self, path2, self.store[path1].getvalue()
            )  # implicit copy
        elif self.isdir(path1):
            if path2 not in self.pseudo_dirs:
                self.pseudo_dirs.append(path2)
        else:
            raise FileNotFoundError(path1)

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Return the bytes ``[start:end]`` of the stored file at ``path``."""
        logger.debug("cat: %s", path)
        path = self._strip_protocol(path)
        try:
            return bytes(self.store[path].getbuffer()[start:end])
        except KeyError as e:
            raise FileNotFoundError(path) from e

    def _rm(self, path):
        path = self._strip_protocol(path)
        try:
            del self.store[path]
        except KeyError as e:
            raise FileNotFoundError(path) from e

    def modified(self, path):
        """Return the ``modified`` attribute of the stored file."""
        path = self._strip_protocol(path)
        try:
            return self.store[path].modified
        except KeyError as e:
            raise FileNotFoundError(path) from e

    def created(self, path):
        """Return the ``created`` attribute of the stored file."""
        path = self._strip_protocol(path)
        try:
            return self.store[path].created
        except KeyError as e:
            raise FileNotFoundError(path) from e

    def rm(self, path, recursive=False, maxdepth=None):
        """Remove files and directories matched by ``path`` (str or list)."""
        if isinstance(path, str):
            path = self._strip_protocol(path)
        else:
            path = [self._strip_protocol(p) for p in path]
        paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
        for p in reversed(paths):
            # If the expanded path doesn't exist, it is only because the expanded
            # path was a directory that does not exist in self.pseudo_dirs. This
            # is possible if you directly create files without making the
            # directories first.
            if not self.exists(p):
                continue
            if self.isfile(p):
                self.rm_file(p)
            else:
                self.rmdir(p)
268
+
269
+
270
class MemoryFile(BytesIO):
    """BytesIO variant that ignores ``close()`` and can be used in ``with``.

    Optionally initialised from ``data``.  Each path should only be active
    once at any moment.

    ``fs`` and ``path`` are only needed for ``commit()``, which stores this
    object in ``fs.store`` under ``path``.
    """

    def __init__(self, fs=None, path=None, data=None):
        logger.debug("open file %s", path)
        self.fs = fs
        self.path = path
        self.created = datetime.now(tz=timezone.utc)
        self.modified = datetime.now(tz=timezone.utc)
        if not data:
            return
        # load the initial content, then rewind to the start
        super().__init__(data)
        self.seek(0)

    @property
    def size(self):
        """Number of bytes currently held in the buffer."""
        buffer_view = self.getbuffer()
        return buffer_view.nbytes

    def __enter__(self):
        return self

    def close(self):
        # deliberately a no-op: the buffer stays usable after close()
        return None

    def discard(self):
        # nothing to clean up for an in-memory file
        return None

    def commit(self):
        """Register this file in ``fs.store`` and refresh ``modified``."""
        self.fs.store[self.path] = self
        self.modified = datetime.now(tz=timezone.utc)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/reference.py ADDED
@@ -0,0 +1,1160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import collections
3
+ import io
4
+ import itertools
5
+ import logging
6
+ import math
7
+ import os
8
+ from functools import lru_cache
9
+ from typing import TYPE_CHECKING
10
+
11
+ import fsspec.core
12
+
13
+ try:
14
+ import ujson as json
15
+ except ImportError:
16
+ if not TYPE_CHECKING:
17
+ import json
18
+
19
+ from ..asyn import AsyncFileSystem
20
+ from ..callbacks import DEFAULT_CALLBACK
21
+ from ..core import filesystem, open, split_protocol
22
+ from ..utils import isfilelike, merge_offset_ranges, other_paths
23
+
24
+ logger = logging.getLogger("fsspec.reference")
25
+
26
+
27
class ReferenceNotReachable(RuntimeError):
    """Error carrying the reference key and the target that could not be fetched."""

    def __init__(self, reference, target, *args):
        super().__init__(*args)
        self.reference, self.target = reference, target

    def __str__(self):
        message = f'Reference "{self.reference}" failed to fetch target {self.target}'
        return message
35
+
36
+
37
def _first(d):
    """Return the first value of mapping ``d`` (IndexError if it is empty)."""
    values = [*d.values()]
    return values[0]
39
+
40
+
41
def _prot_in_references(path, references):
    """Return the protocol of the target URL referenced by ``path``.

    Only list/tuple references are considered; for anything else (including
    a missing key) the result is ``None``.  A falsy first element is
    returned unchanged.
    """
    ref = references.get(path)
    if not isinstance(ref, (list, tuple)):
        return None
    target = ref[0]
    if not target:
        return target
    return split_protocol(target)[0]
45
+
46
+
47
def _protocol_groups(paths, references):
    """Group ``paths`` by the protocol of their reference target.

    A single string path yields a one-entry dict; otherwise the result maps
    each protocol to the list of paths using it, in input order.
    """
    if isinstance(paths, str):
        return {_prot_in_references(paths, references): [paths]}
    grouped = {}
    for candidate in paths:
        key = _prot_in_references(candidate, references)
        if key in grouped:
            grouped[key].append(candidate)
        else:
            grouped[key] = [candidate]
    return grouped
55
+
56
+
57
class RefsValuesView(collections.abc.ValuesView):
    """Values view that walks metadata, in-memory items, then per-field records."""

    def __iter__(self):
        mapping = self._mapping
        # metadata values are yielded as encoded JSON
        for meta in mapping.zmetadata.values():
            yield json.dumps(meta).encode()
        yield from mapping._items.values()
        for field in mapping.listdir():
            sizes = mapping._get_chunk_sizes(field)
            if len(sizes) != 0:
                yield from mapping._generate_all_records(field)
            else:
                # no chunk sizes: the field has the single key "<field>/0"
                yield mapping[field + "/0"]
68
+
69
+
70
class RefsItemsView(collections.abc.ItemsView):
    """Items view pairing the mapping's keys with its values, in order."""

    def __iter__(self):
        mapping = self._mapping
        return zip(mapping.keys(), mapping.values())
73
+
74
+
75
def ravel_multi_index(idx, sizes):
    """Flatten a multi-dimensional index into a single offset.

    ``idx`` and ``sizes`` are walked together from their last element to
    their first; each index is weighted by the product of the sizes that
    come after it.
    """
    offset = 0
    stride = 1
    pairs = zip(idx[::-1], sizes[::-1])
    for coordinate, extent in pairs:
        offset = offset + coordinate * stride
        stride = stride * extent
    return offset
82
+
83
+
84
+ class LazyReferenceMapper(collections.abc.MutableMapping):
85
+ """This interface can be used to read/write references from Parquet stores.
86
+ It is not intended for other types of references.
87
+ It can be used with Kerchunk's MultiZarrToZarr method to combine
88
+ references into a parquet store.
89
+ Examples of this use-case can be found here:
90
+ https://fsspec.github.io/kerchunk/advanced.html?highlight=parquet#parquet-storage"""
91
+
92
+ # numpy/pandas are imported lazily inside these properties so that fsspec itself does not require them
93
@property
def np(self):
    """The ``numpy`` module, imported on first access rather than at import time."""
    import numpy

    return numpy
98
+
99
@property
def pd(self):
    """The ``pandas`` module, imported on first access rather than at import time."""
    import pandas

    return pandas
104
+
105
def __init__(
    self, root, fs=None, out_root=None, cache_size=128, categorical_threshold=10
):
    """
    Open a parquet reference store rooted at ``root``.

    This instance will be writable, storing changes in memory until full partitions
    are accumulated or .flush() is called.

    To create an empty lazy store, use .create()

    Parameters
    ----------
    root : str
        Root of parquet store
    fs : fsspec.AbstractFileSystem
        fsspec filesystem object, default is local filesystem.
    out_root : str, optional
        Stored as ``self.out_root``; falls back to ``root`` when not given.
    cache_size : int, default=128
        Maximum size of LRU cache, where cache_size*record_size denotes
        the total number of references that can be loaded in memory at once.
    categorical_threshold : int
        Encode urls as pandas.Categorical to reduce memory footprint if the ratio
        of the number of unique urls to total number of refs for each variable
        is greater than or equal to this number. (default 10)
    """
    self.root = root
    self.chunk_sizes = {}
    self.out_root = out_root or self.root
    self.cat_thresh = categorical_threshold
    self.cache_size = cache_size
    self.dirs = None
    # template for the location of each field's parquet record file
    self.url = self.root + "/{field}/refs.{record}.parq"
    # TODO: derive fs from `root`
    if fs is None:
        self.fs = fsspec.filesystem("file")
    else:
        self.fs = fs
138
+
139
+ def __getattr__(self, item):
140
+ if item in ("_items", "record_size", "zmetadata"):
141
+ self.setup()
142
+ # avoid possible recursion if setup fails somehow
143
+ return self.__dict__[item]
144
+ raise AttributeError(item)
145
+
146
+ def setup(self):
147
+ self._items = {}
148
+ self._items[".zmetadata"] = self.fs.cat_file(
149
+ "/".join([self.root, ".zmetadata"])
150
+ )
151
+ met = json.loads(self._items[".zmetadata"])
152
+ self.record_size = met["record_size"]
153
+ self.zmetadata = met["metadata"]
154
+
155
+ # Define function to open and decompress refs
156
+ @lru_cache(maxsize=self.cache_size)
157
+ def open_refs(field, record):
158
+ """cached parquet file loader"""
159
+ path = self.url.format(field=field, record=record)
160
+ data = io.BytesIO(self.fs.cat_file(path))
161
+ df = self.pd.read_parquet(data, engine="fastparquet")
162
+ refs = {c: df[c].values for c in df.columns}
163
+ return refs
164
+
165
+ self.open_refs = open_refs
166
+
167
@staticmethod
def create(root, storage_options=None, fs=None, record_size=10000, **kwargs):
    """Make empty parquet reference set

    First deletes the contents of the given directory, if it exists.

    Parameters
    ----------
    root: str
        Directory to contain the output; will be created
    storage_options: dict | None
        Options for building the filesystem to write with, used only if
        ``fs`` is None
    fs: FileSystem | None
        Filesystem for writing
    record_size: int
        Number of references per parquet file
    kwargs: passed to __init__

    Returns
    -------
    LazyReferenceMapper instance
    """
    if fs is None:
        fs, root = fsspec.core.url_to_fs(root, **(storage_options or {}))
    # start from a clean directory
    if fs.exists(root):
        fs.rm(root, recursive=True)
    fs.makedirs(root, exist_ok=True)
    initial_meta = {"metadata": {}, "record_size": record_size}
    meta_path = "/".join([root, ".zmetadata"])
    fs.pipe(meta_path, json.dumps(initial_meta).encode())
    return LazyReferenceMapper(root, fs, **kwargs)
197
+
198
def listdir(self, basename=True):
    """List top-level directories

    The directory names are derived once from the keys of ``zmetadata`` and
    kept in ``self.dirs``. With ``basename`` falsy the cached set itself is
    returned; otherwise a list of ``os.path.basename`` of each entry.
    """
    if self.dirs is None:
        top_level = set()
        for key in self.zmetadata:
            head = key.split("/", 1)[0]
            # drop empty names and dot-files such as ".zgroup"
            if head and not head.startswith("."):
                top_level.add(head)
        self.dirs = top_level
    if not basename:
        return self.dirs
    return [os.path.basename(name) for name in self.dirs]
208
+
209
def ls(self, path="", detail=True):
    """Shortcut file listings

    Parameters
    ----------
    path: str
        "" for the root, or the name of one top-level field. Deeper paths
        are not supported.
    detail: bool
        If exactly ``False`` return names only, otherwise info dicts with
        "name", "type" and "size".

    Raises
    ------
    FileNotFoundError
        If ``path`` contains a "/".
    """

    def _meta_size(name):
        # size of the JSON text for metadata keys, else of the stored value
        if name in self.zmetadata:
            return len(json.dumps(self.zmetadata[name]))
        return len(self._items[name])

    # ``_items`` also holds pending partitions under (field, record) tuple
    # keys; those are not file names and must not reach the string handling
    # below (sorting / startswith would raise).
    item_names = [name for name in self._items if isinstance(name, str)]

    if not path:
        dirnames = self.listdir()
        others = set(
            [".zmetadata"]
            + [name for name in self.zmetadata if "/" not in name]
            + [name for name in item_names if "/" not in name]
        )
        if detail is False:
            others.update(dirnames)
            return sorted(others)
        dirinfo = [
            {"name": name, "type": "directory", "size": 0} for name in dirnames
        ]
        fileinfo = [
            {"name": name, "type": "file", "size": _meta_size(name)}
            for name in others
        ]
        return sorted(dirinfo + fileinfo, key=lambda s: s["name"])
    parts = path.split("/", 1)
    if len(parts) > 1:
        raise FileNotFoundError("Cannot list within directories right now")
    field = parts[0]
    others = set(
        [name for name in self.zmetadata if name.startswith(f"{path}/")]
        + [name for name in item_names if name.startswith(f"{path}/")]
    )
    fileinfo = [
        {"name": name, "type": "file", "size": _meta_size(name)} for name in others
    ]
    keys = self._keys_in_field(field)

    if detail is False:
        return list(others) + list(keys)
    recs = self._generate_all_records(field)
    recinfo = [
        {"name": name, "type": "file", "size": rec[-1]}
        for name, rec in zip(keys, recs)
        if rec[0]  # filters out path==None, deleted/missing
    ]
    return fileinfo + recinfo
268
+
269
+ def _load_one_key(self, key):
270
+ """Get the reference for one key
271
+
272
+ Returns bytes, one-element list or three-element list.
273
+ """
274
+ if key in self._items:
275
+ return self._items[key]
276
+ elif key in self.zmetadata:
277
+ return json.dumps(self.zmetadata[key]).encode()
278
+ elif "/" not in key or self._is_meta(key):
279
+ raise KeyError(key)
280
+ field, _ = key.rsplit("/", 1)
281
+ record, ri, chunk_size = self._key_to_record(key)
282
+ maybe = self._items.get((field, record), {}).get(ri, False)
283
+ if maybe is None:
284
+ # explicitly deleted
285
+ raise KeyError
286
+ elif maybe:
287
+ return maybe
288
+ elif chunk_size == 0:
289
+ return b""
290
+
291
+ # Chunk keys can be loaded from row group and cached in LRU cache
292
+ try:
293
+ refs = self.open_refs(field, record)
294
+ except (ValueError, TypeError, FileNotFoundError):
295
+ raise KeyError(key)
296
+ columns = ["path", "offset", "size", "raw"]
297
+ selection = [refs[c][ri] if c in refs else None for c in columns]
298
+ raw = selection[-1]
299
+ if raw is not None:
300
+ return raw
301
+ if selection[0] is None:
302
+ raise KeyError("This reference does not exist or has been deleted")
303
+ if selection[1:3] == [0, 0]:
304
+ # URL only
305
+ return selection[:1]
306
+ # URL, offset, size
307
+ return selection[:3]
308
+
309
+ @lru_cache(4096)
310
+ def _key_to_record(self, key):
311
+ """Details needed to construct a reference for one key"""
312
+ field, chunk = key.rsplit("/", 1)
313
+ chunk_sizes = self._get_chunk_sizes(field)
314
+ if len(chunk_sizes) == 0:
315
+ return 0, 0, 0
316
+ chunk_idx = [int(c) for c in chunk.split(".")]
317
+ chunk_number = ravel_multi_index(chunk_idx, chunk_sizes)
318
+ record = chunk_number // self.record_size
319
+ ri = chunk_number % self.record_size
320
+ return record, ri, len(chunk_sizes)
321
+
322
+ def _get_chunk_sizes(self, field):
323
+ """The number of chunks along each axis for a given field"""
324
+ if field not in self.chunk_sizes:
325
+ zarray = self.zmetadata[f"{field}/.zarray"]
326
+ size_ratio = [
327
+ math.ceil(s / c) for s, c in zip(zarray["shape"], zarray["chunks"])
328
+ ]
329
+ self.chunk_sizes[field] = size_ratio or [1]
330
+ return self.chunk_sizes[field]
331
+
332
+ def _generate_record(self, field, record):
333
+ """The references for a given parquet file of a given field"""
334
+ refs = self.open_refs(field, record)
335
+ it = iter(zip(*refs.values()))
336
+ if len(refs) == 3:
337
+ # All urls
338
+ return (list(t) for t in it)
339
+ elif len(refs) == 1:
340
+ # All raws
341
+ return refs["raw"]
342
+ else:
343
+ # Mix of urls and raws
344
+ return (list(t[:3]) if not t[3] else t[3] for t in it)
345
+
346
+ def _generate_all_records(self, field):
347
+ """Load all the references within a field by iterating over the parquet files"""
348
+ nrec = 1
349
+ for ch in self._get_chunk_sizes(field):
350
+ nrec *= ch
351
+ nrec = math.ceil(nrec / self.record_size)
352
+ for record in range(nrec):
353
+ yield from self._generate_record(field, record)
354
+
355
def values(self):
    """Return a ``RefsValuesView`` over this mapper."""
    view = RefsValuesView(self)
    return view
357
+
358
def items(self):
    """Return a ``RefsItemsView`` over this mapper."""
    view = RefsItemsView(self)
    return view
360
+
361
+ def __hash__(self):
362
+ return id(self)
363
+
364
+ def __getitem__(self, key):
365
+ return self._load_one_key(key)
366
+
367
+ def __setitem__(self, key, value):
368
+ if "/" in key and not self._is_meta(key):
369
+ field, chunk = key.rsplit("/", 1)
370
+ record, i, _ = self._key_to_record(key)
371
+ subdict = self._items.setdefault((field, record), {})
372
+ subdict[i] = value
373
+ if len(subdict) == self.record_size:
374
+ self.write(field, record)
375
+ else:
376
+ # metadata or top-level
377
+ self._items[key] = value
378
+ new_value = json.loads(
379
+ value.decode() if isinstance(value, bytes) else value
380
+ )
381
+ self.zmetadata[key] = {**self.zmetadata.get(key, {}), **new_value}
382
+
383
+ @staticmethod
384
+ def _is_meta(key):
385
+ return key.startswith(".z") or "/.z" in key
386
+
387
+ def __delitem__(self, key):
388
+ if key in self._items:
389
+ del self._items[key]
390
+ elif key in self.zmetadata:
391
+ del self.zmetadata[key]
392
+ else:
393
+ if "/" in key and not self._is_meta(key):
394
+ field, _ = key.rsplit("/", 1)
395
+ record, i, _ = self._key_to_record(key)
396
+ subdict = self._items.setdefault((field, record), {})
397
+ subdict[i] = None
398
+ if len(subdict) == self.record_size:
399
+ self.write(field, record)
400
+ else:
401
+ # metadata or top-level
402
+ self._items[key] = None
403
+
404
def write(self, field, record, base_url=None, storage_options=None):
    """Write one pending partition to a parquet file and drop it from memory.

    Parameters
    ----------
    field: str
        Name of the field the partition belongs to
    record: int
        Record number of the partition within the field
    base_url: str | None
        Output root; ``self.out_root`` is used when not given
    storage_options: dict | None
        Passed to ``DataFrame.to_parquet``; falls back to the
        ``storage_options`` attribute of ``self.fs`` when present
    """
    # extra requirements if writing
    import kerchunk.df
    import numpy as np
    import pandas as pd

    partition = self._items[(field, record)]
    original = False
    if len(partition) < self.record_size:
        # partial partition: start from what is already stored, if anything
        try:
            original = self.open_refs(field, record)
        except IOError:
            pass

    if original:
        paths = original["path"]
        offsets = original["offset"]
        sizes = original["size"]
        raws = original["raw"]
    else:
        # no existing record: begin with empty columns of record_size rows
        paths = np.full(self.record_size, np.nan, dtype="O")
        offsets = np.zeros(self.record_size, dtype="int64")
        sizes = np.zeros(self.record_size, dtype="int64")
        raws = np.full(self.record_size, np.nan, dtype="O")
    for j, data in partition.items():
        if isinstance(data, list):
            # [url] or [url, offset, size]
            if (
                str(paths.dtype) == "category"
                and data[0] not in paths.dtype.categories
            ):
                # a categorical column must know the value before assignment
                paths = paths.add_categories(data[0])
            paths[j] = data[0]
            if len(data) > 1:
                offsets[j] = data[1]
                sizes[j] = data[2]
        elif data is None:
            # delete
            paths[j] = None
            offsets[j] = 0
            sizes[j] = 0
            raws[j] = None
        else:
            # this is the only call into kerchunk, could remove
            raws[j] = kerchunk.df._proc_raw(data)
    # TODO: only save needed columns
    df = pd.DataFrame(
        {
            "path": paths,
            "offset": offsets,
            "size": sizes,
            "raw": raws,
        },
        copy=False,
    )
    # many references per distinct url: store the url column as categorical
    if df.path.count() / (df.path.nunique() or 1) > self.cat_thresh:
        df["path"] = df["path"].astype("category")
    object_encoding = {"raw": "bytes", "path": "utf8"}
    has_nulls = ["path", "raw"]

    fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq"
    self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True)
    df.to_parquet(
        fn,
        engine="fastparquet",
        storage_options=storage_options
        or getattr(self.fs, "storage_options", None),
        compression="zstd",
        index=False,
        stats=False,
        object_encoding=object_encoding,
        has_nulls=has_nulls,
        # **kwargs,
    )
    # the partition is now persisted; forget the in-memory copy
    partition.clear()
    self._items.pop((field, record))
479
+
480
def flush(self, base_url=None, storage_options=None):
    """Output any modified or deleted keys

    Writes every pending partition, folds metadata held in ``_items`` back
    into ``zmetadata``, writes ``.zmetadata`` and clears the record cache.

    Parameters
    ----------
    base_url: str
        Location of the output
    storage_options: dict | None
        Forwarded to ``write`` for each pending partition
    """
    # write what we have so far and clear sub chunks
    pending = [name for name in list(self._items) if isinstance(name, tuple)]
    for field, record in pending:
        self.write(
            field, record, base_url=base_url, storage_options=storage_options
        )

    # gather .zmetadata from self._items and write that too
    for name in list(self._items):
        if name == ".zmetadata" or ".z" not in name:
            continue
        self.zmetadata[name] = json.loads(self._items.pop(name))
    document = {"metadata": self.zmetadata, "record_size": self.record_size}
    self._items[".zmetadata"] = json.dumps(document).encode()
    destination = "/".join([base_url or self.out_root, ".zmetadata"])
    self.fs.pipe(destination, self._items[".zmetadata"])

    # TODO: only clear those that we wrote to?
    self.open_refs.cache_clear()
512
+
513
+ def __len__(self):
514
+ # Caveat: This counts expected references, not actual - but is fast
515
+ count = 0
516
+ for field in self.listdir():
517
+ if field.startswith("."):
518
+ count += 1
519
+ else:
520
+ count += math.prod(self._get_chunk_sizes(field))
521
+ count += len(self.zmetadata) # all metadata keys
522
+ # any other files not in reference partitions
523
+ count += sum(1 for _ in self._items if not isinstance(_, tuple))
524
+ return count
525
+
526
+ def __iter__(self):
527
+ # Caveat: returns only existing keys, so the number of these does not
528
+ # match len(self)
529
+ metas = set(self.zmetadata)
530
+ metas.update(self._items)
531
+ for bit in metas:
532
+ if isinstance(bit, str):
533
+ yield bit
534
+ for field in self.listdir():
535
+ for k in self._keys_in_field(field):
536
+ if k in self:
537
+ yield k
538
+
539
+ def __contains__(self, item):
540
+ try:
541
+ self._load_one_key(item)
542
+ return True
543
+ except KeyError:
544
+ return False
545
+
546
+ def _keys_in_field(self, field):
547
+ """List key names in given field
548
+
549
+ Produces strings like "field/x.y" appropriate from the chunking of the array
550
+ """
551
+ chunk_sizes = self._get_chunk_sizes(field)
552
+ if len(chunk_sizes) == 0:
553
+ yield field + "/0"
554
+ return
555
+ inds = itertools.product(*(range(i) for i in chunk_sizes))
556
+ for ind in inds:
557
+ yield field + "/" + ".".join([str(c) for c in ind])
558
+
559
+
560
+ class ReferenceFileSystem(AsyncFileSystem):
561
+ """View byte ranges of some other file as a file system
562
+ Initial version: single file system target, which must support
563
+ async, and must allow start and end args in _cat_file. Later versions
564
+ may allow multiple arbitrary URLs for the targets.
565
+ This FileSystem is read-only. It is designed to be used with async
566
+ targets (for now). This FileSystem only allows whole-file access, no
567
+ ``open``. We do not get original file details from the target FS.
568
+ Configuration is by passing a dict of references at init, or a URL to
569
+ a JSON file containing the same; this dict
570
+ can also contain concrete data for some set of paths.
571
+ Reference dict format:
572
+ {path0: bytes_data, path1: (target_url, offset, size)}
573
+ https://github.com/fsspec/kerchunk/blob/main/README.md
574
+ """
575
+
576
+ protocol = "reference"
577
+
578
def __init__(
    self,
    fo,
    target=None,
    ref_storage_args=None,
    target_protocol=None,
    target_options=None,
    remote_protocol=None,
    remote_options=None,
    fs=None,
    template_overrides=None,
    simple_templates=True,
    max_gap=64_000,
    max_block=256_000_000,
    cache_size=128,
    **kwargs,
):
    """
    Parameters
    ----------
    fo : dict or str
        The set of references to use for this instance, with a structure as above.
        If str referencing a JSON file, will use fsspec.open, in conjunction
        with target_options and target_protocol to open and parse JSON at this
        location. If a directory, then assume references are a set of parquet
        files to be loaded lazily.
    target : str
        For any references having target_url as None, this is the default file
        target to use
    ref_storage_args : dict
        If references is a str, use these kwargs for loading the JSON file.
        Deprecated: use target_options instead.
    target_protocol : str
        Used for loading the reference file, if it is a path. If None, protocol
        will be derived from the given path
    target_options : dict
        Extra FS options for loading the reference file ``fo``, if given as a path
    remote_protocol : str
        The protocol of the filesystem on which the references will be evaluated
        (unless fs is provided). If not given, will be derived from the first
        URL that has a protocol in the templates or in the references, in that
        order.
    remote_options : dict
        kwargs to go with remote_protocol
    fs : AbstractFileSystem | dict(str, (AbstractFileSystem | dict))
        Directly provide a file system(s):
            - a single filesystem instance
            - a dict of protocol:filesystem, where each value is either a filesystem
              instance, or a dict of kwargs that can be used to create an
              instance for the given protocol

        If this is given, remote_options and remote_protocol are ignored.
    template_overrides : dict
        Swap out any templates in the references file with these - useful for
        testing.
    simple_templates: bool
        Whether templates can be processed with simple replace (True) or if
        jinja is needed (False, much slower). All reference sets produced by
        ``kerchunk`` are simple in this sense, but the spec allows for complex.
    max_gap, max_block: int
        For merging multiple concurrent requests to the same remote file.
        Neighboring byte ranges will only be merged when their
        inter-range gap is <= ``max_gap``. Default is 64KB. Set to 0
        to only merge when it requires no extra bytes. Pass a negative
        number to disable merging, appropriate for local target files.
        Neighboring byte ranges will only be merged when the size of
        the aggregated range is <= ``max_block``. Default is 256MB.
    cache_size : int
        Maximum size of LRU cache, where cache_size*record_size denotes
        the total number of references that can be loaded in memory at once.
        Only used for lazily loaded references.
    kwargs : passed to parent class
    """
    super().__init__(**kwargs)
    self.target = target
    self.template_overrides = template_overrides
    self.simple_templates = simple_templates
    self.templates = {}
    self.fss = {}
    self._dircache = {}
    self.max_gap = max_gap
    self.max_block = max_block
    if isinstance(fo, str):
        # ref_storage_args takes precedence over target_options when both given
        dic = dict(
            **(ref_storage_args or target_options or {}), protocol=target_protocol
        )
        ref_fs, fo2 = fsspec.core.url_to_fs(fo, **dic)
        if ref_fs.isfile(fo2):
            # text JSON
            with fsspec.open(fo, "rb", **dic) as f:
                logger.info("Read reference from URL %s", fo)
                text = json.load(f)
            self._process_references(text, template_overrides)
        else:
            # Lazy parquet refs
            logger.info("Open lazy reference dict from URL %s", fo)
            self.references = LazyReferenceMapper(
                fo2,
                fs=ref_fs,
                cache_size=cache_size,
            )
    else:
        # dictionaries
        self._process_references(fo, template_overrides)
    if isinstance(fs, dict):
        # mapping of protocol -> filesystem instance or constructor kwargs;
        # no protocol inference is attempted in this case
        self.fss = {
            k: (
                fsspec.filesystem(k.split(":", 1)[0], **opts)
                if isinstance(opts, dict)
                else opts
            )
            for k, opts in fs.items()
        }
        if None not in self.fss:
            self.fss[None] = filesystem("file")
        return
    if fs is not None:
        # single remote FS
        remote_protocol = (
            fs.protocol[0] if isinstance(fs.protocol, tuple) else fs.protocol
        )
        self.fss[remote_protocol] = fs

    if remote_protocol is None:
        # get single protocol from any templates
        for ref in self.templates.values():
            if callable(ref):
                ref = ref()
            protocol, _ = fsspec.core.split_protocol(ref)
            if protocol and protocol not in self.fss:
                fs = filesystem(protocol, **(remote_options or {}))
                self.fss[protocol] = fs
    if remote_protocol is None:
        # get single protocol from references
        # TODO: warning here, since this can be very expensive?
        for ref in self.references.values():
            if callable(ref):
                ref = ref()
            if isinstance(ref, list) and ref[0]:
                protocol, _ = fsspec.core.split_protocol(ref[0])
                if protocol not in self.fss:
                    fs = filesystem(protocol, **(remote_options or {}))
                    self.fss[protocol] = fs
                    # only use first remote URL
                    break

    if remote_protocol and remote_protocol not in self.fss:
        fs = filesystem(remote_protocol, **(remote_options or {}))
        self.fss[remote_protocol] = fs

    # ``fs`` here is the most recently created/provided filesystem, if any
    self.fss[None] = fs or filesystem("file")  # default one
729
+
730
def _cat_common(self, path, start=None, end=None):
    """Resolve ``path`` to inline bytes or to a target URL with a byte range.

    Returns ``(data, None, None)`` for inline references, with a "base64:"
    prefix decoded; otherwise ``(url, start, end)`` where, for references
    carrying an offset and size, the requested ``start``/``end`` have been
    translated into positions within the target. A ``None`` url is replaced
    by ``self.target``.

    Raises
    ------
    FileNotFoundError
        If ``path`` is not in the references.
    """
    path = self._strip_protocol(path)
    logger.debug(f"cat: {path}")
    try:
        ref = self.references[path]
    except KeyError:
        raise FileNotFoundError(path)
    if isinstance(ref, str):
        ref = ref.encode()
    if isinstance(ref, bytes):
        logger.debug(f"Reference: {path}, type bytes")
        if ref.startswith(b"base64:"):
            ref = base64.b64decode(ref[7:])
        return ref, None, None

    if len(ref) == 1:
        logger.debug(f"Reference: {path}, whole file => {ref}")
        url = ref[0]
        lo, hi = start, end
    else:
        url, base, size = ref
        logger.debug(f"Reference: {path} => {url}, offset {base}, size {size}")
        limit = base + size

        # negative positions count back from the end of the referenced range
        if start is None:
            lo = base
        elif start >= 0:
            lo = base + start
        else:
            lo = limit + start

        if end is None:
            hi = limit
        elif end >= 0:
            hi = base + end
        else:
            hi = limit + end
    if url is None:
        url = self.target
    return url, lo, hi
771
+
772
+ async def _cat_file(self, path, start=None, end=None, **kwargs):
773
+ part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
774
+ if isinstance(part_or_url, bytes):
775
+ return part_or_url[start:end]
776
+ protocol, _ = split_protocol(part_or_url)
777
+ try:
778
+ await self.fss[protocol]._cat_file(part_or_url, start=start, end=end)
779
+ except Exception as e:
780
+ raise ReferenceNotReachable(path, part_or_url) from e
781
+
782
def cat_file(self, path, start=None, end=None, **kwargs):
    """Fetch the bytes of one reference (sync).

    Inline references are sliced with ``start``/``end``; remote references
    are read from the filesystem registered for the URL's protocol using the
    range computed by ``_cat_common``. Any failure of that read is re-raised
    as ``ReferenceNotReachable``.
    """
    resolved, lo, hi = self._cat_common(path, start=start, end=end)
    if isinstance(resolved, bytes):
        return resolved[start:end]
    protocol, _ = split_protocol(resolved)
    try:
        remote_fs = self.fss[protocol]
        return remote_fs.cat_file(resolved, start=lo, end=hi)
    except Exception as exc:
        raise ReferenceNotReachable(path, resolved) from exc
791
+
792
def pipe_file(self, path, value, **_):
    """Temporarily add binary data or reference as a file

    The directory cache is cleared so that later listings are rebuilt from
    the updated references, as the async ``_pipe_file`` already does.
    """
    self.references[path] = value
    self.dircache.clear()  # this is a bit heavy handed
795
+
796
+ async def _get_file(self, rpath, lpath, **kwargs):
797
+ if self.isdir(rpath):
798
+ return os.makedirs(lpath, exist_ok=True)
799
+ data = await self._cat_file(rpath)
800
+ with open(lpath, "wb") as f:
801
+ f.write(data)
802
+
803
def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, **kwargs):
    """Copy one reference to ``lpath``, a local path or a file-like object.

    If ``rpath`` is a directory, only the local directory is created. The
    callback is told the total size before the write and the same amount as
    absolute progress afterwards.
    """
    if self.isdir(rpath):
        return os.makedirs(lpath, exist_ok=True)
    payload = self.cat_file(rpath, **kwargs)
    nbytes = len(payload)
    callback.set_size(nbytes)
    if not isfilelike(lpath):
        with open(lpath, "wb") as sink:
            sink.write(payload)
    else:
        lpath.write(payload)
    callback.absolute_update(nbytes)
814
+
815
def get(self, rpath, lpath, recursive=False, **kwargs):
    """Copy one or more references to local files.

    Paths are expanded with ``expand_path``, fetched together through
    ``cat`` and written with a local filesystem that creates directories
    as needed. With ``recursive``, directories are excluded from the fetch.
    """
    if recursive:
        # trigger directory build
        self.ls("")
    rpath = self.expand_path(rpath, recursive=recursive)
    local_fs = fsspec.filesystem("file", auto_mkdir=True)
    targets = other_paths(rpath, lpath)
    wanted = [r for r in rpath if not self.isdir(r)] if recursive else rpath
    fetched = self.cat(wanted)
    for remote, local in zip(rpath, targets):
        if remote not in fetched:
            continue
        local_fs.pipe_file(local, fetched[remote])
829
+
830
def cat(self, path, recursive=False, on_error="raise", **kwargs):
    """Fetch the contents of one or more references.

    References are grouped by target protocol, byte ranges on the same
    target are merged (subject to ``max_gap``/``max_block``), fetched with
    ``cat_ranges`` and then sliced back into per-path results.

    Parameters
    ----------
    path: str | list[str]
        Reference key(s) to fetch
    recursive: bool
        Not supported; raises NotImplementedError if true
    on_error: str
        "raise" to raise on the first failure, "omit" to leave failed paths
        out of the result, anything else to return the exception as the
        value for that path

    Returns
    -------
    The single value when ``path`` is a str without "*" and exactly one
    result was produced; otherwise a dict of path -> bytes (or exception).
    """
    if isinstance(path, str) and recursive:
        raise NotImplementedError
    if isinstance(path, list) and (recursive or any("*" in p for p in path)):
        raise NotImplementedError
    # TODO: if references is lazy, pre-fetch all paths in batch before access
    proto_dict = _protocol_groups(path, self.references)
    out = {}
    for proto, paths in proto_dict.items():
        fs = self.fss[proto]
        urls, starts, ends, valid_paths = [], [], [], []
        for p in paths:
            # find references or label not-found. Early exit if any not
            # found and on_error is "raise"
            try:
                u, s, e = self._cat_common(p)
            except FileNotFoundError as err:
                if on_error == "raise":
                    raise
                if on_error != "omit":
                    out[p] = err
            else:
                urls.append(u)
                starts.append(s)
                ends.append(e)
                valid_paths.append(p)

        # process references into form for merging
        urls2 = []
        starts2 = []
        ends2 = []
        paths2 = []
        whole_files = set()
        for u, s, e, p in zip(urls, starts, ends, valid_paths):
            if isinstance(u, bytes):
                # data
                out[p] = u
            elif s is None:
                # whole file - limits are None, None, but no further
                # entries take for this file
                whole_files.add(u)
                urls2.append(u)
                starts2.append(s)
                ends2.append(e)
                paths2.append(p)
        for u, s, e, p in zip(urls, starts, ends, valid_paths):
            # second run to account for files that are to be loaded whole
            if s is not None and u not in whole_files:
                urls2.append(u)
                starts2.append(s)
                ends2.append(e)
                paths2.append(p)

        # merge and fetch consolidated ranges
        new_paths, new_starts, new_ends = merge_offset_ranges(
            list(urls2),
            list(starts2),
            list(ends2),
            sort=True,
            max_gap=self.max_gap,
            max_block=self.max_block,
        )
        bytes_out = fs.cat_ranges(new_paths, new_starts, new_ends)

        # unbundle from merged bytes - simple approach
        for u, s, e, p in zip(urls, starts, ends, valid_paths):
            if p in out:
                continue  # was bytes, already handled
            # NOTE: ``np`` below is a local loop variable (a merged path)
            for np, ns, ne, b in zip(new_paths, new_starts, new_ends, bytes_out):
                if np == u and (ns is None or ne is None):
                    # the target was fetched whole: slice the requested part
                    if isinstance(b, Exception):
                        out[p] = b
                    else:
                        out[p] = b[s:e]
                elif np == u and s >= ns and e <= ne:
                    # requested range lies inside this merged block
                    if isinstance(b, Exception):
                        out[p] = b
                    else:
                        # (e - ne) is <= 0; "or None" keeps the tail when 0
                        out[p] = b[s - ns : (e - ne) or None]

    for k, v in out.copy().items():
        # these were valid references, but fetch failed, so transform exc
        if isinstance(v, Exception) and k in self.references:
            ex = out[k]
            new_ex = ReferenceNotReachable(k, self.references[k])
            new_ex.__cause__ = ex
            if on_error == "raise":
                raise new_ex
            elif on_error != "omit":
                out[k] = new_ex

    if len(out) == 1 and isinstance(path, str) and "*" not in path:
        return _first(out)
    return out
924
+
925
+ def _process_references(self, references, template_overrides=None):
926
+ vers = references.get("version", None)
927
+ if vers is None:
928
+ self._process_references0(references)
929
+ elif vers == 1:
930
+ self._process_references1(references, template_overrides=template_overrides)
931
+ else:
932
+ raise ValueError(f"Unknown reference spec version: {vers}")
933
+ # TODO: we make dircache by iterating over all entries, but for Spec >= 1,
934
+ # can replace with programmatic. Is it even needed for mapper interface?
935
+
936
+ def _process_references0(self, references):
937
+ """Make reference dict for Spec Version 0"""
938
+ self.references = references
939
+
940
def _process_references1(self, references, template_overrides=None):
    """Make reference dict for Spec Version 1

    Populates ``self.references`` from the "refs" entry, expanding templates
    in the URL of each list-valued reference, then adds the references
    produced from the "gen" entry. ``template_overrides`` is not read here;
    ``_process_templates`` uses ``self.template_overrides``.
    """
    if not self.simple_templates or self.templates:
        import jinja2
    self.references = {}
    self._process_templates(references.get("templates", {}))

    @lru_cache(1000)
    def _render_jinja(u):
        return jinja2.Template(u).render(**self.templates)

    for k, v in references.get("refs", {}).items():
        if isinstance(v, str):
            # NOTE(review): the decoded value assigned here is overwritten by
            # the next statement, so "base64:" strings are stored undecoded
            # (``_cat_common`` decodes them on read). Confirm whether storing
            # the decoded bytes was intended before changing this.
            if v.startswith("base64:"):
                self.references[k] = base64.b64decode(v[7:])
            self.references[k] = v
        elif self.templates:
            u = v[0]
            if "{{" in u:
                if self.simple_templates:
                    # turn "{{name}}" into "{name}" and use str.format
                    u = (
                        u.replace("{{", "{")
                        .replace("}}", "}")
                        .format(**self.templates)
                    )
                else:
                    u = _render_jinja(u)
            self.references[k] = [u] if len(v) == 1 else [u, v[1], v[2]]
        else:
            self.references[k] = v
    self.references.update(self._process_gen(references.get("gen", [])))
970
+
971
+ def _process_templates(self, tmp):
972
+ self.templates = {}
973
+ if self.template_overrides is not None:
974
+ tmp.update(self.template_overrides)
975
+ for k, v in tmp.items():
976
+ if "{{" in v:
977
+ import jinja2
978
+
979
+ self.templates[k] = lambda temp=v, **kwargs: jinja2.Template(
980
+ temp
981
+ ).render(**kwargs)
982
+ else:
983
+ self.templates[k] = v
984
+
985
+ def _process_gen(self, gens):
986
+ out = {}
987
+ for gen in gens:
988
+ dimension = {
989
+ k: v
990
+ if isinstance(v, list)
991
+ else range(v.get("start", 0), v["stop"], v.get("step", 1))
992
+ for k, v in gen["dimensions"].items()
993
+ }
994
+ products = (
995
+ dict(zip(dimension.keys(), values))
996
+ for values in itertools.product(*dimension.values())
997
+ )
998
+ for pr in products:
999
+ import jinja2
1000
+
1001
+ key = jinja2.Template(gen["key"]).render(**pr, **self.templates)
1002
+ url = jinja2.Template(gen["url"]).render(**pr, **self.templates)
1003
+ if ("offset" in gen) and ("length" in gen):
1004
+ offset = int(
1005
+ jinja2.Template(gen["offset"]).render(**pr, **self.templates)
1006
+ )
1007
+ length = int(
1008
+ jinja2.Template(gen["length"]).render(**pr, **self.templates)
1009
+ )
1010
+ out[key] = [url, offset, length]
1011
+ elif ("offset" in gen) ^ ("length" in gen):
1012
+ raise ValueError(
1013
+ "Both 'offset' and 'length' are required for a "
1014
+ "reference generator entry if either is provided."
1015
+ )
1016
+ else:
1017
+ out[key] = [url]
1018
+ return out
1019
+
1020
def _dircache_from_items(self):
    """Rebuild ``self.dircache`` (directory -> list of entries) from all references."""
    self.dircache = {"": []}
    it = self.references.items()
    for path, part in it:
        if isinstance(part, (bytes, str)):
            # inline data: length of the stored value
            size = len(part)
        elif len(part) == 1:
            # [url] only: size is not known from the reference
            size = None
        else:
            _, _, size = part
        par = path.rsplit("/", 1)[0] if "/" in path else ""
        par0 = par
        subdirs = [par0]
        while par0 and par0 not in self.dircache:
            # collect parent directories
            par0 = self._parent(par0)
            subdirs.append(par0)

        # walk from the nearest already-known ancestor down to ``par``
        subdirs.reverse()
        for parent, child in zip(subdirs, subdirs[1:]):
            # register newly discovered directories
            assert child not in self.dircache
            assert parent in self.dircache
            self.dircache[parent].append(
                {"name": child, "type": "directory", "size": 0}
            )
            self.dircache[child] = []

        self.dircache[par].append({"name": path, "type": "file", "size": size})
1049
+
1050
+ def _open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
1051
+ data = self.cat_file(path) # load whole chunk into memory
1052
+ return io.BytesIO(data)
1053
+
1054
def ls(self, path, detail=True, **kwargs):
    """List the entries under ``path``.

    Lazy references answer the listing themselves; otherwise the directory
    cache is built on first use. Returns info dicts when ``detail`` is true,
    names otherwise.

    Raises
    ------
    FileNotFoundError
        If ``path`` is not known.
    """
    path = self._strip_protocol(path)
    if isinstance(self.references, LazyReferenceMapper):
        try:
            return self.references.ls(path, detail)
        except KeyError:
            pass
        raise FileNotFoundError(f"'{path}' is not a known key")
    if not self.dircache:
        self._dircache_from_items()
    entries = self._ls_from_cache(path)
    if entries is None:
        raise FileNotFoundError(path)
    return entries if detail else [entry["name"] for entry in entries]
1070
+
1071
def exists(self, path, **kwargs):  # overwrite auto-sync version
    """True if ``path`` is a directory or a file of this filesystem."""
    if self.isdir(path):
        return True
    return self.isfile(path)
1073
+
1074
def isdir(self, path):  # overwrite auto-sync version
    """True if ``path`` is a directory.

    Uses the directory cache when it is populated, the top-level listing
    for lazy references, and otherwise a scan of the reference keys for
    one starting with ``path`` + "/".
    """
    if self.dircache:
        return path in self.dircache
    if isinstance(self.references, LazyReferenceMapper):
        return path in self.references.listdir("")
    # this may be faster than building dircache for single calls, but
    # by looping will be slow for many calls; could cache it?
    prefix = f"{path}/"
    return any(key.startswith(prefix) for key in self.references)
1083
+
1084
def isfile(self, path):  # overwrite auto-sync version
    """True if ``path`` is a key of the references."""
    known = path in self.references
    return known
1086
+
1087
+ async def _ls(self, path, detail=True, **kwargs): # calls fast sync code
1088
+ return self.ls(path, detail, **kwargs)
1089
+
1090
def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
    """Return the sorted reference keys that start with ``path``.

    With ``withdirs`` the generic parent implementation is used instead.
    With ``detail`` a dict of key -> info entry is returned, built from the
    directory cache.
    """
    if withdirs:
        return super().find(
            path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs
        )
    if not path:
        matches = sorted(self.references)
    else:
        path = self._strip_protocol(path)
        matches = sorted(key for key in self.references if key.startswith(path))
    if not detail:
        return matches
    if not self.dircache:
        self._dircache_from_items()
    return {key: self._ls_from_cache(key)[0] for key in matches}
1106
+
1107
def info(self, path, **kwargs):
    """Return the info dict ("name", "type", "size") for ``path``.

    Inline references report the length of the stored value and
    ``[url, offset, size]`` references report their size. For a whole-file
    reference the size is asked of the target filesystem. Paths that are not
    references are looked up through ``ls`` and reported as a directory when
    no entry with that name is listed.
    """
    ref = self.references.get(path)
    if ref is None:
        listing = self.ls(path, True)
        matches = [entry for entry in listing if entry["name"] == path]
        if not matches:
            return {"name": path, "type": "directory", "size": 0}
    elif isinstance(ref, (str, bytes)):
        # decode base64 here
        return {"name": path, "type": "file", "size": len(ref)}
    elif len(ref) > 1:
        return {"name": path, "type": "file", "size": ref[2]}
    else:
        matches = [{"name": path, "type": "file", "size": None}]
    entry = matches[0]
    if entry["size"] is None:
        # if this is a whole remote file, update size using remote FS
        prot, _ = split_protocol(self.references[path][0])
        entry["size"] = self.fss[prot].size(self.references[path][0])
    return entry
1127
+
1128
+ async def _info(self, path, **kwargs): # calls fast sync code
1129
+ return self.info(path)
1130
+
1131
+ async def _rm_file(self, path, **kwargs):
1132
+ self.references.pop(
1133
+ path, None
1134
+ ) # ignores FileNotFound, just as well for directories
1135
+ self.dircache.clear() # this is a bit heavy handed
1136
+
1137
+ async def _pipe_file(self, path, data):
1138
+ # can be str or bytes
1139
+ self.references[path] = data
1140
+ self.dircache.clear() # this is a bit heavy handed
1141
+
1142
+ async def _put_file(self, lpath, rpath, **kwargs):
1143
+ # puts binary
1144
+ with open(lpath, "rb") as f:
1145
+ self.references[rpath] = f.read()
1146
+ self.dircache.clear() # this is a bit heavy handed
1147
+
1148
def save_json(self, url, **storage_options):
    """Write modified references into new location

    Bytes values are stored as ASCII text when they decode as ASCII and as
    "base64:"-prefixed text otherwise; other values are written unchanged.
    The output is a version-1 document with a single "refs" entry.
    """

    def _as_json_value(value):
        if not isinstance(value, bytes):
            return value
        try:
            return value.decode("ascii")
        except UnicodeDecodeError:
            return (b"base64:" + base64.b64encode(value)).decode()

    out = {}
    for key, value in self.references.items():
        out[key] = _as_json_value(value)
    with fsspec.open(url, "wb", **storage_options) as f:
        f.write(json.dumps({"version": 1, "refs": out}).encode())
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/sftp.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import logging
3
+ import os
4
+ import types
5
+ import uuid
6
+ from stat import S_ISDIR, S_ISLNK
7
+
8
+ import paramiko
9
+
10
+ from .. import AbstractFileSystem
11
+ from ..utils import infer_storage_options
12
+
13
+ logger = logging.getLogger("fsspec.sftp")
14
+
15
+
16
class SFTPFileSystem(AbstractFileSystem):
    """Files over SFTP/SSH

    Peer-to-peer filesystem over SSH using paramiko.

    Note: if using this with the ``open`` or ``open_files``, with full URLs,
    there is no way to tell if a path is relative, so all paths are assumed
    to be absolute.
    """

    protocol = "sftp", "ssh"

    def __init__(self, host, **ssh_kwargs):
        """

        Parameters
        ----------
        host: str
            Hostname or IP as a string
        temppath: str
            Location on the server to put files, when within a transaction
            (defaults to ``/tmp``)
        ssh_kwargs: dict
            Parameters passed on to connection. See details in
            https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
            May include port, username, password...
        """
        if self._cached:
            # presumably set by the base class for a reused instance; nothing
            # is re-initialised in that case
            return
        super().__init__(**ssh_kwargs)
        # temppath is removed here so it is not forwarded to SSHClient.connect
        self.temppath = ssh_kwargs.pop("temppath", "/tmp")  # remote temp directory
        self.host = host
        self.ssh_kwargs = ssh_kwargs
        self._connect()

    def _connect(self):
        """Open the SSH connection and an SFTP channel on top of it."""
        logger.debug("Connecting to SFTP server %s", self.host)
        self.client = paramiko.SSHClient()
        # NOTE(review): AutoAddPolicy accepts host keys that are not already
        # known; confirm this is acceptable for the deployment.
        self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.client.connect(self.host, **self.ssh_kwargs)
        self.ftp = self.client.open_sftp()

    @classmethod
    def _strip_protocol(cls, path):
        """Return only the path component of ``path``."""
        return infer_storage_options(path)["path"]

    @staticmethod
    def _get_kwargs_from_urls(urlpath):
        """Return the options parsed from ``urlpath`` minus path and protocol."""
        out = infer_storage_options(urlpath)
        out.pop("path", None)
        out.pop("protocol", None)
        return out

    def mkdir(self, path, create_parents=True, mode=511):
        """Create directory ``path``.

        Parameters
        ----------
        path: str
            Directory to create
        create_parents: bool
            If True, missing intermediate directories are created as well
        mode: int
            Permission bits passed to the server (511 == 0o777)

        Raises
        ------
        FileExistsError
            If ``path`` already exists
        """
        logger.debug("Creating folder %s", path)
        if self.exists(path):
            raise FileExistsError(f"File exists: {path}")

        if create_parents:
            # forward the requested mode; it was previously dropped here
            self.makedirs(path, mode=mode)
        else:
            self.ftp.mkdir(path, mode)

    def makedirs(self, path, exist_ok=False, mode=511):
        """Create ``path`` and every missing parent directory.

        Raises
        ------
        FileExistsError
            If ``path`` already exists and ``exist_ok`` is False
        """
        if self.exists(path) and not exist_ok:
            raise FileExistsError(f"File exists: {path}")

        new_path = "/" if path[:1] == "/" else ""

        for part in path.split("/"):
            if not part:
                continue
            if new_path in ("", "/"):
                # first component: do not insert a second separator after the
                # root marker (avoids producing "//part")
                new_path = new_path + part
            else:
                new_path = f"{new_path}/{part}"
            if not self.exists(new_path):
                self.ftp.mkdir(new_path, mode)

    def rmdir(self, path):
        """Remove directory ``path`` on the server."""
        logger.debug("Removing folder %s", path)
        self.ftp.rmdir(path)

    def info(self, path):
        """Return the decoded stat record for ``path`` with ``name`` set to it."""
        stat = self._decode_stat(self.ftp.stat(path))
        stat["name"] = path
        return stat

    @staticmethod
    def _decode_stat(stat, parent_path=None):
        """Convert a stat-like object into an info dict.

        ``name`` is left empty unless ``parent_path`` is given, in which case
        it is ``parent_path`` joined with ``stat.filename``. Times are
        returned as timezone-aware UTC datetimes.
        """
        if S_ISDIR(stat.st_mode):
            t = "directory"
        elif S_ISLNK(stat.st_mode):
            t = "link"
        else:
            t = "file"
        out = {
            "name": "",
            "size": stat.st_size,
            "type": t,
            "uid": stat.st_uid,
            "gid": stat.st_gid,
            "time": datetime.datetime.fromtimestamp(
                stat.st_atime, tz=datetime.timezone.utc
            ),
            "mtime": datetime.datetime.fromtimestamp(
                stat.st_mtime, tz=datetime.timezone.utc
            ),
        }
        if parent_path:
            out["name"] = "/".join([parent_path.rstrip("/"), stat.filename])
        return out

    def ls(self, path, detail=False):
        """List ``path``: info dicts if ``detail``, else sorted names."""
        logger.debug("Listing folder %s", path)
        stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
        if detail:
            return stats
        else:
            paths = [stat["name"] for stat in stats]
            return sorted(paths)

    def put(self, lpath, rpath, callback=None, **kwargs):
        """Upload local file ``lpath`` to ``rpath``.

        ``callback`` and extra keyword arguments are accepted but unused.
        """
        logger.debug("Put file %s into %s", lpath, rpath)
        self.ftp.put(lpath, rpath)

    def get_file(self, rpath, lpath, **kwargs):
        """Download ``rpath`` to ``lpath``.

        For a remote directory only the local directory is created.
        """
        if self.isdir(rpath):
            os.makedirs(lpath, exist_ok=True)
        else:
            self.ftp.get(self._strip_protocol(rpath), lpath)

    def _open(self, path, mode="rb", block_size=None, **kwargs):
        """
        block_size: int or None
            If 0, no buffering, if 1, line buffering, if >1, buffer that many
            bytes, if None use default from paramiko.
        """
        logger.debug("Opening file %s", path)
        if kwargs.get("autocommit", True) is False:
            # writes to temporary file, move on commit
            path2 = "/".join([self.temppath, str(uuid.uuid4())])
            f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
            f.temppath = path2
            f.targetpath = path
            f.fs = self
            # attach commit/discard as bound methods of this file object
            f.commit = types.MethodType(commit_a_file, f)
            f.discard = types.MethodType(discard_a_file, f)
        else:
            f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
        return f

    def _rm(self, path):
        """Remove ``path``, using rmdir for directories and remove otherwise."""
        if self.isdir(path):
            self.ftp.rmdir(path)
        else:
            self.ftp.remove(path)

    def mv(self, old, new):
        """Rename ``old`` to ``new`` on the server via ``posix_rename``."""
        logger.debug("Renaming %s into %s", old, new)
        self.ftp.posix_rename(old, new)
173
+
174
+
175
def commit_a_file(self):
    """Move the temporary file to its target path using the owning filesystem."""
    source, destination = self.temppath, self.targetpath
    self.fs.mv(source, destination)
177
+
178
+
179
def discard_a_file(self):
    """Remove the temporary file using the owning filesystem's ``_rm``."""
    leftover = self.temppath
    self.fs._rm(leftover)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/smb.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This module contains SMBFileSystem class responsible for handling access to
3
+ Windows Samba network shares by using package smbprotocol
4
+ """
5
+
6
+ import datetime
7
+ import uuid
8
+ from stat import S_ISDIR, S_ISLNK
9
+
10
+ import smbclient
11
+
12
+ from .. import AbstractFileSystem
13
+ from ..utils import infer_storage_options
14
+
15
+ # ! pylint: disable=bad-continuation
16
+
17
+
18
class SMBFileSystem(AbstractFileSystem):
    """Allow reading and writing to Windows and Samba network shares.

    When using `fsspec.open()` for getting a file-like object the URI
    should be specified as this format:
    ``smb://workgroup;user:password@server:port/share/folder/file.csv``.

    Example::

        >>> import fsspec
        >>> with fsspec.open(
        ...     'smb://myuser:mypassword@myserver.com/' 'share/folder/file.csv'
        ... ) as smbfile:
        ...     df = pd.read_csv(smbfile, sep='|', header=None)

    Note that you need to pass in a valid hostname or IP address for the host
    component of the URL. Do not use the Windows/NetBIOS machine name for the
    host component.

    The first component of the path in the URL points to the name of the shared
    folder. Subsequent path components will point to the directory/folder/file.

    The URL components ``workgroup`` , ``user``, ``password`` and ``port`` may be
    optional.

    .. note::

        For working this source require `smbprotocol`_ to be installed, e.g.::

            $ pip install smbprotocol
            # or
            # pip install smbprotocol[kerberos]

    .. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements

    Note: if using this with the ``open`` or ``open_files``, with full URLs,
    there is no way to tell if a path is relative, so all paths are assumed
    to be absolute.
    """

    protocol = "smb"

    # pylint: disable=too-many-arguments
    def __init__(
        self,
        host,
        port=None,
        username=None,
        password=None,
        timeout=60,
        encrypt=None,
        share_access=None,
        register_session_retries=5,
        **kwargs,
    ):
        """
        You can use _get_kwargs_from_urls to get some kwargs from
        a reasonable SMB url.

        Authentication will be anonymous or integrated if username/password are not
        given.

        Parameters
        ----------
        host: str
            The remote server name/ip to connect to
        port: int or None
            Port to connect with. Usually 445, sometimes 139.
        username: str or None
            Username to connect with. Required if Kerberos auth is not being used.
        password: str or None
            User's password on the server, if using username
        timeout: int
            Connection timeout in seconds
        encrypt: bool
            Whether to force encryption or not, once this has been set to True
            the session cannot be changed back to False.
        share_access: str or None
            Specifies the default access applied to file open operations
            performed with this file system object.
            This affects whether other processes can concurrently open a handle
            to the same file.

            - None (the default): exclusively locks the file until closed.
            - 'r': Allow other handles to be opened with read access.
            - 'w': Allow other handles to be opened with write access.
            - 'd': Allow other handles to be opened with delete access.
        register_session_retries: int
            Number of attempts made to register the session when connecting
        temppath: str
            Optional keyword; path inside the share under which temporary
            files are written when opening with ``autocommit=False``
            (defaults to the share root)
        """
        super().__init__(**kwargs)
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.timeout = timeout
        self.encrypt = encrypt
        self.temppath = kwargs.pop("temppath", "")
        self.share_access = share_access
        self.register_session_retries = register_session_retries
        self._connect()

    @property
    def _port(self):
        """Port actually used: 445 when none was given."""
        return 445 if self.port is None else self.port

    def _connect(self):
        """Register an SMB session, retrying on any error.

        Up to ``register_session_retries`` attempts are made, sleeping 0.1s
        after each failure. If every attempt fails, a ``RuntimeWarning`` is
        emitted instead of failing silently; no exception is raised, so
        construction still succeeds as before.
        """
        import time
        import warnings

        last_error = None
        for _ in range(self.register_session_retries):
            try:
                smbclient.register_session(
                    self.host,
                    username=self.username,
                    password=self.password,
                    port=self._port,
                    encrypt=self.encrypt,
                    connection_timeout=self.timeout,
                )
                return
            except Exception as exc:
                # remember the failure so it can be reported if all attempts fail
                last_error = exc
                time.sleep(0.1)

        if last_error is not None:
            warnings.warn(
                f"Could not register SMB session with {self.host} after "
                f"{self.register_session_retries} attempts: {last_error!r}",
                RuntimeWarning,
                stacklevel=2,
            )

    @classmethod
    def _strip_protocol(cls, path):
        """Return only the path component of ``path``."""
        return infer_storage_options(path)["path"]

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Return the options parsed from ``path`` minus path and protocol."""
        # smb://workgroup;user:password@host:port/share/folder/file.csv
        out = infer_storage_options(path)
        out.pop("path", None)
        out.pop("protocol", None)
        return out

    def mkdir(self, path, create_parents=True, **kwargs):
        """Create directory ``path``; with ``create_parents`` also its parents.

        Extra keyword arguments are forwarded to smbclient.
        """
        wpath = _as_unc_path(self.host, path)
        if create_parents:
            smbclient.makedirs(wpath, exist_ok=False, port=self._port, **kwargs)
        else:
            smbclient.mkdir(wpath, port=self._port, **kwargs)

    def makedirs(self, path, exist_ok=False):
        """Create ``path`` and parents; a no-op when ``path`` is only a share."""
        if _share_has_path(path):
            wpath = _as_unc_path(self.host, path)
            smbclient.makedirs(wpath, exist_ok=exist_ok, port=self._port)

    def rmdir(self, path):
        """Remove directory ``path``; a no-op when ``path`` is only a share."""
        if _share_has_path(path):
            wpath = _as_unc_path(self.host, path)
            smbclient.rmdir(wpath, port=self._port)

    def info(self, path, **kwargs):
        """Return an info dict for ``path`` built from ``smbclient.stat``.

        Directory names get a trailing ``/``. ``time`` and ``mtime`` are the
        raw ``st_atime`` / ``st_mtime`` values.
        """
        wpath = _as_unc_path(self.host, path)
        stats = smbclient.stat(wpath, port=self._port, **kwargs)
        if S_ISDIR(stats.st_mode):
            stype = "directory"
        elif S_ISLNK(stats.st_mode):
            stype = "link"
        else:
            stype = "file"
        res = {
            "name": path + "/" if stype == "directory" else path,
            "size": stats.st_size,
            "type": stype,
            "uid": stats.st_uid,
            "gid": stats.st_gid,
            "time": stats.st_atime,
            "mtime": stats.st_mtime,
        }
        return res

    def created(self, path):
        """Return the created timestamp of a file as a datetime.datetime"""
        wpath = _as_unc_path(self.host, path)
        stats = smbclient.stat(wpath, port=self._port)
        return datetime.datetime.fromtimestamp(stats.st_ctime, tz=datetime.timezone.utc)

    def modified(self, path):
        """Return the modified timestamp of a file as a datetime.datetime"""
        wpath = _as_unc_path(self.host, path)
        stats = smbclient.stat(wpath, port=self._port)
        return datetime.datetime.fromtimestamp(stats.st_mtime, tz=datetime.timezone.utc)

    def ls(self, path, detail=True, **kwargs):
        """List ``path``: info dicts if ``detail``, else joined path names.

        With ``detail`` each entry costs one additional ``info`` call.
        """
        unc = _as_unc_path(self.host, path)
        listed = smbclient.listdir(unc, port=self._port, **kwargs)
        dirs = ["/".join([path.rstrip("/"), p]) for p in listed]
        if detail:
            dirs = [self.info(d) for d in dirs]
        return dirs

    # pylint: disable=too-many-arguments
    def _open(
        self,
        path,
        mode="rb",
        block_size=-1,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """
        block_size: int or None
            If 0, no buffering, 1, line buffering, >1, buffer that many bytes

        Notes
        -----
        By specifying 'share_access' in 'kwargs' it is possible to override the
        default shared access setting applied in the constructor of this object.
        """
        # None or a negative value means "use the default buffering" (-1)
        bls = block_size if block_size is not None and block_size >= 0 else -1
        wpath = _as_unc_path(self.host, path)
        share_access = kwargs.pop("share_access", self.share_access)
        if "w" in mode and autocommit is False:
            # deferred write: data goes to a temp file, moved on commit
            temp = _as_temp_path(self.host, path, self.temppath)
            return SMBFileOpener(
                wpath, temp, mode, port=self._port, block_size=bls, **kwargs
            )
        return smbclient.open_file(
            wpath,
            mode,
            buffering=bls,
            share_access=share_access,
            port=self._port,
            **kwargs,
        )

    def copy(self, path1, path2, **kwargs):
        """Copy within two locations in the same filesystem"""
        wpath1 = _as_unc_path(self.host, path1)
        wpath2 = _as_unc_path(self.host, path2)
        smbclient.copyfile(wpath1, wpath2, port=self._port, **kwargs)

    def _rm(self, path):
        """Remove a file or directory; a no-op when ``path`` is only a share."""
        if _share_has_path(path):
            wpath = _as_unc_path(self.host, path)
            stats = smbclient.stat(wpath, port=self._port)
            if S_ISDIR(stats.st_mode):
                smbclient.rmdir(wpath, port=self._port)
            else:
                smbclient.remove(wpath, port=self._port)

    def mv(self, path1, path2, recursive=None, maxdepth=None, **kwargs):
        """Rename ``path1`` to ``path2``.

        ``recursive`` and ``maxdepth`` are accepted but unused.
        """
        wpath1 = _as_unc_path(self.host, path1)
        wpath2 = _as_unc_path(self.host, path2)
        smbclient.rename(wpath1, wpath2, port=self._port, **kwargs)
263
+
264
+
265
def _as_unc_path(host, path):
    """Build a UNC path: ``\\\\host`` followed by ``path`` with ``/`` turned into ``\\``."""
    return "\\\\{}{}".format(host, path.replace("/", "\\"))
269
+
270
+
271
def _as_temp_path(host, path, temppath):
    """Return a UNC path for a fresh, uniquely named temp file.

    The file is placed in the same share as ``path`` (its second
    ``/``-separated component), under ``temppath``.
    """
    share_name = path.split("/")[1]
    unique = uuid.uuid4()
    return _as_unc_path(host, f"/{share_name}{temppath}/{unique}")
276
+
277
+
278
def _share_has_path(path):
    """Tell whether ``path`` has a component below the share name.

    Decided by counting ``/``: more than one is needed, or more than two
    when ``path`` ends with a slash.
    """
    required = 2 if path.endswith("/") else 1
    return path.count("/") > required
283
+
284
+
285
class SMBFileOpener:
    """writes to remote temporary file, move on commit

    Wraps the file object opened on the temporary path; attribute access
    not found on this object is delegated to that file object.
    """

    def __init__(self, path, temp, mode, port=445, block_size=-1, **kwargs):
        """
        Parameters
        ----------
        path: str
            Final destination, used by ``commit``
        temp: str
            Temporary location that is actually opened and written
        mode: str
            Mode passed to ``smbclient.open_file``
        port: int
            Port passed to smbclient calls
        block_size: int
            Passed as ``buffering`` to ``smbclient.open_file``
        kwargs
            Forwarded to ``smbclient.open_file``
        """
        self.path = path
        self.temp = temp
        self.mode = mode
        self.block_size = block_size
        self.kwargs = kwargs
        self.smbfile = None
        self._incontext = False
        self.port = port
        self._open()

    def _open(self):
        """Open the temporary file unless an open handle already exists."""
        if self.smbfile is None or self.smbfile.closed:
            self.smbfile = smbclient.open_file(
                self.temp,
                self.mode,
                port=self.port,
                buffering=self.block_size,
                **self.kwargs,
            )

    def commit(self):
        """Move temp file to definitive on success."""
        # TODO: use transaction support in SMB protocol
        smbclient.replace(self.temp, self.path, port=self.port)

    def discard(self):
        """Remove the temp file on failure."""
        smbclient.remove(self.temp, port=self.port)

    def __fspath__(self):
        return self.path

    def __iter__(self):
        return self.smbfile.__iter__()

    def __getattr__(self, item):
        # __getattr__ only runs when normal lookup fails. If ``smbfile``
        # itself is missing (instance created without __init__), delegating
        # would look up ``self.smbfile`` again and recurse without end.
        if item == "smbfile":
            raise AttributeError(item)
        return getattr(self.smbfile, item)

    def __enter__(self):
        self._incontext = True
        return self.smbfile.__enter__()

    def __exit__(self, exc_type, exc_value, traceback):
        self._incontext = False
        self.smbfile.__exit__(exc_type, exc_value, traceback)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import tarfile
3
+
4
+ import fsspec
5
+ from fsspec.archive import AbstractArchiveFileSystem
6
+ from fsspec.compression import compr
7
+ from fsspec.utils import infer_compression
8
+
9
+ typemap = {b"0": "file", b"5": "directory"}
10
+
11
+ logger = logging.getLogger("tar")
12
+
13
+
14
class TarFileSystem(AbstractArchiveFileSystem):
    """Compressed Tar archives as a file-system (read-only)

    Supports the following formats:
    tar.gz, tar.bz2, tar.xz
    """

    root_marker = ""
    protocol = "tar"
    cachable = False

    def __init__(
        self,
        fo="",
        index_store=None,
        target_options=None,
        target_protocol=None,
        compression=None,
        **kwargs,
    ):
        """Open the archive and index its members.

        ``fo`` is either a path/URL string, opened through ``fsspec.open``
        with ``target_protocol`` / ``target_options``, or an already open
        file-like object. When ``compression`` is None, it is inferred from
        whatever file name the object exposes, if any.
        """
        super().__init__(**kwargs)
        if not target_options:
            target_options = {}

        if isinstance(fo, str):
            # keep the reference
            self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
            fo = self.of.open()

        # Try to infer compression.
        if compression is None:
            name = None

            # `fo` might expose its file name in several ways; probe them in
            # order: "original" (name propagated to archive-like
            # filesystems), "path", "name", and finally an ``info()`` call.
            try:
                for attr in ("original", "path", "name"):
                    if hasattr(fo, attr):
                        name = getattr(fo, attr)
                        break
                else:
                    if hasattr(fo, "info"):
                        name = fo.info()["name"]
            except Exception as ex:
                logger.warning(
                    f"Unable to determine file name, not inferring compression: {ex}"
                )

            if name is not None:
                compression = infer_compression(name)
                logger.info(f"Inferred compression {compression} from file name {name}")

        if compression is not None:
            # TODO: tarfile already implements compression with modes like "'r:gz'",
            #  but then would seek to offset in the file work?
            fo = compr[compression](fo)

        # the whole instance is a context
        self._fo_ref = self.fo = fo
        self.tar = tarfile.TarFile(fileobj=self.fo)
        self.dir_cache = None

        self.index_store = index_store
        self.index = None
        self._index()

    def _index(self):
        """Build ``self.index``: member name -> (info dict, data offset)."""
        # TODO: load and set saved index, if exists
        entries = {}
        for member in self.tar:
            details = member.get_info()
            key = details["name"].rstrip("/")
            details["type"] = typemap.get(details["type"], "file")
            entries[key] = (details, member.offset_data)

        self.index = entries
        # TODO: save index to self.index_store here, if set

    def _get_dirs(self):
        """Populate ``self.dir_cache`` once; later calls do nothing."""
        if self.dir_cache is None:
            # This enables ls to get directories as children as well as files
            cache = {}
            for dirname in self._all_dirnames(self.tar.getnames()):
                cache[dirname] = {"name": dirname, "size": 0, "type": "directory"}
            self.dir_cache = cache

            # real members overwrite any synthetic directory entry of the
            # same name
            for member in self.tar.getmembers():
                details = member.get_info()
                details["name"] = details["name"].rstrip("/")
                details["type"] = typemap.get(details["type"], "file")
                self.dir_cache[details["name"]] = details

    def _open(self, path, mode="rb", **kwargs):
        """Return a readable file object for archive member ``path``.

        Raises ValueError for any mode other than "rb" and for members that
        are not regular files.
        """
        if mode != "rb":
            raise ValueError("Read-only filesystem implementation")
        details, _offset = self.index[path]
        if not details["type"] == "file":
            raise ValueError("Can only handle regular files")
        return self.tar.extractfile(path)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/webhdfs.py ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
2
+
3
+ import logging
4
+ import os
5
+ import secrets
6
+ import shutil
7
+ import tempfile
8
+ import uuid
9
+ from contextlib import suppress
10
+ from urllib.parse import quote
11
+
12
+ import requests
13
+
14
+ from ..spec import AbstractBufferedFile, AbstractFileSystem
15
+ from ..utils import infer_storage_options, tokenize
16
+
17
+ logger = logging.getLogger("webhdfs")
18
+
19
+
20
+ class WebHDFS(AbstractFileSystem):
21
+ """
22
+ Interface to HDFS over HTTP using the WebHDFS API. Supports also HttpFS gateways.
23
+
24
+ Four auth mechanisms are supported:
25
+
26
+ insecure: no auth is done, and the user is assumed to be whoever they
27
+ say they are (parameter ``user``), or a predefined value such as
28
+ "dr.who" if not given
29
+ spnego: when kerberos authentication is enabled, auth is negotiated by
30
+ requests_kerberos https://github.com/requests/requests-kerberos .
31
+ This establishes a session based on existing kinit login and/or
32
+ specified principal/password; parameters are passed with ``kerb_kwargs``
33
+ token: uses an existing Hadoop delegation token from another secured
34
+ service. Indeed, this client can also generate such tokens when
35
+ not insecure. Note that tokens expire, but can be renewed (by a
36
+ previously specified user) and may allow for proxying.
37
+ basic-auth: used when both parameter ``user`` and parameter ``password``
38
+ are provided.
39
+
40
+ """
41
+
42
+ tempdir = str(tempfile.gettempdir())
43
+ protocol = "webhdfs", "webHDFS"
44
+
45
def __init__(
    self,
    host,
    port=50070,
    kerberos=False,
    token=None,
    user=None,
    password=None,
    proxy_to=None,
    kerb_kwargs=None,
    data_proxy=None,
    use_https=False,
    session_cert=None,
    session_verify=True,
    **kwargs,
):
    """
    Parameters
    ----------
    host: str
        Name-node address
    port: int
        Port for webHDFS
    kerberos: bool
        Whether to authenticate with kerberos for this connection
    token: str or None
        If given, use this token on every call to authenticate. A user
        and user-proxy may be encoded in the token and should not be also
        given
    user: str or None
        If given, assert the user name to connect with
    password: str or None
        If given, assert the password to use for basic auth. If password
        is provided, user must be provided also
    proxy_to: str or None
        If given, the user has the authority to proxy, and this value is
        the user in whose name actions are taken
    kerb_kwargs: dict
        Any extra arguments for HTTPKerberosAuth, see
        `<https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py>`_
    data_proxy: dict, callable or None
        If given, map data-node addresses. This can be necessary if the
        HDFS cluster is behind a proxy, running on Docker or otherwise has
        a mismatch between the host-names given by the name-node and the
        address by which to refer to them from the client. If a dict,
        maps host names ``host->data_proxy[host]``; if a callable, full
        URLs are passed, and function must conform to
        ``url->data_proxy(url)``.
    use_https: bool
        Whether to connect to the Name-node using HTTPS instead of HTTP
    session_cert: str or Tuple[str, str] or None
        Path to a certificate file, or tuple of (cert, key) files to use
        for the requests.Session
    session_verify: str, bool or None
        Path to a certificate file to use for verifying the requests.Session.
    kwargs

    Raises
    ------
    ValueError
        If a token is combined with ``user`` or ``proxy_to``, if a password
        is given without a user, or if kerberos is combined with ``user``
    """
    if self._cached:
        # presumably set by the base class for a reused instance; nothing
        # is re-initialised in that case
        return
    super().__init__(**kwargs)
    self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"  # noqa
    self.kerb = kerberos
    self.kerb_kwargs = kerb_kwargs or {}
    # query parameters attached to every request made through _call
    self.pars = {}
    self.proxy = data_proxy or {}
    if token is not None:
        if user is not None or proxy_to is not None:
            raise ValueError(
                "If passing a delegation token, must not set "
                "user or proxy_to, as these are encoded in the"
                " token"
            )
        self.pars["delegation"] = token
    self.user = user
    self.password = password

    if password is not None:
        if user is None:
            # the two literals were previously joined without a space
            raise ValueError(
                "If passing a password, the user must also be "
                "set in order to set up the basic-auth"
            )
    else:
        # without a password the user name travels as a query parameter
        if user is not None:
            self.pars["user.name"] = user

    if proxy_to is not None:
        self.pars["doas"] = proxy_to
    if kerberos and user is not None:
        raise ValueError(
            "If using Kerberos auth, do not specify the "
            "user, this is handled by kinit."
        )

    self.session_cert = session_cert
    self.session_verify = session_verify

    self._connect()

    self._fsid = f"webhdfs_{tokenize(host, port)}"
145
+
146
+ @property
147
def fsid(self):
    """Identifier of this filesystem instance, as stored in ``_fsid``."""
    identifier = self._fsid
    return identifier
149
+
150
def _connect(self):
    """Create the ``requests`` session and configure cert, verify and auth.

    Kerberos auth is installed when ``self.kerb`` is set; basic auth is
    installed afterwards (replacing it) when both user and password are set.
    """
    self.session = requests.Session()

    if self.session_cert:
        self.session.cert = self.session_cert

    self.session.verify = self.session_verify

    if self.kerb:
        # imported lazily: only needed for kerberos connections
        from requests_kerberos import HTTPKerberosAuth

        self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)

    if not (self.user is None or self.password is None):
        from requests.auth import HTTPBasicAuth

        self.session.auth = HTTPBasicAuth(self.user, self.password)
167
+
168
def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
    """Send one WebHDFS operation and return the response object.

    Parameters
    ----------
    op: str
        Operation name; sent upper-cased as the ``op`` query parameter
    method: str
        HTTP method name; sent upper-cased
    path: str or None
        Path appended (URL-quoted) to the base URL
    data
        Request body passed through to the session
    redirect: bool
        Whether redirects are followed
    kwargs
        Extra query parameters; ``self.pars`` entries take precedence

    Raises
    ------
    ValueError, PermissionError, FileNotFoundError, RuntimeError
        When the response status is one of 400, 401, 403, 404, 500 and its
        body is a JSON ``RemoteException``; the type depends on the remote
        exception name
    """
    target = self._apply_proxy(self.url + quote(path or "", safe="/="))
    params = {**kwargs, **self.pars}
    params["op"] = op.upper()
    logger.debug("sending %s with %s", target, method)
    response = self.session.request(
        method=method.upper(),
        url=target,
        params=params,
        data=data,
        allow_redirects=redirect,
    )
    if response.status_code in [400, 401, 403, 404, 500]:
        try:
            payload = response.json()
            msg = payload["RemoteException"]["message"]
            exp = payload["RemoteException"]["exception"]
        except (ValueError, KeyError):
            # body is not the expected JSON; fall through to the generic check
            pass
        else:
            translations = (
                (["IllegalArgumentException", "UnsupportedOperationException"], ValueError),
                (["SecurityException", "AccessControlException"], PermissionError),
                (["FileNotFoundException"], FileNotFoundError),
            )
            for remote_names, local_error in translations:
                if exp in remote_names:
                    raise local_error(msg)
            raise RuntimeError(msg)
    response.raise_for_status()
    return response
199
+
200
def _open(
    self,
    path,
    mode="rb",
    block_size=None,
    autocommit=True,
    replication=None,
    permissions=None,
    **kwargs,
):
    """

    Parameters
    ----------
    path: str
        File location
    mode: str
        'rb', 'wb', etc.
    block_size: int
        Client buffer size for read-ahead or write buffer
    autocommit: bool
        If False, writes to temporary file that only gets put in final
        location upon commit
    replication: int
        Number of copies of file on the cluster, write mode only
    permissions: str or int
        posix permissions, write mode only
    kwargs

    Returns
    -------
    WebHDFile instance
    """
    # a falsy block_size (None or 0) selects the filesystem default
    if not block_size:
        block_size = self.blocksize
    file_options = {
        "mode": mode,
        "block_size": block_size,
        "tempdir": self.tempdir,
        "autocommit": autocommit,
        "replication": replication,
        "permissions": permissions,
    }
    return WebHDFile(self, path, **file_options)
244
+
245
+ @staticmethod
246
def _process_info(info):
    """Normalise a status dict in place and return it.

    ``type`` is lower-cased and ``size`` is set from ``length``.
    """
    info.update(type=info["type"].lower(), size=info["length"])
    return info
250
+
251
+ @classmethod
252
def _strip_protocol(cls, path):
    """Return only the path component of ``path``."""
    parsed = infer_storage_options(path)
    return parsed["path"]
254
+
255
+ @staticmethod
256
def _get_kwargs_from_urls(urlpath):
    """Return the options parsed from ``urlpath`` minus path and protocol.

    A ``username`` entry, when present, is renamed to ``user``.
    """
    options = infer_storage_options(urlpath)
    for unwanted in ("path", "protocol"):
        options.pop(unwanted, None)
    if "username" in options:
        options["user"] = options.pop("username")
    return options
263
+
264
def info(self, path):
    """Return the processed ``FileStatus`` record for ``path``.

    The record's ``name`` is set to ``path`` as given.
    """
    response = self._call("GETFILESTATUS", path=path)
    status = response.json()["FileStatus"]
    status["name"] = path
    return self._process_info(status)
269
+
270
def ls(self, path, detail=False):
    """List ``path``, sorted by name.

    Returns processed status dicts when ``detail`` is true, otherwise just
    the names. Each name is ``path`` (trailing slashes removed) joined to
    the entry's ``pathSuffix`` with ``/``.
    """
    response = self._call("LISTSTATUS", path=path)
    entries = response.json()["FileStatuses"]["FileStatus"]
    for entry in entries:
        self._process_info(entry)
        entry["name"] = path.rstrip("/") + "/" + entry["pathSuffix"]
    if not detail:
        return sorted(entry["name"] for entry in entries)
    return sorted(entries, key=lambda entry: entry["name"])
280
+
281
def content_summary(self, path):
    """Total numbers of files, directories and bytes under path"""
    response = self._call("GETCONTENTSUMMARY", path=path)
    summary = response.json()
    return summary["ContentSummary"]
285
+
286
def ukey(self, path):
    """Checksum info of file, giving method and result

    The first request is made without following redirects. If it carries a
    ``Location`` header, that URL (after ``_apply_proxy``) is fetched and
    its ``FileChecksum`` returned; otherwise the first response is used.
    """
    first = self._call("GETFILECHECKSUM", path=path, redirect=False)
    if "Location" not in first.headers:
        first.raise_for_status()
        return first.json()["FileChecksum"]
    redirected = self.session.get(self._apply_proxy(first.headers["Location"]))
    redirected.raise_for_status()
    return redirected.json()["FileChecksum"]
297
+
298
def home_directory(self):
    """Get user's home directory"""
    response = self._call("GETHOMEDIRECTORY")
    body = response.json()
    return body["Path"]
302
+
303
def get_delegation_token(self, renewer=None):
    """Retrieve token which can give the same authority to other uses

    Parameters
    ----------
    renewer: str or None
        User who may use this token; if None, will be current user

    Raises
    ------
    ValueError
        If the server response carries no token
    """
    # only send the renewer parameter when one was actually supplied
    extra = {"renewer": renewer} if renewer else {}
    response = self._call("GETDELEGATIONTOKEN", **extra)
    token = response.json()["Token"]
    if token is None:
        raise ValueError("No token available for this user/security context")
    return token["urlString"]
319
+
320
+ def renew_delegation_token(self, token):
321
+ """Make token live longer. Returns new expiry time"""
322
+ out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
323
+ return out.json()["long"]
324
+
325
+ def cancel_delegation_token(self, token):
326
+ """Stop the token from being useful"""
327
+ self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
328
+
329
def chmod(self, path, mod):
    """Set the permission at path

    Parameters
    ----------
    path: str
        location to set (file or directory)
    mod: str or int
        POSIX representation of the permission, given as an octal string,
        e.g. '777', or as the equivalent integer, e.g. 0o777
    """
    if isinstance(mod, int):
        # An int would otherwise be sent in decimal form (0o777 -> "511"),
        # which does not match the octal string form documented above.
        mod = format(mod, "o")
    self._call("SETPERMISSION", method="put", path=path, permission=mod)
341
+
342
def chown(self, path, owner=None, group=None):
    """Change owning user and/or group

    Only the values that are not None are forwarded with the SETOWNER call.
    """
    params = {
        key: value
        for key, value in (("owner", owner), ("group", group))
        if value is not None
    }
    self._call("SETOWNER", method="put", path=path, **params)
350
+
351
+ def set_replication(self, path, replication):
352
+ """
353
+ Set file replication factor
354
+
355
+ Parameters
356
+ ----------
357
+ path: str
358
+ File location (not for directories)
359
+ replication: int
360
+ Number of copies of file on the cluster. Should be smaller than
361
+ number of data nodes; normally 3 on most systems.
362
+ """
363
+ self._call("SETREPLICATION", path=path, method="put", replication=replication)
364
+
365
+ def mkdir(self, path, **kwargs):
366
+ self._call("MKDIRS", method="put", path=path)
367
+
368
+ def makedirs(self, path, exist_ok=False):
369
+ if exist_ok is False and self.exists(path):
370
+ raise FileExistsError(path)
371
+ self.mkdir(path)
372
+
373
+ def mv(self, path1, path2, **kwargs):
374
+ self._call("RENAME", method="put", path=path1, destination=path2)
375
+
376
+ def rm(self, path, recursive=False, **kwargs):
377
+ self._call(
378
+ "DELETE",
379
+ method="delete",
380
+ path=path,
381
+ recursive="true" if recursive else "false",
382
+ )
383
+
384
+ def rm_file(self, path, **kwargs):
385
+ self.rm(path)
386
+
387
+ def cp_file(self, lpath, rpath, **kwargs):
388
+ with self.open(lpath) as lstream:
389
+ tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"])
390
+ # Perform an atomic copy (stream to a temporary file and
391
+ # move it to the actual destination).
392
+ try:
393
+ with self.open(tmp_fname, "wb") as rstream:
394
+ shutil.copyfileobj(lstream, rstream)
395
+ self.mv(tmp_fname, rpath)
396
+ except BaseException: # noqa
397
+ with suppress(FileNotFoundError):
398
+ self.rm(tmp_fname)
399
+ raise
400
+
401
+ def _apply_proxy(self, location):
402
+ if self.proxy and callable(self.proxy):
403
+ location = self.proxy(location)
404
+ elif self.proxy:
405
+ # as a dict
406
+ for k, v in self.proxy.items():
407
+ location = location.replace(k, v, 1)
408
+ return location
409
+
410
+
411
class WebHDFile(AbstractBufferedFile):
    """A file living in HDFS over webHDFS"""

    def __init__(self, fs, path, **kwargs):
        """Set up the buffered file.

        ``tempdir`` must be present in ``kwargs``. Unless ``autocommit`` is
        passed as something other than False, writes go to a uniquely named
        path inside ``tempdir`` and the requested path is kept in
        ``self.target`` for ``commit``.
        """
        super().__init__(fs, path, **kwargs)
        kwargs = kwargs.copy()
        # Drop explicit None values so the defaults below apply.
        if kwargs.get("permissions", None) is None:
            kwargs.pop("permissions", None)
        if kwargs.get("replication", None) is None:
            kwargs.pop("replication", None)
        self.permissions = kwargs.pop("permissions", 511)
        tempdir = kwargs.pop("tempdir")
        if kwargs.pop("autocommit", False) is False:
            self.target = self.path
            self.path = os.path.join(tempdir, str(uuid.uuid4()))

    def _upload_chunk(self, final=False):
        """Write one part of a multi-block file upload

        Parameters
        ----------
        final: bool
            This is the last block, so should complete file, if
            self.autocommit is True.
        """
        out = self.fs.session.post(
            self.location,
            data=self.buffer.getvalue(),
            headers={"content-type": "application/octet-stream"},
        )
        out.raise_for_status()
        return True

    def _initiate_upload(self):
        """Create remote file/upload

        Stores in ``self.location`` the URL that ``_upload_chunk`` posts to.
        """
        kwargs = self.kwargs.copy()
        if "a" in self.mode:
            op, method = "APPEND", "POST"
        else:
            op, method = "CREATE", "PUT"
            kwargs["overwrite"] = "true"
        out = self.fs._call(op, method, self.path, redirect=False, **kwargs)
        location = self.fs._apply_proxy(out.headers["Location"])
        if "w" in self.mode:
            # create empty file to append to
            out2 = self.fs.session.put(
                location, headers={"content-type": "application/octet-stream"}
            )
            out2.raise_for_status()
            # after creating empty file, change location to append to
            out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs)
            self.location = self.fs._apply_proxy(out2.headers["Location"])
        elif "a" in self.mode:
            # The first call was already an APPEND, so its redirect target
            # is the upload location; without this, _upload_chunk would
            # find no ``self.location`` in append mode.
            self.location = location

    def _fetch_range(self, start, end):
        """Return the bytes in ``[start, end)``, clipped to the file size."""
        start = max(start, 0)
        end = min(self.size, end)
        if start >= end or start >= self.size:
            return b""
        out = self.fs._call(
            "OPEN", path=self.path, offset=start, length=end - start, redirect=False
        )
        out.raise_for_status()
        if "Location" in out.headers:
            location = out.headers["Location"]
            out2 = self.fs.session.get(self.fs._apply_proxy(location))
            # Fail loudly rather than returning an error page as file data.
            out2.raise_for_status()
            return out2.content
        else:
            return out.content

    def commit(self):
        """Move the temporary upload to its final target path."""
        self.fs.mv(self.path, self.target)

    def discard(self):
        """Remove the temporary upload."""
        self.fs.rm(self.path)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/zip.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import zipfile
2
+
3
+ import fsspec
4
+ from fsspec.archive import AbstractArchiveFileSystem
5
+
6
+
7
class ZipFileSystem(AbstractArchiveFileSystem):
    """Read/Write contents of ZIP archive as a file-system

    Keeps file object open while instance lives.

    This class is pickleable, but not necessarily thread-safe
    """

    root_marker = ""
    protocol = "zip"
    cachable = False

    def __init__(
        self,
        fo="",
        mode="r",
        target_protocol=None,
        target_options=None,
        compression=zipfile.ZIP_STORED,
        allowZip64=True,
        compresslevel=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        fo: str or file-like
            Contains ZIP, and must exist. If a str, will fetch file using
            :meth:`~fsspec.open_files`, which must return one file exactly.
        mode: str
            Accept: "r", "w", "a"
        target_protocol: str (optional)
            If ``fo`` is a string, this value can be used to override the
            FS protocol inferred from a URL
        target_options: dict (optional)
            Kwargs passed when instantiating the target FS, if ``fo`` is
            a string.
        compression, allowZip64, compresslevel: passed to ZipFile
            Only relevant when creating a ZIP
        """
        # ``self`` is already bound by super(); do not pass it a second time.
        super().__init__(**kwargs)
        if mode not in set("rwa"):
            raise ValueError(f"mode '{mode}' not understood")
        self.mode = mode
        if isinstance(fo, str):
            if mode == "a":
                m = "r+b"
            else:
                m = mode + "b"
            fo = fsspec.open(
                fo, mode=m, protocol=target_protocol, **(target_options or {})
            )
        self.force_zip_64 = allowZip64
        self.of = fo
        self.fo = fo.__enter__()  # the whole instance is a context
        self.zip = zipfile.ZipFile(
            self.fo,
            mode=mode,
            compression=compression,
            allowZip64=allowZip64,
            compresslevel=compresslevel,
        )
        self.dir_cache = None

    @classmethod
    def _strip_protocol(cls, path):
        # zip file paths are always relative to the archive root
        return super()._strip_protocol(path).lstrip("/")

    def __del__(self):
        # ``zip`` is missing when __init__ failed before creating it.
        if hasattr(self, "zip"):
            self.close()
            del self.zip

    def close(self):
        """Commits any write changes to the file. Done on ``del`` too."""
        self.zip.close()

    def _get_dirs(self):
        """Populate ``self.dir_cache`` with one entry per archive member
        and per directory name derived from the member names."""
        if self.dir_cache is None or self.mode in set("wa"):
            # when writing, dir_cache is always in the ZipFile's attributes,
            # not read from the file.
            files = self.zip.infolist()
            self.dir_cache = {
                dirname.rstrip("/"): {
                    "name": dirname.rstrip("/"),
                    "size": 0,
                    "type": "directory",
                }
                for dirname in self._all_dirnames(self.zip.namelist())
            }
            for z in files:
                f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__}
                f.update(
                    {
                        "name": z.filename.rstrip("/"),
                        "size": z.file_size,
                        "type": ("directory" if z.is_dir() else "file"),
                    }
                )
                self.dir_cache[f["name"]] = f

    def pipe_file(self, path, value, **kwargs):
        # override upstream, because we know the exact file size in this case
        self.zip.writestr(path, value, **kwargs)

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Open one archive member via ``ZipFile.open``.

        Raises
        ------
        OSError
            When the requested mode conflicts with the mode the archive
            itself was opened in.
        FileNotFoundError
            When reading is requested on a writable archive and the member
            does not exist.
        """
        path = self._strip_protocol(path)
        if "r" in mode and self.mode in set("wa"):
            if self.exists(path):
                raise OSError("ZipFS can only be open for reading or writing, not both")
            raise FileNotFoundError(path)
        if "r" in self.mode and "w" in mode:
            raise OSError("ZipFS can only be open for reading or writing, not both")
        out = self.zip.open(path, mode.strip("b"), force_zip64=self.force_zip_64)
        if "r" in mode:
            # Attach the size and name taken from the member's info entry.
            info = self.info(path)
            out.size = info["size"]
            out.name = info["name"]
        return out
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/conftest.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import contextlib
2
+ import gzip
3
+ import json
4
+ import os
5
+ import threading
6
+ from collections import ChainMap
7
+ from http.server import BaseHTTPRequestHandler, HTTPServer
8
+
9
+ import pytest
10
+
11
requests = pytest.importorskip("requests")
port = 9898
data = b"\n".join([b"some test data"] * 1000)
realfile = f"http://127.0.0.1:{port}/index/realfile"
index = b'<a href="%s">Link</a>' % realfile.encode()
# Read the canned listing page once at import time; the context manager
# closes the handle instead of leaving it to the garbage collector.
with open(
    os.path.join(os.path.dirname(__file__), "data", "listing.html"), "rb"
) as _listing_file:
    listing = _listing_file.read()
win = os.name == "nt"
20
+
21
+
22
+ def _make_listing(*paths):
23
+ return "\n".join(
24
+ f'<a href="http://127.0.0.1:{port}{f}">Link_{i}</a>'
25
+ for i, f in enumerate(paths)
26
+ ).encode()
27
+
28
+
29
+ @pytest.fixture
30
+ def reset_files():
31
+ yield
32
+
33
+ # Reset the newly added files after the
34
+ # test is completed.
35
+ HTTPTestHandler.dynamic_files.clear()
36
+
37
+
38
class HTTPTestHandler(BaseHTTPRequestHandler):
    """Request handler for the test server.

    Serves the byte payloads in ``files``; request headers such as
    ``give_length`` or ``use_206`` select which response headers and status
    are sent back.
    """

    static_files = {
        "/index/realfile": data,
        "/index/otherfile": data,
        "/index": index,
        "/data/20020401": listing,
        "/simple/": _make_listing("/simple/file", "/simple/dir/"),
        "/simple/file": data,
        "/simple/dir/": _make_listing("/simple/dir/file"),
        "/simple/dir/file": data,
    }
    dynamic_files = {}

    # Lookups fall through to static_files; writes land in dynamic_files,
    # the first mapping of the ChainMap.
    files = ChainMap(dynamic_files, static_files)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _respond(self, code=200, headers=None, data=b""):
        """Send status ``code``, the given headers plus ``User-Agent``, and
        ``data`` as the body when it is non-empty."""
        # Build a new dict so a caller-supplied mapping is never mutated.
        headers = {**(headers or {}), "User-Agent": "test"}
        self.send_response(code)
        for k, v in headers.items():
            self.send_header(k, str(v))
        self.end_headers()
        if data:
            self.wfile.write(data)

    def do_GET(self):
        file_path = self.path
        if file_path.endswith("/") and file_path.rstrip("/") in self.files:
            file_path = file_path.rstrip("/")
        file_data = self.files.get(file_path)
        if "give_path" in self.headers:
            return self._respond(200, data=json.dumps({"path": self.path}).encode())
        if "redirect" in self.headers and file_path != "/index/realfile":
            new_url = f"http://127.0.0.1:{port}/index/realfile"
            return self._respond(301, {"Location": new_url})
        if file_data is None:
            return self._respond(404)

        status = 200
        content_range = f"bytes 0-{len(file_data) - 1}/{len(file_data)}"
        if ("Range" in self.headers) and ("ignore_range" not in self.headers):
            ran = self.headers["Range"]
            _, ran = ran.split("=")
            start, end = ran.split("-")
            if start:
                content_range = f"bytes {start}-{end}/{len(file_data)}"
                file_data = file_data[int(start) : (int(end) + 1) if end else None]
            else:
                # suffix only
                total = len(file_data)
                content_range = f"bytes {total - int(end)}-{total - 1}/{total}"
                file_data = file_data[-int(end) :]
            if "use_206" in self.headers:
                status = 206
        if "give_length" in self.headers:
            if "gzip_encoding" in self.headers:
                file_data = gzip.compress(file_data)
                response_headers = {
                    "Content-Length": len(file_data),
                    "Content-Encoding": "gzip",
                }
            else:
                response_headers = {"Content-Length": len(file_data)}
            self._respond(status, response_headers, file_data)
        elif "give_range" in self.headers:
            self._respond(status, {"Content-Range": content_range}, file_data)
        elif "give_mimetype" in self.headers:
            self._respond(
                status, {"Content-Type": "text/html; charset=utf-8"}, file_data
            )
        else:
            self._respond(status, data=file_data)

    def do_POST(self):
        """Store the request body under the request path and answer 200."""
        length = self.headers.get("Content-Length")
        file_path = self.path.rstrip("/")
        if length is None:
            assert self.headers.get("Transfer-Encoding") == "chunked"
            self.files[file_path] = b"".join(self.read_chunks())
        else:
            # Header values are strings; read() needs an integer byte count.
            self.files[file_path] = self.rfile.read(int(length))
        self._respond(200)

    do_PUT = do_POST

    def read_chunks(self):
        """Yield the chunks of a chunked request body until a zero-length
        chunk (or an empty size line) is reached."""
        length = -1
        while length != 0:
            line = self.rfile.readline().strip()
            if len(line) == 0:
                length = 0
            else:
                length = int(line, 16)
            yield self.rfile.read(length)
            # Consume the line terminator that follows each chunk.
            self.rfile.readline()

    def do_HEAD(self):
        if "head_not_auth" in self.headers:
            return self._respond(
                403, {"Content-Length": 123}, b"not authorized for HEAD request"
            )
        elif "head_ok" not in self.headers:
            return self._respond(405)

        file_path = self.path.rstrip("/")
        file_data = self.files.get(file_path)
        if file_data is None:
            return self._respond(404)

        if ("give_length" in self.headers) or ("head_give_length" in self.headers):
            response_headers = {"Content-Length": len(file_data)}
            if "zero_length" in self.headers:
                response_headers["Content-Length"] = 0
            elif "gzip_encoding" in self.headers:
                file_data = gzip.compress(file_data)
                response_headers["Content-Encoding"] = "gzip"
                response_headers["Content-Length"] = len(file_data)

            self._respond(200, response_headers)
        elif "give_range" in self.headers:
            self._respond(
                200, {"Content-Range": f"0-{len(file_data) - 1}/{len(file_data)}"}
            )
        elif "give_etag" in self.headers:
            self._respond(200, {"ETag": "xxx"})
        else:
            self._respond(200)  # OK response, but no useful info
168
+
169
+
170
@contextlib.contextmanager
def serve():
    """Run the test HTTP server in a daemon thread for the duration of the
    ``with`` block, yielding its base URL."""
    server_address = ("", port)
    httpd = HTTPServer(server_address, HTTPTestHandler)
    th = threading.Thread(target=httpd.serve_forever)
    th.daemon = True
    th.start()
    try:
        yield f"http://127.0.0.1:{port}"
    finally:
        # Stop the serving loop first, then release the listening socket,
        # so the loop never polls a socket that has already been closed.
        httpd.shutdown()
        th.join()
        httpd.server_close()
183
+
184
+
185
+ @pytest.fixture(scope="module")
186
+ def server():
187
+ with serve() as s:
188
+ yield s
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_api.py ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests the spec, using memoryfs"""
2
+
3
+ import contextlib
4
+ import os
5
+ import pickle
6
+ import tempfile
7
+ from unittest.mock import Mock
8
+
9
+ import pytest
10
+
11
+ import fsspec
12
+ from fsspec.implementations.memory import MemoryFile, MemoryFileSystem
13
+
14
+
15
+ def test_idempotent():
16
+ MemoryFileSystem.clear_instance_cache()
17
+ fs = MemoryFileSystem()
18
+ fs2 = MemoryFileSystem()
19
+ assert fs is fs2
20
+ assert MemoryFileSystem.current() is fs2
21
+
22
+ MemoryFileSystem.clear_instance_cache()
23
+ assert not MemoryFileSystem._cache
24
+
25
+ fs2 = MemoryFileSystem().current()
26
+ assert fs == fs2
27
+
28
+
29
+ def test_pickle():
30
+ fs = MemoryFileSystem()
31
+ fs2 = pickle.loads(pickle.dumps(fs))
32
+ assert fs == fs2
33
+
34
+
35
+ def test_class_methods():
36
+ assert MemoryFileSystem._strip_protocol("memory://stuff") == "/stuff"
37
+ assert MemoryFileSystem._strip_protocol("stuff") == "/stuff"
38
+ assert MemoryFileSystem._strip_protocol("other://stuff") == "other://stuff"
39
+
40
+ assert MemoryFileSystem._get_kwargs_from_urls("memory://user@thing") == {}
41
+
42
+
43
+ def test_multi(m):
44
+ m.pipe("/afile", b"data")
45
+ fs, token, paths = fsspec.core.get_fs_token_paths(["/afile", "/afile"])
46
+ assert len(paths) == 2
47
+
48
+
49
+ def test_get_put(tmpdir, m):
50
+ tmpdir = str(tmpdir)
51
+ fn = os.path.join(tmpdir, "one")
52
+ open(fn, "wb").write(b"one")
53
+ os.mkdir(os.path.join(tmpdir, "dir"))
54
+ fn2 = os.path.join(tmpdir, "dir", "two")
55
+ open(fn2, "wb").write(b"two")
56
+
57
+ fs = MemoryFileSystem()
58
+ fs.put(fn, "/afile")
59
+ assert fs.cat("/afile") == b"one"
60
+
61
+ fs.store["/bfile"] = MemoryFile(fs, "/bfile", b"data")
62
+ fn3 = os.path.join(tmpdir, "three")
63
+ fs.get("/bfile", fn3)
64
+ assert open(fn3, "rb").read() == b"data"
65
+
66
+ fs.put(tmpdir, "/more", recursive=True)
67
+ assert fs.find("/more") == ["/more/dir/two", "/more/one", "/more/three"]
68
+
69
+ @contextlib.contextmanager
70
+ def tmp_chdir(path):
71
+ curdir = os.getcwd()
72
+ os.chdir(path)
73
+ try:
74
+ yield
75
+ finally:
76
+ os.chdir(curdir)
77
+
78
+ with tmp_chdir(os.path.join(tmpdir, os.path.pardir)):
79
+ fs.put(os.path.basename(tmpdir), "/moretwo", recursive=True)
80
+ assert fs.find("/moretwo") == [
81
+ "/moretwo/dir/two",
82
+ "/moretwo/one",
83
+ "/moretwo/three",
84
+ ]
85
+
86
+ with tmp_chdir(tmpdir):
87
+ fs.put(os.path.curdir, "/morethree", recursive=True)
88
+ assert fs.find("/morethree") == [
89
+ "/morethree/dir/two",
90
+ "/morethree/one",
91
+ "/morethree/three",
92
+ ]
93
+
94
+ for f in [fn, fn2, fn3]:
95
+ os.remove(f)
96
+ os.rmdir(os.path.join(tmpdir, "dir"))
97
+
98
+ fs.get("/more/", tmpdir + "/", recursive=True)
99
+ assert open(fn3, "rb").read() == b"data"
100
+ assert open(fn, "rb").read() == b"one"
101
+
102
+
103
+ def test_du(m):
104
+ fs = MemoryFileSystem()
105
+ fs.store.update(
106
+ {
107
+ "/dir/afile": MemoryFile(fs, "/afile", b"a"),
108
+ "/dir/dirb/afile": MemoryFile(fs, "/afile", b"bb"),
109
+ "/dir/dirb/bfile": MemoryFile(fs, "/afile", b"ccc"),
110
+ }
111
+ )
112
+ assert fs.du("/dir") == 6
113
+ assert fs.du("/dir", total=False) == {
114
+ "/dir/afile": 1,
115
+ "/dir/dirb/afile": 2,
116
+ "/dir/dirb/bfile": 3,
117
+ }
118
+ assert fs.du("/dir", withdirs=True) == 6
119
+ assert fs.du("/dir", total=False, withdirs=True) == {
120
+ "/dir": 0,
121
+ "/dir/afile": 1,
122
+ "/dir/dirb": 0,
123
+ "/dir/dirb/afile": 2,
124
+ "/dir/dirb/bfile": 3,
125
+ }
126
+ with pytest.raises(ValueError):
127
+ assert fs.du("/dir", maxdepth=0) == 1
128
+ assert fs.du("/dir", total=False, withdirs=True, maxdepth=1) == {
129
+ "/dir": 0,
130
+ "/dir/afile": 1,
131
+ "/dir/dirb": 0,
132
+ }
133
+
134
+ # Size of file only.
135
+ assert fs.du("/dir/afile") == 1
136
+ assert fs.du("/dir/afile", withdirs=True) == 1
137
+
138
+
139
+ def test_head_tail(m):
140
+ fs = MemoryFileSystem()
141
+ with fs.open("/myfile", "wb") as f:
142
+ f.write(b"I had a nice big cabbage")
143
+ assert fs.head("/myfile", 5) == b"I had"
144
+ assert fs.tail("/myfile", 7) == b"cabbage"
145
+
146
+
147
+ def test_move(m):
148
+ fs = MemoryFileSystem()
149
+ with fs.open("/myfile", "wb") as f:
150
+ f.write(b"I had a nice big cabbage")
151
+ fs.move("/myfile", "/otherfile")
152
+ assert not fs.exists("/myfile")
153
+ assert fs.info("/otherfile")
154
+ assert isinstance(fs.ukey("/otherfile"), str)
155
+
156
+
157
def test_recursive_get_put(tmpdir, m):
    """Round-trip a small local tree through the memory filesystem and
    check the content of every file fetched back."""
    fs = MemoryFileSystem()
    os.makedirs(f"{tmpdir}/nest")
    for file in ["one", "two", "nest/other"]:
        with open(f"{tmpdir}/{file}", "wb") as f:
            f.write(b"data")

    fs.put(str(tmpdir), "test", recursive=True)

    # get to directory with slash
    d = tempfile.mkdtemp()
    fs.get("test/", d, recursive=True)
    for file in ["one", "two", "nest/other"]:
        with open(f"{d}/{file}", "rb") as f:
            # A bare comparison here would be discarded; assert it.
            assert f.read() == b"data"

    # get to directory without slash
    d = tempfile.mkdtemp()
    fs.get("test", d, recursive=True)
    for file in ["test/one", "test/two", "test/nest/other"]:
        with open(f"{d}/{file}", "rb") as f:
            assert f.read() == b"data"
179
+
180
+
181
+ def test_pipe_cat(m):
182
+ fs = MemoryFileSystem()
183
+ fs.pipe("afile", b"contents")
184
+ assert fs.cat("afile") == b"contents"
185
+
186
+ data = {"/bfile": b"more", "/cfile": b"stuff"}
187
+ fs.pipe(data)
188
+ assert fs.cat(list(data)) == data
189
+
190
+
191
+ def test_read_block_delimiter(m):
192
+ fs = MemoryFileSystem()
193
+ with fs.open("/myfile", "wb") as f:
194
+ f.write(b"some\nlines\nof\ntext")
195
+ assert fs.read_block("/myfile", 0, 2, b"\n") == b"some\n"
196
+ assert fs.read_block("/myfile", 2, 6, b"\n") == b"lines\n"
197
+ assert fs.read_block("/myfile", 6, 2, b"\n") == b""
198
+ assert fs.read_block("/myfile", 2, 9, b"\n") == b"lines\nof\n"
199
+ assert fs.read_block("/myfile", 12, 6, b"\n") == b"text"
200
+ assert fs.read_block("/myfile", 0, None) == fs.cat("/myfile")
201
+
202
+
203
+ def test_open_text(m):
204
+ fs = MemoryFileSystem()
205
+ with fs.open("/myfile", "wb") as f:
206
+ f.write(b"some\nlines\nof\ntext")
207
+ f = fs.open("/myfile", "r", encoding="latin1")
208
+ assert f.encoding == "latin1"
209
+
210
+
211
+ def test_read_text(m):
212
+ with m.open("/myfile", "w", encoding="utf-8") as f:
213
+ f.write("some\nlines\nof\ntext")
214
+ assert m.read_text("/myfile", encoding="utf-8") == "some\nlines\nof\ntext"
215
+
216
+
217
+ def test_write_text(m):
218
+ m.write_text("/myfile", "some\nlines\nof\ntext", encoding="utf-8")
219
+ assert m.read_text("/myfile", encoding="utf-8") == "some\nlines\nof\ntext"
220
+
221
+
222
+ def test_chained_fs():
223
+ d1 = tempfile.mkdtemp()
224
+ d2 = tempfile.mkdtemp()
225
+ f1 = os.path.join(d1, "f1")
226
+ with open(f1, "wb") as f:
227
+ f.write(b"test")
228
+
229
+ of = fsspec.open(
230
+ f"simplecache::file://{f1}",
231
+ simplecache={"cache_storage": d2, "same_names": True},
232
+ )
233
+ with of as f:
234
+ assert f.read() == b"test"
235
+
236
+ assert os.listdir(d2) == ["f1"]
237
+
238
+
239
+ @pytest.mark.xfail(reason="see issue #334", strict=True)
240
+ def test_multilevel_chained_fs():
241
+ """This test reproduces fsspec/filesystem_spec#334"""
242
+ import zipfile
243
+
244
+ d1 = tempfile.mkdtemp()
245
+ f1 = os.path.join(d1, "f1.zip")
246
+ with zipfile.ZipFile(f1, mode="w") as z:
247
+ # filename, content
248
+ z.writestr("foo.txt", "foo.txt")
249
+ z.writestr("bar.txt", "bar.txt")
250
+
251
+ # We expected this to be the correct syntax
252
+ with pytest.raises(IsADirectoryError):
253
+ of = fsspec.open_files(f"zip://*.txt::simplecache::file://{f1}")
254
+ assert len(of) == 2
255
+
256
+ # But this is what is actually valid...
257
+ of = fsspec.open_files(f"zip://*.txt::simplecache://{f1}::file://")
258
+
259
+ assert len(of) == 2
260
+ for open_file in of:
261
+ with open_file as f:
262
+ assert f.read().decode("utf-8") == f.name
263
+
264
+
265
+ def test_multilevel_chained_fs_zip_zip_file():
266
+ """This test reproduces fsspec/filesystem_spec#334"""
267
+ import zipfile
268
+
269
+ d1 = tempfile.mkdtemp()
270
+ f1 = os.path.join(d1, "f1.zip")
271
+ f2 = os.path.join(d1, "f2.zip")
272
+ with zipfile.ZipFile(f1, mode="w") as z:
273
+ # filename, content
274
+ z.writestr("foo.txt", "foo.txt")
275
+ z.writestr("bar.txt", "bar.txt")
276
+
277
+ with zipfile.ZipFile(f2, mode="w") as z:
278
+ with open(f1, "rb") as f:
279
+ z.writestr("f1.zip", f.read())
280
+
281
+ # We expected this to be the correct syntax
282
+ of = fsspec.open_files(f"zip://*.txt::zip://f1.zip::file://{f2}")
283
+
284
+ assert len(of) == 2
285
+ for open_file in of:
286
+ with open_file as f:
287
+ assert f.read().decode("utf-8") == f.name
288
+
289
+
290
+ def test_chained_equivalent():
291
+ d1 = tempfile.mkdtemp()
292
+ d2 = tempfile.mkdtemp()
293
+ f1 = os.path.join(d1, "f1")
294
+ with open(f1, "wb") as f:
295
+ f.write(b"test1")
296
+
297
+ of = fsspec.open(
298
+ f"simplecache::file://{f1}",
299
+ simplecache={"cache_storage": d2, "same_names": True},
300
+ )
301
+ of2 = fsspec.open(
302
+ f"simplecache://{f1}",
303
+ cache_storage=d2,
304
+ same_names=True,
305
+ target_protocol="file",
306
+ target_options={},
307
+ )
308
+ # the following line passes by fluke - they are not quite the same instance,
309
+ # since the parameters don't quite match. Also, the url understood by the two
310
+ # of s are not the same (path gets munged a bit differently)
311
+ assert of.fs == of2.fs
312
+ assert hash(of.fs) == hash(of2.fs)
313
+ assert of.open().read() == of2.open().read()
314
+
315
+
316
+ def test_chained_fs_multi():
317
+ d1 = tempfile.mkdtemp()
318
+ d2 = tempfile.mkdtemp()
319
+ f1 = os.path.join(d1, "f1")
320
+ f2 = os.path.join(d1, "f2")
321
+ with open(f1, "wb") as f:
322
+ f.write(b"test1")
323
+ with open(f2, "wb") as f:
324
+ f.write(b"test2")
325
+
326
+ of = fsspec.open_files(
327
+ f"simplecache::file://{d1}/*",
328
+ simplecache={"cache_storage": d2, "same_names": True},
329
+ )
330
+ with of[0] as f:
331
+ assert f.read() == b"test1"
332
+ with of[1] as f:
333
+ assert f.read() == b"test2"
334
+
335
+ assert sorted(os.listdir(d2)) == ["f1", "f2"]
336
+
337
+ d2 = tempfile.mkdtemp()
338
+
339
+ of = fsspec.open_files(
340
+ [f"simplecache::file://{f1}", f"simplecache::file://{f2}"],
341
+ simplecache={"cache_storage": d2, "same_names": True},
342
+ )
343
+ with of[0] as f:
344
+ assert f.read() == b"test1"
345
+ with of[1] as f:
346
+ assert f.read() == b"test2"
347
+
348
+ assert sorted(os.listdir(d2)) == ["f1", "f2"]
349
+
350
+
351
+ def test_chained_fo():
352
+ import zipfile
353
+
354
+ d1 = tempfile.mkdtemp()
355
+ f1 = os.path.join(d1, "temp.zip")
356
+ d3 = tempfile.mkdtemp()
357
+ with zipfile.ZipFile(f1, mode="w") as z:
358
+ z.writestr("afile", b"test")
359
+
360
+ of = fsspec.open(f"zip://afile::file://{f1}")
361
+ with of as f:
362
+ assert f.read() == b"test"
363
+
364
+ of = fsspec.open_files(f"zip://*::file://{f1}")
365
+ with of[0] as f:
366
+ assert f.read() == b"test"
367
+
368
+ of = fsspec.open_files(
369
+ f"simplecache::zip://*::file://{f1}",
370
+ simplecache={"cache_storage": d3, "same_names": True},
371
+ )
372
+ with of[0] as f:
373
+ assert f.read() == b"test"
374
+ assert "afile" in os.listdir(d3)
375
+
376
+
377
+ def test_url_to_fs():
378
+ url = "memory://a.txt"
379
+ fs, url2 = fsspec.core.url_to_fs(url)
380
+
381
+ assert isinstance(fs, MemoryFileSystem)
382
+ assert url2 == "/a.txt"
383
+
384
+
385
+ def test_walk(m):
386
+ # depth = 0
387
+ dir1 = "/dir1"
388
+ # depth = 1 (2 dirs, 1 file)
389
+ dir11 = dir1 + "/dir11"
390
+ dir12 = dir1 + "/dir12"
391
+ file11 = dir1 + "/file11"
392
+ # depth = 2
393
+ dir111 = dir11 + "/dir111"
394
+ file111 = dir11 + "/file111"
395
+ file121 = dir12 + "/file121"
396
+ # depth = 3
397
+ file1111 = dir111 + "/file1111"
398
+
399
+ m.mkdir(dir111) # Creates parents too
400
+ m.mkdir(dir12) # Creates parents too
401
+ m.touch(file11)
402
+ m.touch(file111)
403
+ m.touch(file121)
404
+ m.touch(file1111)
405
+
406
+ # No maxdepth
407
+ assert list(m.walk(dir1, topdown=True)) == [
408
+ (dir1, ["dir11", "dir12"], ["file11"]),
409
+ (dir11, ["dir111"], ["file111"]),
410
+ (dir111, [], ["file1111"]),
411
+ (dir12, [], ["file121"]),
412
+ ]
413
+ assert list(m.walk(dir1, topdown=False)) == [
414
+ (dir111, [], ["file1111"]),
415
+ (dir11, ["dir111"], ["file111"]),
416
+ (dir12, [], ["file121"]),
417
+ (dir1, ["dir11", "dir12"], ["file11"]),
418
+ ]
419
+
420
+ # maxdepth=2
421
+ assert list(m.walk(dir1, maxdepth=2, topdown=True)) == [
422
+ (dir1, ["dir11", "dir12"], ["file11"]),
423
+ (dir11, ["dir111"], ["file111"]),
424
+ (dir12, [], ["file121"]),
425
+ ]
426
+ assert list(m.walk(dir1, maxdepth=2, topdown=False)) == [
427
+ (dir11, ["dir111"], ["file111"]),
428
+ (dir12, [], ["file121"]),
429
+ (dir1, ["dir11", "dir12"], ["file11"]),
430
+ ]
431
+
432
+ # maxdepth=1
433
+ assert list(m.walk(dir1, maxdepth=1, topdown=True)) == [
434
+ (dir1, ["dir11", "dir12"], ["file11"]),
435
+ ]
436
+ assert list(m.walk(dir1, maxdepth=1, topdown=False)) == [
437
+ (dir1, ["dir11", "dir12"], ["file11"]),
438
+ ]
439
+
440
+ # maxdepth=0
441
+ with pytest.raises(ValueError):
442
+ list(m.walk(dir1, maxdepth=0, topdown=True))
443
+ with pytest.raises(ValueError):
444
+ list(m.walk(dir1, maxdepth=0, topdown=False))
445
+
446
+ # prune dir111
447
+ def _walk(*args, **kwargs):
448
+ for path, dirs, files in m.walk(*args, **kwargs):
449
+ yield (path, dirs.copy(), files)
450
+ if "dir111" in dirs:
451
+ dirs.remove("dir111")
452
+
453
+ assert list(_walk(dir1, topdown=True)) == [
454
+ (dir1, ["dir11", "dir12"], ["file11"]),
455
+ (dir11, ["dir111"], ["file111"]),
456
+ (dir12, [], ["file121"]),
457
+ ]
458
+ assert list(_walk(dir1, topdown=False)) == [
459
+ (dir111, [], ["file1111"]),
460
+ (dir11, ["dir111"], ["file111"]),
461
+ (dir12, [], ["file121"]),
462
+ (dir1, ["dir11", "dir12"], ["file11"]),
463
+ ]
464
+
465
+ # reverse dirs order
466
+ def _walk(*args, **kwargs):
467
+ for path, dirs, files in m.walk(*args, **kwargs):
468
+ yield (path, dirs.copy(), files)
469
+ dirs.reverse()
470
+
471
+ assert list(_walk(dir1, topdown=True)) == [
472
+ (dir1, ["dir11", "dir12"], ["file11"]),
473
+ # Here dir12 comes before dir11
474
+ (dir12, [], ["file121"]),
475
+ (dir11, ["dir111"], ["file111"]),
476
+ (dir111, [], ["file1111"]),
477
+ ]
478
+ assert list(_walk(dir1, topdown=False)) == [
479
+ (dir111, [], ["file1111"]),
480
+ (dir11, ["dir111"], ["file111"]),
481
+ (dir12, [], ["file121"]),
482
+ (dir1, ["dir11", "dir12"], ["file11"]),
483
+ ]
484
+
485
+ # on_error omit by default
486
+ assert list(m.walk("do_not_exist")) == []
487
+ # on_error omit
488
+ assert list(m.walk("do_not_exist", on_error="omit")) == []
489
+ # on_error raise
490
+ with pytest.raises(FileNotFoundError):
491
+ list(m.walk("do_not_exist", on_error="raise"))
492
+ # on_error callable function
493
+ mock = Mock()
494
+ assert list(m.walk("do_not_exist", on_error=mock.onerror)) == []
495
+ mock.onerror.assert_called()
496
+ assert mock.onerror.call_args.kwargs == {}
497
+ assert len(mock.onerror.call_args.args) == 1
498
+ assert isinstance(mock.onerror.call_args.args[0], FileNotFoundError)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_async.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import inspect
3
+ import io
4
+ import os
5
+ import time
6
+
7
+ import pytest
8
+
9
+ import fsspec
10
+ import fsspec.asyn
11
+ from fsspec.asyn import _run_coros_in_chunks
12
+
13
+
14
def test_sync_methods():
    """A plain AsyncFileSystem exposes ``info`` as a non-coroutine built on ``_info``."""
    fs = fsspec.asyn.AsyncFileSystem()

    # The underscore-prefixed implementation is the actual coroutine.
    assert inspect.iscoroutinefunction(fs._info)

    # The public name exists, reports the qualname of ``_info`` and is blocking.
    assert hasattr(fs, "info")
    public_info = fs.info
    assert public_info.__qualname__ == "AsyncFileSystem._info"
    assert not inspect.iscoroutinefunction(public_info)
20
+
21
+
22
def test_when_sync_methods_are_disabled():
    """With ``mirror_sync_methods = False`` the public ``info`` is the inherited one."""

    class TestFS(fsspec.asyn.AsyncFileSystem):
        mirror_sync_methods = False

    fs = TestFS()
    assert inspect.iscoroutinefunction(fs._info)

    # No mirror is generated, so ``info`` resolves to AbstractFileSystem's method.
    public_info = fs.info
    assert not inspect.iscoroutinefunction(public_info)
    assert public_info.__qualname__ == "AbstractFileSystem.info"
30
+
31
+
32
def test_interrupt():
    """After ``_dump_running_tasks(with_task=True)`` a pending task is finished.

    The future is expected to carry an ``FSSpecCoroutineCancel`` exception.
    """
    io_loop = fsspec.asyn.get_loop()

    async def sleep_forever():
        await asyncio.sleep(1000000)
        return True

    future = asyncio.run_coroutine_threadsafe(sleep_forever(), io_loop)
    time.sleep(0.01)  # give the loop thread a moment to launch the task
    dumped = fsspec.asyn._dump_running_tasks(with_task=True)
    first_task = dumped[0]["task"]
    assert first_task.done() and future.done()
    assert isinstance(future.exception(), fsspec.asyn.FSSpecCoroutineCancel)
45
+
46
+
47
class _DummyAsyncKlass:
    """Minimal object with a ``loop`` attribute, used to exercise ``sync_wrapper``."""

    def __init__(self):
        self.loop = fsspec.asyn.get_loop()

    async def _dummy_async_func(self):
        # Takes one second so that callers can test timeouts around it.
        await asyncio.sleep(1)
        return True

    async def _bad_multiple_sync(self):
        # Deliberately calls a sync wrapper from inside a coroutine.
        blocking_call = fsspec.asyn.sync_wrapper(_DummyAsyncKlass._dummy_async_func)
        blocking_call(self)
        return True

    # Blocking counterparts of the two coroutines above.
    dummy_func = fsspec.asyn.sync_wrapper(_dummy_async_func)
    bad_multiple_sync_func = fsspec.asyn.sync_wrapper(_bad_multiple_sync)
62
+
63
+
64
def test_sync_wrapper_timeout_on_less_than_expected_wait_time_not_finish_function():
    """A 0.1 s timeout on the one-second coroutine raises ``FSTimeoutError``."""
    subject = _DummyAsyncKlass()
    with pytest.raises(fsspec.FSTimeoutError):
        subject.dummy_func(timeout=0.1)
68
+
69
+
70
def test_sync_wrapper_timeout_on_more_than_expected_wait_time_will_finish_function():
    """A 5 s timeout leaves the one-second coroutine enough time to return True."""
    subject = _DummyAsyncKlass()
    outcome = subject.dummy_func(timeout=5)
    assert outcome
73
+
74
+
75
def test_sync_wrapper_timeout_none_will_wait_func_finished():
    """``timeout=None`` waits until the coroutine has returned its truthy result."""
    subject = _DummyAsyncKlass()
    outcome = subject.dummy_func(timeout=None)
    assert outcome
78
+
79
+
80
def test_sync_wrapper_treat_timeout_0_as_none():
    """``timeout=0`` still lets the one-second coroutine complete."""
    subject = _DummyAsyncKlass()
    outcome = subject.dummy_func(timeout=0)
    assert outcome
83
+
84
+
85
def test_sync_wrapper_bad_multiple_sync():
    """Calling a sync wrapper from inside a coroutine raises ``NotImplementedError``."""
    subject = _DummyAsyncKlass()
    with pytest.raises(NotImplementedError):
        subject.bad_multiple_sync_func(timeout=5)
89
+
90
+
91
def test_run_coros_in_chunks(monkeypatch):
    """``_run_coros_in_chunks`` is expected to cap concurrency at the batch size.

    The worker coroutine raises as soon as it observes more than four
    workers in flight, so only batch sizes of at most four succeed.
    """
    in_flight = 0

    async def worker():
        nonlocal in_flight

        in_flight += 1
        await asyncio.sleep(0)  # yield so the other workers of the batch start
        if in_flight > 4:
            raise ValueError("More than 4 coroutines are running together")
        in_flight -= 1
        return 1

    async def drive(**kwargs):
        nonlocal in_flight

        in_flight = 0
        outcomes = await _run_coros_in_chunks(
            [worker() for _ in range(32)], **kwargs
        )
        # Re-raise the first exception found among the gathered results.
        failure = next((o for o in outcomes if isinstance(o, Exception)), None)
        if failure is not None:
            raise failure
        return outcomes

    def total(**kwargs):
        return sum(asyncio.run(drive(**kwargs)))

    assert total(batch_size=4) == 32

    # Both a batch of five and a batch size of -1 exceed the worker's limit.
    for oversized in (5, -1):
        with pytest.raises(ValueError):
            asyncio.run(drive(batch_size=oversized))

    assert total(batch_size=4) == 32

    monkeypatch.setitem(fsspec.config.conf, "gather_batch_size", 5)
    with pytest.raises(ValueError):
        asyncio.run(drive())
    assert total(batch_size=4) == 32  # override

    monkeypatch.setitem(fsspec.config.conf, "gather_batch_size", 4)
    assert total() == 32  # override
132
+
133
+
134
@pytest.mark.skipif(os.name != "nt", reason="only for windows")
def test_windows_policy():
    """On Windows the fsspec loop is selector-based and the global policy is intact."""
    from asyncio.windows_events import SelectorEventLoop

    fsspec_loop = fsspec.asyn.get_loop()
    current_policy = asyncio.get_event_loop_policy()

    # The loop created by fsspec must always be a selector loop.
    assert isinstance(fsspec_loop, SelectorEventLoop)

    # get_loop() temporarily overrides the event loop policy with one that
    # uses selectors on Windows. Checking for the default policy here makes
    # sure the previous policy was restored after that change.
    assert isinstance(current_policy, asyncio.DefaultEventLoopPolicy)
151
+
152
+
153
def test_running_async():
    """``running_async`` is False in plain code and True inside a running coroutine."""
    assert not fsspec.asyn.running_async()

    async def check_inside_loop():
        assert fsspec.asyn.running_async()

    asyncio.run(check_inside_loop())
160
+
161
+
162
class DummyAsyncFS(fsspec.asyn.AsyncFileSystem):
    """Stub async filesystem whose ``open_async`` returns ``DummyAsyncStreamedFile``."""

    _file_class = fsspec.asyn.AbstractAsyncStreamedFile

    async def _info(self, path, **kwargs):
        # Every path reports the same fixed 100-byte file entry.
        return dict(name="misc/foo.txt", type="file", size=100)

    async def open_async(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        # Forward all arguments unchanged to the in-memory dummy file.
        stream = DummyAsyncStreamedFile(
            self,
            path,
            mode,
            block_size,
            autocommit,
            cache_options=cache_options,
            **kwargs,
        )
        return stream
186
+
187
+
188
class DummyAsyncStreamedFile(fsspec.asyn.AbstractAsyncStreamedFile):
    """In-memory streamed file used by the async file tests.

    Reads are served from, and uploads are appended to, a ``BytesIO``
    held in ``temp_buffer``.
    """

    def __init__(self, fs, path, mode, block_size, autocommit, **kwargs):
        super().__init__(fs, path, mode, block_size, autocommit, **kwargs)
        # Initial readable content: b"foo-bar" repeated 20 times.
        self.temp_buffer = io.BytesIO(b"foo-bar" * 20)

    async def _fetch_range(self, start, end):
        # Reads ``end - start`` bytes from the buffer's current position;
        # ``start`` is not used for seeking, so reads are purely sequential.
        return self.temp_buffer.read(end - start)

    async def _initiate_upload(self):
        # Reinitialize for new uploads.
        self.temp_buffer = io.BytesIO()

    async def _upload_chunk(self, final=False):
        # Append the pending write buffer (``self.buffer``) to the store.
        self.temp_buffer.write(self.buffer.getbuffer())

    async def get_data(self):
        """Return everything currently held in ``temp_buffer`` as ``bytes``."""
        # This method used to be defined twice with identical bodies; the
        # redundant second definition has been removed.
        return self.temp_buffer.getbuffer().tobytes()
208
+
209
+
210
@pytest.mark.asyncio
async def test_async_streamed_file_write():
    """Data written through the streamed file is all present after closing it."""
    fs = DummyAsyncFS()
    handle = await fs.open_async("misc/foo.txt", mode="wb")

    # Twice the block size worth of "foo-bar" tokens.
    payload = "foo-bar".encode("utf8") * (handle.blocksize * 2)
    await handle.write(payload)
    assert handle.loc == len(payload)

    await handle.close()
    stored = await handle.get_data()
    assert stored.count(b"foo-bar") == handle.blocksize * 2
220
+
221
+
222
@pytest.mark.asyncio
async def test_async_streamed_file_read():
    """Two consecutive reads together return the dummy file's whole content."""
    fs = DummyAsyncFS()
    handle = await fs.open_async("misc/foo.txt", mode="rb")

    head = await handle.read(7 * 3)
    tail = await handle.read(7 * 18)
    assert head + tail == b"foo-bar" * 20

    await handle.close()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_caches.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import string
3
+
4
+ import pytest
5
+
6
+ from fsspec.caching import (
7
+ BlockCache,
8
+ FirstChunkCache,
9
+ ReadAheadCache,
10
+ caches,
11
+ register_cache,
12
+ )
13
+ from fsspec.implementations.cached import WholeFileCacheFileSystem
14
+
15
+
16
def test_cache_getitem(Cache_imp):
    """Every cache implementation returns the requested slice of the letters."""
    cacher = Cache_imp(4, letters_fetcher, len(string.ascii_letters))
    expectations = [
        (0, 4, b"abcd"),
        (None, 4, b"abcd"),
        (2, 4, b"cd"),
    ]
    for start, end, expected in expectations:
        assert cacher._fetch(start, end) == expected
21
+
22
+
23
def test_block_cache_lru():
    """BlockCache stores blocks of data and evicts them in LRU order."""
    block_size = 4
    file_size = len(string.ascii_letters)
    lru = BlockCache(block_size, letters_fetcher, file_size, maxblocks=2)

    def check_lru_state(misses, currsize):
        assert lru.cache_info().misses == misses
        assert lru.cache_info().currsize == currsize

    def check_requested_bytes():
        # Only misses go to the fetcher, one full block each.
        assert lru.total_requested_bytes == block_size * lru.miss_count

    # miss
    lru._fetch(0, 2)
    check_lru_state(misses=1, currsize=1)
    check_requested_bytes()
    assert lru.size == 52

    # hit
    lru._fetch(0, 2)
    check_lru_state(misses=1, currsize=1)
    check_requested_bytes()

    # hit
    lru._fetch(0, 2)
    check_lru_state(misses=1, currsize=1)
    # this works as a counter since all the reads are from the cache
    assert lru.hit_count == 3
    assert lru.miss_count == 1
    # so far only 4 bytes have been read using range requests
    check_requested_bytes()

    # miss
    lru._fetch(4, 6)
    check_lru_state(misses=2, currsize=2)
    check_requested_bytes()

    # miss & evict
    lru._fetch(12, 13)
    check_lru_state(misses=3, currsize=2)
    assert lru.hit_count == 5
    assert lru.miss_count == 3
    check_requested_bytes()
65
+
66
+
67
def test_first_cache():
    """
    FirstChunkCache keeps only the first chunk of data, and only once
    some part of that first block has been requested.
    """
    block_size = 5
    file_size = len(string.ascii_letters)
    first_chunk = FirstChunkCache(block_size, letters_fetcher, file_size)
    assert first_chunk.cache is None

    # A read outside the first block is a miss and caches nothing.
    assert first_chunk._fetch(12, 15) == letters_fetcher(12, 15)
    assert first_chunk.miss_count == 1
    assert first_chunk.hit_count == 0
    assert first_chunk.cache is None
    expected_bytes = 15 - 12
    assert first_chunk.total_requested_bytes == expected_bytes

    # because we overlap with the cache range, it will be cached
    assert first_chunk._fetch(3, 10) == letters_fetcher(3, 10)
    assert first_chunk.miss_count == 2
    assert first_chunk.hit_count == 0
    # we'll read the first 5 and then the rest
    expected_bytes += block_size + 5
    assert first_chunk.total_requested_bytes == expected_bytes

    # partial hit again
    assert first_chunk._fetch(3, 10) == letters_fetcher(3, 10)
    assert first_chunk.miss_count == 2
    assert first_chunk.hit_count == 1
    # we have the first 5 bytes cached
    expected_bytes += 10 - 5
    assert first_chunk.total_requested_bytes == expected_bytes

    # A read inside the cached first block is a pure hit.
    assert first_chunk.cache == letters_fetcher(0, 5)
    assert first_chunk._fetch(0, 4) == letters_fetcher(0, 4)
    assert first_chunk.hit_count == 2
    assert first_chunk.miss_count == 2
    assert first_chunk.total_requested_bytes == 18
103
+
104
+
105
def test_readahead_cache():
    """
    ReadAheadCache reads ahead of the requested range.
    If the access pattern is not sequential it will be very inefficient.
    """
    block_size = 5
    file_size = len(string.ascii_letters)
    readahead = ReadAheadCache(block_size, letters_fetcher, file_size)

    # First read: a miss that fetches the range plus one block of read-ahead.
    assert readahead._fetch(12, 15) == letters_fetcher(12, 15)
    assert readahead.miss_count == 1
    assert readahead.hit_count == 0
    expected_bytes = 15 - 12 + block_size
    assert readahead.total_requested_bytes == expected_bytes

    # Going backwards is another miss.
    assert readahead._fetch(3, 10) == letters_fetcher(3, 10)
    assert readahead.miss_count == 2
    assert readahead.hit_count == 0
    assert len(readahead.cache) == 12
    expected_bytes += (10 - 3) + block_size
    assert readahead.total_requested_bytes == expected_bytes

    # cache hit again
    assert readahead._fetch(3, 10) == letters_fetcher(3, 10)
    assert readahead.miss_count == 2
    assert readahead.hit_count == 1
    assert len(readahead.cache) == 12
    assert readahead.total_requested_bytes == expected_bytes
    assert readahead.cache == letters_fetcher(3, 15)

    # cache miss
    assert readahead._fetch(0, 4) == letters_fetcher(0, 4)
    assert readahead.hit_count == 1
    assert readahead.miss_count == 3
    assert len(readahead.cache) == 9
    expected_bytes += (4 - 0) + block_size
    assert readahead.total_requested_bytes == expected_bytes
140
+
141
+
142
+ def _fetcher(start, end):
143
+ return b"0" * (end - start)
144
+
145
+
146
def letters_fetcher(start, end):
    """Return bytes ``start:end`` of ``string.ascii_letters`` (a-z, then A-Z)."""
    alphabet = string.ascii_letters.encode()
    return alphabet[start:end]
148
+
149
+
150
# All registered cache implementations except the one named "parts".
not_parts_caches = {
    name: implementation
    for name, implementation in caches.items()
    if name != "parts"
}


@pytest.fixture(
    params=list(not_parts_caches.values()),
    ids=list(not_parts_caches.keys()),
)
def Cache_imp(request):
    """Yield each non-"parts" cache class in turn, with its registry name as id."""
    return request.param
156
+
157
+
158
def test_cache_empty_file(Cache_imp):
    """Fetching the empty range of a zero-length file yields empty bytes."""
    empty_cache = Cache_imp(5, _fetcher, 0)  # blocksize 5, file size 0
    assert empty_cache._fetch(0, 0) == b""
163
+
164
+
165
def test_cache_pickleable(Cache_imp):
    """A cache that already holds data survives a pickle round trip."""
    blocksize, size = 5, 100
    original = Cache_imp(blocksize, _fetcher, size)
    original._fetch(0, 5)  # fill in cache

    restored = pickle.loads(pickle.dumps(original))

    assert isinstance(restored, Cache_imp)
    assert restored.blocksize == blocksize
    assert restored.size == size
    assert restored._fetch(0, 10) == b"0" * 10
175
+
176
+
177
@pytest.mark.parametrize(
    "size_requests",
    [
        [(0, 30), (0, 35), (51, 52)],
        [(0, 1), (1, 11), (1, 52)],
        [(0, 52), (11, 15)],
    ],
)
@pytest.mark.parametrize("blocksize", [1, 10, 52, 100])
def test_cache_basic(Cache_imp, blocksize, size_requests):
    """Each requested range matches the corresponding slice of the letters."""
    letters = string.ascii_letters
    cache = Cache_imp(blocksize, letters_fetcher, len(letters))

    for start, end in size_requests:
        fetched = cache._fetch(start, end)
        assert fetched == letters[start:end].encode()
189
+
190
+
191
@pytest.mark.parametrize("strict", [True, False])
@pytest.mark.parametrize("sort", [True, False])
def test_known(sort, strict):
    """The "parts" cache merges adjacent known ranges and serves reads from them."""
    known = {(10, 20): b"1" * 10, (20, 30): b"2" * 10, (0, 10): b"0" * 10}
    if sort:
        known = {span: known[span] for span in sorted(known)}

    parts_cache = caches["parts"](None, None, 100, known, strict=strict)
    assert (0, 30) in parts_cache.data  # got consolidated
    assert parts_cache._fetch(5, 15) == b"0" * 5 + b"1" * 5
    assert parts_cache._fetch(15, 25) == b"1" * 5 + b"2" * 5

    if not strict:
        # Over-read will be zero-padded
        assert parts_cache._fetch(25, 35) == b"2" * 5 + b"\x00" * 5
        return

    # Over-read will raise error
    with pytest.raises(ValueError):
        # tries to call None fetcher
        parts_cache._fetch(25, 35)
209
+
210
+
211
def test_background(server, monkeypatch):
    """With ``cache_type="background"`` a block fetch runs on a second thread."""
    import threading
    import time

    import fsspec

    headers = {"head_ok": "true", "head_give_length": "true"}
    url = server + "/index/realfile"
    http_fs = fsspec.filesystem("http", headers=headers)
    seen_threads = {threading.current_thread().ident}
    remote = http_fs.open(url, block_size=5, cache_type="background")
    original_fetch_block = remote.cache._fetch_block

    def recording_fetch_block(*args, **kwargs):
        # Note which thread performs the fetch, then delegate to the original.
        seen_threads.add(threading.current_thread().ident)
        return original_fetch_block(*args, **kwargs)

    remote.cache._fetch_block = recording_fetch_block
    assert len(seen_threads) == 1
    remote.read(1)
    time.sleep(0.1)  # second block is loading
    assert len(seen_threads) == 2
233
+
234
+
235
def test_register_cache():
    """Re-registering a known cache fails unless ``clobber=True`` is given."""
    # just test that we have them populated and fail to re-add again unless overload
    already_registered = BlockCache
    with pytest.raises(ValueError):
        register_cache(already_registered)
    register_cache(already_registered, clobber=True)
240
+
241
+
242
def test_cache_kwargs(mocker):
    """Keyword arguments given to ``open`` reach the target ``put`` on commit."""
    # test that kwargs are passed to the underlying filesystem after cache commit

    cached_fs = WholeFileCacheFileSystem(target_protocol="memory")
    cached_fs.touch("test")
    cached_fs.fs.put = mocker.MagicMock()

    with cached_fs.open("test", "wb", overwrite=True) as handle:
        handle.write(b"foo")

    # We don't care about the first parameter, just retrieve its expected value.
    # It is a random location that cannot be predicted.
    # The important thing is the 'overwrite' kwarg
    target_put = cached_fs.fs.put
    local_source = target_put.call_args[0][0]
    target_put.assert_called_with(local_source, "/test", overwrite=True)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_callbacks.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from fsspec.callbacks import Callback, TqdmCallback
4
+
5
+
6
def test_callbacks():
    """``Callback.call`` returns None without hooks, else the hook's result."""
    assert Callback().call("something", somearg=None) is None

    add_two = Callback(hooks={"something": lambda *_, arg=None: arg + 2})
    assert add_two.call("something", arg=2) == 4

    add_pair = Callback(
        hooks={"something": lambda *_, arg1=None, arg2=None: arg1 + arg2}
    )
    assert add_pair.call("something", arg1=2, arg2=2) == 4
17
+
18
+
19
def test_callbacks_as_callback():
    """``as_callback(None)`` gives one shared no-op; a real callback still works."""
    noop = Callback.as_callback(None)
    assert noop.call("something", arg="somearg") is None
    # Repeated calls with None return the very same object.
    assert Callback.as_callback(None) is Callback.as_callback(None)

    add_two = Callback(hooks={"something": lambda *_, arg=None: arg + 2})
    real_callback = Callback.as_callback(add_two)
    assert real_callback.call("something", arg=2) == 4
27
+
28
+
29
def test_callbacks_as_context_manager(mocker):
    """Leaving the ``with`` block calls ``Callback.close`` exactly once."""
    close_spy = mocker.spy(Callback, "close")

    with Callback() as entered:
        assert isinstance(entered, Callback)

    close_spy.assert_called_once()
36
+
37
+
38
def test_callbacks_branched():
    """``branched`` returns a Callback that is a different object from its parent."""
    parent = Callback()

    child = parent.branched("path_1", "path_2")

    assert child is not parent
    assert isinstance(child, Callback)
45
+
46
+
47
@pytest.mark.asyncio
async def test_callbacks_branch_coro(mocker):
    """``branch_coro`` passes the result of ``branched`` on as ``callback=``."""
    async_fn = mocker.AsyncMock(return_value=10)
    callback = Callback()
    wrapped_fn = callback.branch_coro(async_fn)
    branched_spy = mocker.spy(callback, "branched")

    result = await wrapped_fn("path_1", "path_2", key="value")
    assert result == 10

    branched_spy.assert_called_once_with("path_1", "path_2", key="value")
    async_fn.assert_called_once_with(
        "path_1",
        "path_2",
        callback=branched_spy.spy_return,
        key="value",
    )
60
+
61
+
62
def test_callbacks_wrap():
    """Iterating ``wrap(range(10))`` calls ``relative_update`` ten times with 1."""
    increments = []

    class TestCallback(Callback):
        def relative_update(self, inc=1):
            increments.append(inc)

    recorder = TestCallback()
    for _ in recorder.wrap(range(10)):
        pass

    assert increments == [1] * 10
74
+
75
+
76
@pytest.mark.parametrize("tqdm_kwargs", [{}, {"desc": "A custom desc"}])
def test_tqdm_callback(tqdm_kwargs, mocker):
    """TqdmCallback builds its bar with ``total`` plus any user ``tqdm_kwargs``."""
    pytest.importorskip("tqdm")
    callback = TqdmCallback(tqdm_kwargs=tqdm_kwargs)
    mocker.patch.object(callback, "_tqdm_cls")
    callback.set_size(10)
    for _ in callback.wrap(range(10)):
        pass

    assert callback.tqdm.update.call_count == 11
    # An empty ``tqdm_kwargs`` expands to no extra arguments, so one
    # assertion covers both parametrized cases.
    callback._tqdm_cls.assert_called_with(total=10, **tqdm_kwargs)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_compression.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pathlib
2
+
3
+ import pytest
4
+
5
+ import fsspec.core
6
+ from fsspec.compression import compr, register_compression
7
+ from fsspec.utils import compressions, infer_compression
8
+
9
+
10
def test_infer_custom_compression():
    """Inferred compression gets values from fsspec.compression.compr."""

    def passthrough(f, **kwargs):
        # Stand-in "compression" callable that returns the file unchanged.
        return f

    def check_unaffected_names():
        # These inferences must stay the same whatever is registered for "tst".
        assert infer_compression("fn.zip") == "zip"
        assert infer_compression("fn.gz") == "gzip"
        assert infer_compression("fn.unknown") is None
        assert infer_compression("fn.test_custom") is None

    check_unaffected_names()
    assert infer_compression("fn.tst") is None

    register_compression("test_custom", passthrough, "tst")

    try:
        check_unaffected_names()
        assert infer_compression("fn.tst") == "test_custom"

        # Duplicate registration in name or extension raises a value error.
        with pytest.raises(ValueError):
            register_compression("test_custom", passthrough, "tst")

        with pytest.raises(ValueError):
            register_compression("test_conflicting", passthrough, "tst")
        assert "test_conflicting" not in compr

        # ...but can be forced.
        register_compression("test_conflicting", passthrough, "tst", force=True)
        check_unaffected_names()
        assert infer_compression("fn.tst") == "test_conflicting"

    finally:
        # Use pop() rather than del: if an assertion above fails before
        # "test_conflicting" was registered, cleanup must not raise a
        # KeyError that would mask the original failure.
        compr.pop("test_custom", None)
        compr.pop("test_conflicting", None)
        compressions.pop("tst", None)
49
+
50
+
51
def test_infer_uppercase_compression():
    """Upper-case extensions are inferred like their lower-case forms."""
    recognised = {"fn.ZIP": "zip", "fn.GZ": "gzip"}
    for filename, expected in recognised.items():
        assert infer_compression(filename) == expected

    for filename in ("fn.UNKNOWN", "fn.TEST_UPPERCASE", "fn.TEST"):
        assert infer_compression(filename) is None
57
+
58
+
59
def test_lzma_compression_name():
    """The ``.xz`` and ``.lzma`` extensions map to compressions of the same name."""
    pytest.importorskip("lzma")
    for extension in ("xz", "lzma"):
        assert infer_compression("fn." + extension) == extension
63
+
64
+
65
def test_lz4_compression(tmpdir):
    """Infer lz4 compression for .lz4 files if lz4 is available."""
    tmp_path = pathlib.Path(str(tmpdir))

    lz4 = pytest.importorskip("lz4")

    tmp_path.mkdir(exist_ok=True)
    target = tmp_path / "out.lz4"

    tdat = "foobar" * 100

    with fsspec.core.open(str(target), mode="wt", compression="infer") as outfile:
        outfile.write(tdat)

    # read_bytes() closes the file itself; the previous
    # ``.open("rb").read()`` left the handle open.
    compressed = target.read_bytes()
    assert lz4.frame.decompress(compressed).decode() == tdat

    # Reading back works both with inferred and with explicit compression.
    with fsspec.core.open(str(target), mode="rt", compression="infer") as infile:
        assert infile.read() == tdat

    with fsspec.core.open(str(target), mode="rt", compression="lz4") as infile:
        assert infile.read() == tdat
92
+
93
+
94
def test_zstd_compression(tmpdir):
    """Infer zstd compression for .zst files if zstandard is available."""
    tmp_path = pathlib.Path(str(tmpdir))

    zstd = pytest.importorskip("zstandard")

    tmp_path.mkdir(exist_ok=True)
    target = tmp_path / "out.zst"

    tdat = "foobar" * 100

    with fsspec.core.open(str(target), mode="wt", compression="infer") as outfile:
        outfile.write(tdat)

    # read_bytes() closes the file itself; the previous
    # ``.open("rb").read()`` left the handle open.
    compressed = target.read_bytes()
    assert zstd.ZstdDecompressor().decompress(compressed, len(tdat)).decode() == tdat

    # Reading back works both with inferred and with explicit compression.
    with fsspec.core.open(str(target), mode="rt", compression="infer") as infile:
        assert infile.read() == tdat

    with fsspec.core.open(str(target), mode="rt", compression="zstd") as infile:
        assert infile.read() == tdat

    # fails in https://github.com/fsspec/filesystem_spec/issues/725
    infile = fsspec.core.open(str(target), mode="rb", compression="infer").open()

    infile.close()
128
+
129
+
130
def test_snappy_compression(tmpdir):
    """No registered compression for snappy, but can be specified."""
    tmp_path = pathlib.Path(str(tmpdir))

    snappy = pytest.importorskip("snappy")

    tmp_path.mkdir(exist_ok=True)
    target = tmp_path / "out.snappy"

    tdat = "foobar" * 100

    # Snappy isn't inferred, so the data lands on disk uncompressed.
    with fsspec.core.open(str(target), mode="wt", compression="infer") as outfile:
        outfile.write(tdat)
    # read_bytes() closes the file itself; the previous
    # ``.open("rb").read()`` left the handle open.
    assert target.read_bytes().decode() == tdat

    # but can be specified.
    with fsspec.core.open(str(target), mode="wt", compression="snappy") as outfile:
        outfile.write(tdat)

    compressed = target.read_bytes()
    assert snappy.StreamDecompressor().decompress(compressed).decode() == tdat

    # With "infer" the raw compressed bytes come back unchanged ...
    with fsspec.core.open(str(target), mode="rb", compression="infer") as infile:
        assert infile.read() == compressed

    # ... while naming the compression explicitly decodes them.
    with fsspec.core.open(str(target), mode="rt", compression="snappy") as infile:
        assert infile.read() == tdat