diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbf8e4d64bc8690d045d9c95e538d205cf5d21fa Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/_version.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/_version.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f792bb5e871d52b7732fd8da14ac02c33827bb63 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/_version.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/archive.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/archive.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e85f37c5799d9424bc77172620b4c4728ff1570 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/archive.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/asyn.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/asyn.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0e302b861b16e56c2db717adab524430b5e65cd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/asyn.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/caching.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/caching.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63bd1ed9bb1ec52d4f09fbf49dfd38d748f41942 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/caching.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/callbacks.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/callbacks.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc96096236cbb0abc044711057665416d4a27d23 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/callbacks.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/compression.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/compression.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec924c8aaa58ca0998407a81f41c0df342ab7cda Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/compression.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..82a5c6fc332e0e05ba418171feeeb4ad2a72dda7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/conftest.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/conftest.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83bd636c293d85adea8fd55aa93c07fe3e6a8d4e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/conftest.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/core.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/core.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..911b76506878809d6c1d3a955f0d4c81bb37a2bf Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/core.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/dircache.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/dircache.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb608acfca347e70b3b1b7eb7d24697ef0101b79 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/dircache.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/exceptions.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/exceptions.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..209a4f90939a24aaf650cbe98ea1db5237a4f4d1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/exceptions.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/fuse.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/fuse.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d290f900dc02ac1c3482080967a56cf64c33d89d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/fuse.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/generic.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/generic.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..403802b9cfaab8f7aaa0318df160fd8a5a5028c0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/generic.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/gui.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/gui.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d631aa166c21550d92405759bec15a6e2537caf4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/gui.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/json.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/json.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..017996e92cf5f8354a4de42095cd4e08464d04aa Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/json.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/mapping.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/mapping.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b53f8bea2bf4401134b5d407019b2f9c94b180f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/mapping.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/parquet.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/parquet.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45c0773d89a7b4323e5042c129826babb61326a3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/parquet.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/registry.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/registry.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9325d7357a9d1dc976ef5a7a80716cacfd254af8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/registry.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/transaction.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/transaction.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad821874125e4fc62b4c6f1efd95545ed7f1ad95 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/transaction.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4cf19fbce345d65a85e019ad8fedbbd3dc160246 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__init__.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ccb55a8d7ebc93e70e2f4c58c11142f977ed9875 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/arrow.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/arrow.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2157d15e05b63c19ac9b758742a33b6cf2bb1e83 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/arrow.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/asyn_wrapper.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/asyn_wrapper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e10e38d95aafe608d87957f530ff020d901678fc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/asyn_wrapper.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cache_mapper.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cache_mapper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..742839be37d630b5a9d47486f262653b732acffa Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cache_mapper.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cache_metadata.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cache_metadata.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a7cc9145597ba118e14d6bac7c17edfcfdc57fa Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cache_metadata.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4883bc7e1970d10962bee66962c387277ca1b7e0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9a31d572c15db2d10fdc0c02077c66c66b651fc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/data.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/data.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b97d03b754f464c491fec59670a6a3e61bc5da15 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/data.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dbfs.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dbfs.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20381b91b848c27a10a4c729ec538fc2722787a6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dbfs.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dirfs.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dirfs.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0ff8e97fe79c8d0c0a46d85f8098563ed47c8e4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dirfs.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/ftp.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/ftp.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b48bd7a1871a358157580d779b82d6b4ac05ed30 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/ftp.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/git.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/git.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f395921a0ae421ca109d6bfe9fdd33265433667 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/git.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/github.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/github.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b428277b8633b7fe3b14d77cc199b535cd8f5d3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/github.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/http.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/http.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a16107246ae9ad197ab8382bbd267ea5de6c21a7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/http.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/jupyter.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/jupyter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1eed522b5503518660e56ce40bb27048d31d4771 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/jupyter.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/libarchive.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/libarchive.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..947f3adbef1c50436c643664999c0da9ebe5e228 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/libarchive.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..255ce52b4da57bc2ed2cff9ecb1a3f926a6e3f86 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/memory.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/memory.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52d3ebcf37505017b18af7de672b080cade7574c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/memory.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..021aee2fb906f55bce1925ddbe3ddb0f7b9c7857 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/sftp.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/sftp.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60e549b69bfd84e41b4f3a9ad1aeb4ee8600fcfb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/sftp.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/smb.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/smb.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58c63ff3a1c276bb2946f9aee2c4b24857cc1df0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/smb.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/tar.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/tar.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..627446d3b1c8e9d6eb9687ec60d45a183acb981b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/tar.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70327d693f2d28a39fb12d70631648429b2d8077 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90b6b84234ea636b8d8db103c26e4232d3fc4ab4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/asyn_wrapper.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/asyn_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..f58b0b612f3f0a62f2704dc6e73632eaf1b652af --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/asyn_wrapper.py @@ -0,0 +1,99 @@ +import asyncio +import functools +import inspect + +from fsspec.asyn import AsyncFileSystem + + +def async_wrapper(func, obj=None): + """ + Wraps a synchronous function to make it awaitable. + + Parameters + ---------- + func : callable + The synchronous function to wrap. + obj : object, optional + The instance to bind the function to, if applicable. + + Returns + ------- + coroutine + An awaitable version of the function. + """ + + @functools.wraps(func) + async def wrapper(*args, **kwargs): + return await asyncio.to_thread(func, *args, **kwargs) + + return wrapper + + +class AsyncFileSystemWrapper(AsyncFileSystem): + """ + A wrapper class to convert a synchronous filesystem into an asynchronous one. + + This class takes an existing synchronous filesystem implementation and wraps all + its methods to provide an asynchronous interface. + + Parameters + ---------- + sync_fs : AbstractFileSystem + The synchronous filesystem instance to wrap. + """ + + def __init__(self, sync_fs, *args, **kwargs): + super().__init__(*args, **kwargs) + self.asynchronous = True + self.sync_fs = sync_fs + self.protocol = self.sync_fs.protocol + self._wrap_all_sync_methods() + + @property + def fsid(self): + return f"async_{self.sync_fs.fsid}" + + def _wrap_all_sync_methods(self): + """ + Wrap all synchronous methods of the underlying filesystem with asynchronous versions. + """ + excluded_methods = {"open"} + for method_name in dir(self.sync_fs): + if method_name.startswith("_") or method_name in excluded_methods: + continue + + attr = inspect.getattr_static(self.sync_fs, method_name) + if isinstance(attr, property): + continue + + method = getattr(self.sync_fs, method_name) + if callable(method) and not asyncio.iscoroutinefunction(method): + async_method = async_wrapper(method, obj=self) + setattr(self, f"_{method_name}", async_method) + + @classmethod + def wrap_class(cls, sync_fs_class): + """ + Create a new class that can be used to instantiate an AsyncFileSystemWrapper + with lazy instantiation of the underlying synchronous filesystem. + + Parameters + ---------- + sync_fs_class : type + The class of the synchronous filesystem to wrap. + + Returns + ------- + type + A new class that wraps the provided synchronous filesystem class. + """ + + class GeneratedAsyncFileSystemWrapper(cls): + def __init__(self, *args, **kwargs): + sync_fs = sync_fs_class(*args, **kwargs) + super().__init__(sync_fs) + + GeneratedAsyncFileSystemWrapper.__name__ = ( + f"Async{sync_fs_class.__name__}Wrapper" + ) + return GeneratedAsyncFileSystemWrapper diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/cache_mapper.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/cache_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..6e7c7d88afdddf12f77b26bb635bd8bf1e2bd7f1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/cache_mapper.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import abc +import hashlib + +from fsspec.implementations.local import make_path_posix + + +class AbstractCacheMapper(abc.ABC): + """Abstract super-class for mappers from remote URLs to local cached + basenames. + """ + + @abc.abstractmethod + def __call__(self, path: str) -> str: ... + + def __eq__(self, other: object) -> bool: + # Identity only depends on class. When derived classes have attributes + # they will need to be included. + return isinstance(other, type(self)) + + def __hash__(self) -> int: + # Identity only depends on class. When derived classes have attributes + # they will need to be included. + return hash(type(self)) + + +class BasenameCacheMapper(AbstractCacheMapper): + """Cache mapper that uses the basename of the remote URL and a fixed number + of directory levels above this. + + The default is zero directory levels, meaning different paths with the same + basename will have the same cached basename. + """ + + def __init__(self, directory_levels: int = 0): + if directory_levels < 0: + raise ValueError( + "BasenameCacheMapper requires zero or positive directory_levels" + ) + self.directory_levels = directory_levels + + # Separator for directories when encoded as strings. + self._separator = "_@_" + + def __call__(self, path: str) -> str: + path = make_path_posix(path) + prefix, *bits = path.rsplit("/", self.directory_levels + 1) + if bits: + return self._separator.join(bits) + else: + return prefix # No separator found, simple filename + + def __eq__(self, other: object) -> bool: + return super().__eq__(other) and self.directory_levels == other.directory_levels + + def __hash__(self) -> int: + return super().__hash__() ^ hash(self.directory_levels) + + +class HashCacheMapper(AbstractCacheMapper): + """Cache mapper that uses a hash of the remote URL.""" + + def __call__(self, path: str) -> str: + return hashlib.sha256(path.encode()).hexdigest() + + +def create_cache_mapper(same_names: bool) -> AbstractCacheMapper: + """Factory method to create cache mapper for backward compatibility with + ``CachingFileSystem`` constructor using ``same_names`` kwarg. + """ + if same_names: + return BasenameCacheMapper() + else: + return HashCacheMapper() diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/cache_metadata.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/cache_metadata.py new file mode 100644 index 0000000000000000000000000000000000000000..bd9b5cdd99d7f4a0a989c0f7d0c70ddcf324816a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/cache_metadata.py @@ -0,0 +1,232 @@ +from __future__ import annotations + +import os +import pickle +import time +from typing import TYPE_CHECKING + +from fsspec.utils import atomic_write + +try: + import ujson as json +except ImportError: + if not TYPE_CHECKING: + import json + +if TYPE_CHECKING: + from typing import Any, Dict, Iterator, Literal + + from typing_extensions import TypeAlias + + from .cached import CachingFileSystem + + Detail: TypeAlias = Dict[str, Any] + + +class CacheMetadata: + """Cache metadata. + + All reading and writing of cache metadata is performed by this class, + accessing the cached files and blocks is not. + + Metadata is stored in a single file per storage directory in JSON format. + For backward compatibility, also reads metadata stored in pickle format + which is converted to JSON when next saved. + """ + + def __init__(self, storage: list[str]): + """ + + Parameters + ---------- + storage: list[str] + Directories containing cached files, must be at least one. Metadata + is stored in the last of these directories by convention. + """ + if not storage: + raise ValueError("CacheMetadata expects at least one storage location") + + self._storage = storage + self.cached_files: list[Detail] = [{}] + + # Private attribute to force saving of metadata in pickle format rather than + # JSON for use in tests to confirm can read both pickle and JSON formats. + self._force_save_pickle = False + + def _load(self, fn: str) -> Detail: + """Low-level function to load metadata from specific file""" + try: + with open(fn, "r") as f: + loaded = json.load(f) + except ValueError: + with open(fn, "rb") as f: + loaded = pickle.load(f) + for c in loaded.values(): + if isinstance(c.get("blocks"), list): + c["blocks"] = set(c["blocks"]) + return loaded + + def _save(self, metadata_to_save: Detail, fn: str) -> None: + """Low-level function to save metadata to specific file""" + if self._force_save_pickle: + with atomic_write(fn) as f: + pickle.dump(metadata_to_save, f) + else: + with atomic_write(fn, mode="w") as f: + json.dump(metadata_to_save, f) + + def _scan_locations( + self, writable_only: bool = False + ) -> Iterator[tuple[str, str, bool]]: + """Yield locations (filenames) where metadata is stored, and whether + writable or not. + + Parameters + ---------- + writable: bool + Set to True to only yield writable locations. + + Returns + ------- + Yields (str, str, bool) + """ + n = len(self._storage) + for i, storage in enumerate(self._storage): + writable = i == n - 1 + if writable_only and not writable: + continue + yield os.path.join(storage, "cache"), storage, writable + + def check_file( + self, path: str, cfs: CachingFileSystem | None + ) -> Literal[False] | tuple[Detail, str]: + """If path is in cache return its details, otherwise return ``False``. + + If the optional CachingFileSystem is specified then it is used to + perform extra checks to reject possible matches, such as if they are + too old. + """ + for (fn, base, _), cache in zip(self._scan_locations(), self.cached_files): + if path not in cache: + continue + detail = cache[path].copy() + + if cfs is not None: + if cfs.check_files and detail["uid"] != cfs.fs.ukey(path): + # Wrong file as determined by hash of file properties + continue + if cfs.expiry and time.time() - detail["time"] > cfs.expiry: + # Cached file has expired + continue + + fn = os.path.join(base, detail["fn"]) + if os.path.exists(fn): + return detail, fn + return False + + def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]: + """Remove expired metadata from the cache. + + Returns names of files corresponding to expired metadata and a boolean + flag indicating whether the writable cache is empty. Caller is + responsible for deleting the expired files. + """ + expired_files = [] + for path, detail in self.cached_files[-1].copy().items(): + if time.time() - detail["time"] > expiry_time: + fn = detail.get("fn", "") + if not fn: + raise RuntimeError( + f"Cache metadata does not contain 'fn' for {path}" + ) + fn = os.path.join(self._storage[-1], fn) + expired_files.append(fn) + self.cached_files[-1].pop(path) + + if self.cached_files[-1]: + cache_path = os.path.join(self._storage[-1], "cache") + self._save(self.cached_files[-1], cache_path) + + writable_cache_empty = not self.cached_files[-1] + return expired_files, writable_cache_empty + + def load(self) -> None: + """Load all metadata from disk and store in ``self.cached_files``""" + cached_files = [] + for fn, _, _ in self._scan_locations(): + if os.path.exists(fn): + # TODO: consolidate blocks here + cached_files.append(self._load(fn)) + else: + cached_files.append({}) + self.cached_files = cached_files or [{}] + + def on_close_cached_file(self, f: Any, path: str) -> None: + """Perform side-effect actions on closing a cached file. + + The actual closing of the file is the responsibility of the caller. + """ + # File must be writeble, so in self.cached_files[-1] + c = self.cached_files[-1][path] + if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size: + c["blocks"] = True + + def pop_file(self, path: str) -> str | None: + """Remove metadata of cached file. + + If path is in the cache, return the filename of the cached file, + otherwise return ``None``. Caller is responsible for deleting the + cached file. + """ + details = self.check_file(path, None) + if not details: + return None + _, fn = details + if fn.startswith(self._storage[-1]): + self.cached_files[-1].pop(path) + self.save() + else: + raise PermissionError( + "Can only delete cached file in last, writable cache location" + ) + return fn + + def save(self) -> None: + """Save metadata to disk""" + for (fn, _, writable), cache in zip(self._scan_locations(), self.cached_files): + if not writable: + continue + + if os.path.exists(fn): + cached_files = self._load(fn) + for k, c in cached_files.items(): + if k in cache: + if c["blocks"] is True or cache[k]["blocks"] is True: + c["blocks"] = True + else: + # self.cached_files[*][*]["blocks"] must continue to + # point to the same set object so that updates + # performed by MMapCache are propagated back to + # self.cached_files. + blocks = cache[k]["blocks"] + blocks.update(c["blocks"]) + c["blocks"] = blocks + c["time"] = max(c["time"], cache[k]["time"]) + c["uid"] = cache[k]["uid"] + + # Files can be added to cache after it was written once + for k, c in cache.items(): + if k not in cached_files: + cached_files[k] = c + else: + cached_files = cache + cache = {k: v.copy() for k, v in cached_files.items()} + for c in cache.values(): + if isinstance(c["blocks"], set): + c["blocks"] = list(c["blocks"]) + self._save(cache, fn) + self.cached_files[-1] = cached_files + + def update_file(self, path: str, detail: Detail) -> None: + """Update metadata for specific file in memory, do not save""" + self.cached_files[-1][path] = detail diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/cached.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/cached.py new file mode 100644 index 0000000000000000000000000000000000000000..8b8b832953a08ee2e64c11fb0f97eca48c7020ee --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/cached.py @@ -0,0 +1,929 @@ +from __future__ import annotations + +import inspect +import logging +import os +import tempfile +import time +import weakref +from shutil import rmtree +from typing import TYPE_CHECKING, Any, Callable, ClassVar + +from fsspec import AbstractFileSystem, filesystem +from fsspec.callbacks import DEFAULT_CALLBACK +from fsspec.compression import compr +from fsspec.core import BaseCache, MMapCache +from fsspec.exceptions import BlocksizeMismatchError +from fsspec.implementations.cache_mapper import create_cache_mapper +from fsspec.implementations.cache_metadata import CacheMetadata +from fsspec.spec import AbstractBufferedFile +from fsspec.transaction import Transaction +from fsspec.utils import infer_compression + +if TYPE_CHECKING: + from fsspec.implementations.cache_mapper import AbstractCacheMapper + +logger = logging.getLogger("fsspec.cached") + + +class WriteCachedTransaction(Transaction): + def complete(self, commit=True): + rpaths = [f.path for f in self.files] + lpaths = [f.fn for f in self.files] + if commit: + self.fs.put(lpaths, rpaths) + self.files.clear() + self.fs._intrans = False + self.fs._transaction = None + self.fs = None # break cycle + + +class CachingFileSystem(AbstractFileSystem): + """Locally caching filesystem, layer over any other FS + + This class implements chunk-wise local storage of remote files, for quick + access after the initial download. The files are stored in a given + directory with hashes of URLs for the filenames. If no directory is given, + a temporary one is used, which should be cleaned up by the OS after the + process ends. The files themselves are sparse (as implemented in + :class:`~fsspec.caching.MMapCache`), so only the data which is accessed + takes up space. + + Restrictions: + + - the block-size must be the same for each access of a given file, unless + all blocks of the file have already been read + - caching can only be applied to file-systems which produce files + derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also + allowed, for testing + """ + + protocol: ClassVar[str | tuple[str, ...]] = ("blockcache", "cached") + + def __init__( + self, + target_protocol=None, + cache_storage="TMP", + cache_check=10, + check_files=False, + expiry_time=604800, + target_options=None, + fs=None, + same_names: bool | None = None, + compression=None, + cache_mapper: AbstractCacheMapper | None = None, + **kwargs, + ): + """ + + Parameters + ---------- + target_protocol: str (optional) + Target filesystem protocol. Provide either this or ``fs``. + cache_storage: str or list(str) + Location to store files. If "TMP", this is a temporary directory, + and will be cleaned up by the OS when this process ends (or later). + If a list, each location will be tried in the order given, but + only the last will be considered writable. + cache_check: int + Number of seconds between reload of cache metadata + check_files: bool + Whether to explicitly see if the UID of the remote file matches + the stored one before using. Warning: some file systems such as + HTTP cannot reliably give a unique hash of the contents of some + path, so be sure to set this option to False. + expiry_time: int + The time in seconds after which a local copy is considered useless. + Set to falsy to prevent expiry. The default is equivalent to one + week. + target_options: dict or None + Passed to the instantiation of the FS, if fs is None. + fs: filesystem instance + The target filesystem to run against. Provide this or ``protocol``. + same_names: bool (optional) + By default, target URLs are hashed using a ``HashCacheMapper`` so + that files from different backends with the same basename do not + conflict. If this argument is ``true``, a ``BasenameCacheMapper`` + is used instead. Other cache mapper options are available by using + the ``cache_mapper`` keyword argument. Only one of this and + ``cache_mapper`` should be specified. + compression: str (optional) + To decompress on download. Can be 'infer' (guess from the URL name), + one of the entries in ``fsspec.compression.compr``, or None for no + decompression. + cache_mapper: AbstractCacheMapper (optional) + The object use to map from original filenames to cached filenames. + Only one of this and ``same_names`` should be specified. + """ + super().__init__(**kwargs) + if fs is None and target_protocol is None: + raise ValueError( + "Please provide filesystem instance(fs) or target_protocol" + ) + if not (fs is None) ^ (target_protocol is None): + raise ValueError( + "Both filesystems (fs) and target_protocol may not be both given." + ) + if cache_storage == "TMP": + tempdir = tempfile.mkdtemp() + storage = [tempdir] + weakref.finalize(self, self._remove_tempdir, tempdir) + else: + if isinstance(cache_storage, str): + storage = [cache_storage] + else: + storage = cache_storage + os.makedirs(storage[-1], exist_ok=True) + self.storage = storage + self.kwargs = target_options or {} + self.cache_check = cache_check + self.check_files = check_files + self.expiry = expiry_time + self.compression = compression + + # Size of cache in bytes. If None then the size is unknown and will be + # recalculated the next time cache_size() is called. On writes to the + # cache this is reset to None. + self._cache_size = None + + if same_names is not None and cache_mapper is not None: + raise ValueError( + "Cannot specify both same_names and cache_mapper in " + "CachingFileSystem.__init__" + ) + if cache_mapper is not None: + self._mapper = cache_mapper + else: + self._mapper = create_cache_mapper( + same_names if same_names is not None else False + ) + + self.target_protocol = ( + target_protocol + if isinstance(target_protocol, str) + else (fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0]) + ) + self._metadata = CacheMetadata(self.storage) + self.load_cache() + self.fs = fs if fs is not None else filesystem(target_protocol, **self.kwargs) + + def _strip_protocol(path): + # acts as a method, since each instance has a difference target + return self.fs._strip_protocol(type(self)._strip_protocol(path)) + + self._strip_protocol: Callable = _strip_protocol + + @staticmethod + def _remove_tempdir(tempdir): + try: + rmtree(tempdir) + except Exception: + pass + + def _mkcache(self): + os.makedirs(self.storage[-1], exist_ok=True) + + def cache_size(self): + """Return size of cache in bytes. + + If more than one cache directory is in use, only the size of the last + one (the writable cache directory) is returned. + """ + if self._cache_size is None: + cache_dir = self.storage[-1] + self._cache_size = filesystem("file").du(cache_dir, withdirs=True) + return self._cache_size + + def load_cache(self): + """Read set of stored blocks from file""" + self._metadata.load() + self._mkcache() + self.last_cache = time.time() + + def save_cache(self): + """Save set of stored blocks from file""" + self._mkcache() + self._metadata.save() + self.last_cache = time.time() + self._cache_size = None + + def _check_cache(self): + """Reload caches if time elapsed or any disappeared""" + self._mkcache() + if not self.cache_check: + # explicitly told not to bother checking + return + timecond = time.time() - self.last_cache > self.cache_check + existcond = all(os.path.exists(storage) for storage in self.storage) + if timecond or not existcond: + self.load_cache() + + def _check_file(self, path): + """Is path in cache and still valid""" + path = self._strip_protocol(path) + self._check_cache() + return self._metadata.check_file(path, self) + + def clear_cache(self): + """Remove all files and metadata from the cache + + In the case of multiple cache locations, this clears only the last one, + which is assumed to be the read/write one. + """ + rmtree(self.storage[-1]) + self.load_cache() + self._cache_size = None + + def clear_expired_cache(self, expiry_time=None): + """Remove all expired files and metadata from the cache + + In the case of multiple cache locations, this clears only the last one, + which is assumed to be the read/write one. + + Parameters + ---------- + expiry_time: int + The time in seconds after which a local copy is considered useless. + If not defined the default is equivalent to the attribute from the + file caching instantiation. + """ + + if not expiry_time: + expiry_time = self.expiry + + self._check_cache() + + expired_files, writable_cache_empty = self._metadata.clear_expired(expiry_time) + for fn in expired_files: + if os.path.exists(fn): + os.remove(fn) + + if writable_cache_empty: + rmtree(self.storage[-1]) + self.load_cache() + + self._cache_size = None + + def pop_from_cache(self, path): + """Remove cached version of given file + + Deletes local copy of the given (remote) path. If it is found in a cache + location which is not the last, it is assumed to be read-only, and + raises PermissionError + """ + path = self._strip_protocol(path) + fn = self._metadata.pop_file(path) + if fn is not None: + os.remove(fn) + self._cache_size = None + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + """Wrap the target _open + + If the whole file exists in the cache, just open it locally and + return that. + + Otherwise, open the file on the target FS, and make it have a mmap + cache pointing to the location which we determine, in our cache. + The ``blocks`` instance is shared, so as the mmap cache instance + updates, so does the entry in our ``cached_files`` attribute. + We monkey-patch this file, so that when it closes, we call + ``close_and_update`` to save the state of the blocks. + """ + path = self._strip_protocol(path) + + path = self.fs._strip_protocol(path) + if "r" not in mode: + return self.fs._open( + path, + mode=mode, + block_size=block_size, + autocommit=autocommit, + cache_options=cache_options, + **kwargs, + ) + detail = self._check_file(path) + if detail: + # file is in cache + detail, fn = detail + hash, blocks = detail["fn"], detail["blocks"] + if blocks is True: + # stored file is complete + logger.debug("Opening local copy of %s", path) + return open(fn, mode) + # TODO: action where partial file exists in read-only cache + logger.debug("Opening partially cached copy of %s", path) + else: + hash = self._mapper(path) + fn = os.path.join(self.storage[-1], hash) + blocks = set() + detail = { + "original": path, + "fn": hash, + "blocks": blocks, + "time": time.time(), + "uid": self.fs.ukey(path), + } + self._metadata.update_file(path, detail) + logger.debug("Creating local sparse file for %s", path) + + # call target filesystems open + self._mkcache() + f = self.fs._open( + path, + mode=mode, + block_size=block_size, + autocommit=autocommit, + cache_options=cache_options, + cache_type="none", + **kwargs, + ) + if self.compression: + comp = ( + infer_compression(path) + if self.compression == "infer" + else self.compression + ) + f = compr[comp](f, mode="rb") + if "blocksize" in detail: + if detail["blocksize"] != f.blocksize: + raise BlocksizeMismatchError( + f"Cached file must be reopened with same block" + f" size as original (old: {detail['blocksize']}," + f" new {f.blocksize})" + ) + else: + detail["blocksize"] = f.blocksize + f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks) + close = f.close + f.close = lambda: self.close_and_update(f, close) + self.save_cache() + return f + + def _parent(self, path): + return self.fs._parent(path) + + def hash_name(self, path: str, *args: Any) -> str: + # Kept for backward compatibility with downstream libraries. + # Ignores extra arguments, previously same_name boolean. + return self._mapper(path) + + def close_and_update(self, f, close): + """Called when a file is closing, so store the set of blocks""" + if f.closed: + return + path = self._strip_protocol(f.path) + self._metadata.on_close_cached_file(f, path) + try: + logger.debug("going to save") + self.save_cache() + logger.debug("saved") + except OSError: + logger.debug("Cache saving failed while closing file") + except NameError: + logger.debug("Cache save failed due to interpreter shutdown") + close() + f.closed = True + + def ls(self, path, detail=True): + return self.fs.ls(path, detail) + + def __getattribute__(self, item): + if item in { + "load_cache", + "_open", + "save_cache", + "close_and_update", + "__init__", + "__getattribute__", + "__reduce__", + "_make_local_details", + "open", + "cat", + "cat_file", + "cat_ranges", + "get", + "read_block", + "tail", + "head", + "info", + "ls", + "exists", + "isfile", + "isdir", + "_check_file", + "_check_cache", + "_mkcache", + "clear_cache", + "clear_expired_cache", + "pop_from_cache", + "local_file", + "_paths_from_path", + "get_mapper", + "open_many", + "commit_many", + "hash_name", + "__hash__", + "__eq__", + "to_json", + "to_dict", + "cache_size", + "pipe_file", + "pipe", + "start_transaction", + "end_transaction", + }: + # all the methods defined in this class. Note `open` here, since + # it calls `_open`, but is actually in superclass + return lambda *args, **kw: getattr(type(self), item).__get__(self)( + *args, **kw + ) + if item in ["__reduce_ex__"]: + raise AttributeError + if item in ["transaction"]: + # property + return type(self).transaction.__get__(self) + if item in ["_cache", "transaction_type"]: + # class attributes + return getattr(type(self), item) + if item == "__class__": + return type(self) + d = object.__getattribute__(self, "__dict__") + fs = d.get("fs", None) # fs is not immediately defined + if item in d: + return d[item] + elif fs is not None: + if item in fs.__dict__: + # attribute of instance + return fs.__dict__[item] + # attributed belonging to the target filesystem + cls = type(fs) + m = getattr(cls, item) + if (inspect.isfunction(m) or inspect.isdatadescriptor(m)) and ( + not hasattr(m, "__self__") or m.__self__ is None + ): + # instance method + return m.__get__(fs, cls) + return m # class method or attribute + else: + # attributes of the superclass, while target is being set up + return super().__getattribute__(item) + + def __eq__(self, other): + """Test for equality.""" + if self is other: + return True + if not isinstance(other, type(self)): + return False + return ( + self.storage == other.storage + and self.kwargs == other.kwargs + and self.cache_check == other.cache_check + and self.check_files == other.check_files + and self.expiry == other.expiry + and self.compression == other.compression + and self._mapper == other._mapper + and self.target_protocol == other.target_protocol + ) + + def __hash__(self): + """Calculate hash.""" + return ( + hash(tuple(self.storage)) + ^ hash(str(self.kwargs)) + ^ hash(self.cache_check) + ^ hash(self.check_files) + ^ hash(self.expiry) + ^ hash(self.compression) + ^ hash(self._mapper) + ^ hash(self.target_protocol) + ) + + +class WholeFileCacheFileSystem(CachingFileSystem): + """Caches whole remote files on first access + + This class is intended as a layer over any other file system, and + will make a local copy of each file accessed, so that all subsequent + reads are local. This is similar to ``CachingFileSystem``, but without + the block-wise functionality and so can work even when sparse files + are not allowed. See its docstring for definition of the init + arguments. + + The class still needs access to the remote store for listing files, + and may refresh cached files. + """ + + protocol = "filecache" + local_file = True + + def open_many(self, open_files, **kwargs): + paths = [of.path for of in open_files] + if "r" in open_files.mode: + self._mkcache() + else: + return [ + LocalTempFile( + self.fs, + path, + mode=open_files.mode, + fn=os.path.join(self.storage[-1], self._mapper(path)), + **kwargs, + ) + for path in paths + ] + + if self.compression: + raise NotImplementedError + details = [self._check_file(sp) for sp in paths] + downpath = [p for p, d in zip(paths, details) if not d] + downfn0 = [ + os.path.join(self.storage[-1], self._mapper(p)) + for p, d in zip(paths, details) + ] # keep these path names for opening later + downfn = [fn for fn, d in zip(downfn0, details) if not d] + if downpath: + # skip if all files are already cached and up to date + self.fs.get(downpath, downfn) + + # update metadata - only happens when downloads are successful + newdetail = [ + { + "original": path, + "fn": self._mapper(path), + "blocks": True, + "time": time.time(), + "uid": self.fs.ukey(path), + } + for path in downpath + ] + for path, detail in zip(downpath, newdetail): + self._metadata.update_file(path, detail) + self.save_cache() + + def firstpart(fn): + # helper to adapt both whole-file and simple-cache + return fn[1] if isinstance(fn, tuple) else fn + + return [ + open(firstpart(fn0) if fn0 else fn1, mode=open_files.mode) + for fn0, fn1 in zip(details, downfn0) + ] + + def commit_many(self, open_files): + self.fs.put([f.fn for f in open_files], [f.path for f in open_files]) + [f.close() for f in open_files] + for f in open_files: + # in case autocommit is off, and so close did not already delete + try: + os.remove(f.name) + except FileNotFoundError: + pass + self._cache_size = None + + def _make_local_details(self, path): + hash = self._mapper(path) + fn = os.path.join(self.storage[-1], hash) + detail = { + "original": path, + "fn": hash, + "blocks": True, + "time": time.time(), + "uid": self.fs.ukey(path), + } + self._metadata.update_file(path, detail) + logger.debug("Copying %s to local cache", path) + return fn + + def cat( + self, + path, + recursive=False, + on_error="raise", + callback=DEFAULT_CALLBACK, + **kwargs, + ): + paths = self.expand_path( + path, recursive=recursive, maxdepth=kwargs.get("maxdepth") + ) + getpaths = [] + storepaths = [] + fns = [] + out = {} + for p in paths.copy(): + try: + detail = self._check_file(p) + if not detail: + fn = self._make_local_details(p) + getpaths.append(p) + storepaths.append(fn) + else: + detail, fn = detail if isinstance(detail, tuple) else (None, detail) + fns.append(fn) + except Exception as e: + if on_error == "raise": + raise + if on_error == "return": + out[p] = e + paths.remove(p) + + if getpaths: + self.fs.get(getpaths, storepaths) + self.save_cache() + + callback.set_size(len(paths)) + for p, fn in zip(paths, fns): + with open(fn, "rb") as f: + out[p] = f.read() + callback.relative_update(1) + if isinstance(path, str) and len(paths) == 1 and recursive is False: + out = out[paths[0]] + return out + + def _open(self, path, mode="rb", **kwargs): + path = self._strip_protocol(path) + if "r" not in mode: + hash = self._mapper(path) + fn = os.path.join(self.storage[-1], hash) + user_specified_kwargs = { + k: v + for k, v in kwargs.items() + # those kwargs were added by open(), we don't want them + if k not in ["autocommit", "block_size", "cache_options"] + } + return LocalTempFile(self, path, mode=mode, fn=fn, **user_specified_kwargs) + detail = self._check_file(path) + if detail: + detail, fn = detail + _, blocks = detail["fn"], detail["blocks"] + if blocks is True: + logger.debug("Opening local copy of %s", path) + + # In order to support downstream filesystems to be able to + # infer the compression from the original filename, like + # the `TarFileSystem`, let's extend the `io.BufferedReader` + # fileobject protocol by adding a dedicated attribute + # `original`. + f = open(fn, mode) + f.original = detail.get("original") + return f + else: + raise ValueError( + f"Attempt to open partially cached file {path}" + f" as a wholly cached file" + ) + else: + fn = self._make_local_details(path) + kwargs["mode"] = mode + + # call target filesystems open + self._mkcache() + if self.compression: + with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2: + if isinstance(f, AbstractBufferedFile): + # want no type of caching if just downloading whole thing + f.cache = BaseCache(0, f.cache.fetcher, f.size) + comp = ( + infer_compression(path) + if self.compression == "infer" + else self.compression + ) + f = compr[comp](f, mode="rb") + data = True + while data: + block = getattr(f, "blocksize", 5 * 2**20) + data = f.read(block) + f2.write(data) + else: + self.fs.get_file(path, fn) + self.save_cache() + return self._open(path, mode) + + +class SimpleCacheFileSystem(WholeFileCacheFileSystem): + """Caches whole remote files on first access + + This class is intended as a layer over any other file system, and + will make a local copy of each file accessed, so that all subsequent + reads are local. This implementation only copies whole files, and + does not keep any metadata about the download time or file details. + It is therefore safer to use in multi-threaded/concurrent situations. + + This is the only of the caching filesystems that supports write: you will + be given a real local open file, and upon close and commit, it will be + uploaded to the target filesystem; the writability or the target URL is + not checked until that time. + + """ + + protocol = "simplecache" + local_file = True + transaction_type = WriteCachedTransaction + + def __init__(self, **kwargs): + kw = kwargs.copy() + for key in ["cache_check", "expiry_time", "check_files"]: + kw[key] = False + super().__init__(**kw) + for storage in self.storage: + if not os.path.exists(storage): + os.makedirs(storage, exist_ok=True) + + def _check_file(self, path): + self._check_cache() + sha = self._mapper(path) + for storage in self.storage: + fn = os.path.join(storage, sha) + if os.path.exists(fn): + return fn + + def save_cache(self): + pass + + def load_cache(self): + pass + + def pipe_file(self, path, value=None, **kwargs): + if self._intrans: + with self.open(path, "wb") as f: + f.write(value) + else: + super().pipe_file(path, value) + + def ls(self, path, detail=True, **kwargs): + path = self._strip_protocol(path) + details = [] + try: + details = self.fs.ls( + path, detail=True, **kwargs + ).copy() # don't edit original! + except FileNotFoundError as e: + ex = e + else: + ex = None + if self._intrans: + path1 = path.rstrip("/") + "/" + for f in self.transaction.files: + if f.path == path: + details.append( + {"name": path, "size": f.size or f.tell(), "type": "file"} + ) + elif f.path.startswith(path1): + if f.path.count("/") == path1.count("/"): + details.append( + {"name": f.path, "size": f.size or f.tell(), "type": "file"} + ) + else: + dname = "/".join(f.path.split("/")[: path1.count("/") + 1]) + details.append({"name": dname, "size": 0, "type": "directory"}) + if ex is not None and not details: + raise ex + if detail: + return details + return sorted(_["name"] for _ in details) + + def info(self, path, **kwargs): + path = self._strip_protocol(path) + if self._intrans: + f = [_ for _ in self.transaction.files if _.path == path] + if f: + size = os.path.getsize(f[0].fn) if f[0].closed else f[0].tell() + return {"name": path, "size": size, "type": "file"} + f = any(_.path.startswith(path + "/") for _ in self.transaction.files) + if f: + return {"name": path, "size": 0, "type": "directory"} + return self.fs.info(path, **kwargs) + + def pipe(self, path, value=None, **kwargs): + if isinstance(path, str): + self.pipe_file(self._strip_protocol(path), value, **kwargs) + elif isinstance(path, dict): + for k, v in path.items(): + self.pipe_file(self._strip_protocol(k), v, **kwargs) + else: + raise ValueError("path must be str or dict") + + def cat_ranges( + self, paths, starts, ends, max_gap=None, on_error="return", **kwargs + ): + lpaths = [self._check_file(p) for p in paths] + rpaths = [p for l, p in zip(lpaths, paths) if l is False] + lpaths = [l for l, p in zip(lpaths, paths) if l is False] + self.fs.get(rpaths, lpaths) + return super().cat_ranges( + paths, starts, ends, max_gap=max_gap, on_error=on_error, **kwargs + ) + + def _open(self, path, mode="rb", **kwargs): + path = self._strip_protocol(path) + sha = self._mapper(path) + + if "r" not in mode: + fn = os.path.join(self.storage[-1], sha) + user_specified_kwargs = { + k: v + for k, v in kwargs.items() + if k not in ["autocommit", "block_size", "cache_options"] + } # those were added by open() + return LocalTempFile( + self, + path, + mode=mode, + autocommit=not self._intrans, + fn=fn, + **user_specified_kwargs, + ) + fn = self._check_file(path) + if fn: + return open(fn, mode) + + fn = os.path.join(self.storage[-1], sha) + logger.debug("Copying %s to local cache", path) + kwargs["mode"] = mode + + self._mkcache() + self._cache_size = None + if self.compression: + with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2: + if isinstance(f, AbstractBufferedFile): + # want no type of caching if just downloading whole thing + f.cache = BaseCache(0, f.cache.fetcher, f.size) + comp = ( + infer_compression(path) + if self.compression == "infer" + else self.compression + ) + f = compr[comp](f, mode="rb") + data = True + while data: + block = getattr(f, "blocksize", 5 * 2**20) + data = f.read(block) + f2.write(data) + else: + self.fs.get_file(path, fn) + return self._open(path, mode) + + +class LocalTempFile: + """A temporary local file, which will be uploaded on commit""" + + def __init__(self, fs, path, fn, mode="wb", autocommit=True, seek=0, **kwargs): + self.fn = fn + self.fh = open(fn, mode) + self.mode = mode + if seek: + self.fh.seek(seek) + self.path = path + self.size = None + self.fs = fs + self.closed = False + self.autocommit = autocommit + self.kwargs = kwargs + + def __reduce__(self): + # always open in r+b to allow continuing writing at a location + return ( + LocalTempFile, + (self.fs, self.path, self.fn, "r+b", self.autocommit, self.tell()), + ) + + def __enter__(self): + return self.fh + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def close(self): + # self.size = self.fh.tell() + if self.closed: + return + self.fh.close() + self.closed = True + if self.autocommit: + self.commit() + + def discard(self): + self.fh.close() + os.remove(self.fn) + + def commit(self): + self.fs.put(self.fn, self.path, **self.kwargs) + # we do not delete local copy - it's still in the cache + + @property + def name(self): + return self.fn + + def __repr__(self) -> str: + return f"LocalTempFile: {self.path}" + + def __getattr__(self, item): + return getattr(self.fh, item) diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/data.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/data.py new file mode 100644 index 0000000000000000000000000000000000000000..519032305bed633f2ba8a6148076433caf81710b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/data.py @@ -0,0 +1,58 @@ +import base64 +import io +from typing import Optional +from urllib.parse import unquote + +from fsspec import AbstractFileSystem + + +class DataFileSystem(AbstractFileSystem): + """A handy decoder for data-URLs + + Example + ------- + >>> with fsspec.open("data:,Hello%2C%20World%21") as f: + ... print(f.read()) + b"Hello, World!" + + See https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs + """ + + protocol = "data" + + def __init__(self, **kwargs): + """No parameters for this filesystem""" + super().__init__(**kwargs) + + def cat_file(self, path, start=None, end=None, **kwargs): + pref, data = path.split(",", 1) + if pref.endswith("base64"): + return base64.b64decode(data)[start:end] + return unquote(data).encode()[start:end] + + def info(self, path, **kwargs): + pref, name = path.split(",", 1) + data = self.cat_file(path) + mime = pref.split(":", 1)[1].split(";", 1)[0] + return {"name": name, "size": len(data), "type": "file", "mimetype": mime} + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + if "r" not in mode: + raise ValueError("Read only filesystem") + return io.BytesIO(self.cat_file(path)) + + @staticmethod + def encode(data: bytes, mime: Optional[str] = None): + """Format the given data into data-URL syntax + + This version always base64 encodes, even when the data is ascii/url-safe. + """ + return f"data:{mime or ''};base64,{base64.b64encode(data).decode()}" diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/ftp.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/ftp.py new file mode 100644 index 0000000000000000000000000000000000000000..4186b6ce3ddff4d74151264bc6a0315b5818ce64 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/ftp.py @@ -0,0 +1,395 @@ +import os +import sys +import uuid +import warnings +from ftplib import FTP, FTP_TLS, Error, error_perm +from typing import Any + +from ..spec import AbstractBufferedFile, AbstractFileSystem +from ..utils import infer_storage_options, isfilelike + + +class FTPFileSystem(AbstractFileSystem): + """A filesystem over classic FTP""" + + root_marker = "/" + cachable = False + protocol = "ftp" + + def __init__( + self, + host, + port=21, + username=None, + password=None, + acct=None, + block_size=None, + tempdir=None, + timeout=30, + encoding="utf-8", + tls=False, + **kwargs, + ): + """ + You can use _get_kwargs_from_urls to get some kwargs from + a reasonable FTP url. + + Authentication will be anonymous if username/password are not + given. + + Parameters + ---------- + host: str + The remote server name/ip to connect to + port: int + Port to connect with + username: str or None + If authenticating, the user's identifier + password: str of None + User's password on the server, if using + acct: str or None + Some servers also need an "account" string for auth + block_size: int or None + If given, the read-ahead or write buffer size. + tempdir: str + Directory on remote to put temporary files when in a transaction + timeout: int + Timeout of the ftp connection in seconds + encoding: str + Encoding to use for directories and filenames in FTP connection + tls: bool + Use FTP-TLS, by default False + """ + super().__init__(**kwargs) + self.host = host + self.port = port + self.tempdir = tempdir or "/tmp" + self.cred = username or "", password or "", acct or "" + self.timeout = timeout + self.encoding = encoding + if block_size is not None: + self.blocksize = block_size + else: + self.blocksize = 2**16 + self.tls = tls + self._connect() + if self.tls: + self.ftp.prot_p() + + def _connect(self): + if self.tls: + ftp_cls = FTP_TLS + else: + ftp_cls = FTP + if sys.version_info >= (3, 9): + self.ftp = ftp_cls(timeout=self.timeout, encoding=self.encoding) + elif self.encoding: + warnings.warn("`encoding` not supported for python<3.9, ignoring") + self.ftp = ftp_cls(timeout=self.timeout) + else: + self.ftp = ftp_cls(timeout=self.timeout) + self.ftp.connect(self.host, self.port) + self.ftp.login(*self.cred) + + @classmethod + def _strip_protocol(cls, path): + return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/") + + @staticmethod + def _get_kwargs_from_urls(urlpath): + out = infer_storage_options(urlpath) + out.pop("path", None) + out.pop("protocol", None) + return out + + def ls(self, path, detail=True, **kwargs): + path = self._strip_protocol(path) + out = [] + if path not in self.dircache: + try: + try: + out = [ + (fn, details) + for (fn, details) in self.ftp.mlsd(path) + if fn not in [".", ".."] + and details["type"] not in ["pdir", "cdir"] + ] + except error_perm: + out = _mlsd2(self.ftp, path) # Not platform independent + for fn, details in out: + details["name"] = "/".join( + ["" if path == "/" else path, fn.lstrip("/")] + ) + if details["type"] == "file": + details["size"] = int(details["size"]) + else: + details["size"] = 0 + if details["type"] == "dir": + details["type"] = "directory" + self.dircache[path] = out + except Error: + try: + info = self.info(path) + if info["type"] == "file": + out = [(path, info)] + except (Error, IndexError) as exc: + raise FileNotFoundError(path) from exc + files = self.dircache.get(path, out) + if not detail: + return sorted([fn for fn, details in files]) + return [details for fn, details in files] + + def info(self, path, **kwargs): + # implement with direct method + path = self._strip_protocol(path) + if path == "/": + # special case, since this dir has no real entry + return {"name": "/", "size": 0, "type": "directory"} + files = self.ls(self._parent(path).lstrip("/"), True) + try: + out = next(f for f in files if f["name"] == path) + except StopIteration as exc: + raise FileNotFoundError(path) from exc + return out + + def get_file(self, rpath, lpath, **kwargs): + if self.isdir(rpath): + if not os.path.exists(lpath): + os.mkdir(lpath) + return + if isfilelike(lpath): + outfile = lpath + else: + outfile = open(lpath, "wb") + + def cb(x): + outfile.write(x) + + self.ftp.retrbinary( + f"RETR {rpath}", + blocksize=self.blocksize, + callback=cb, + ) + if not isfilelike(lpath): + outfile.close() + + def cat_file(self, path, start=None, end=None, **kwargs): + if end is not None: + return super().cat_file(path, start, end, **kwargs) + out = [] + + def cb(x): + out.append(x) + + try: + self.ftp.retrbinary( + f"RETR {path}", + blocksize=self.blocksize, + rest=start, + callback=cb, + ) + except (Error, error_perm) as orig_exc: + raise FileNotFoundError(path) from orig_exc + return b"".join(out) + + def _open( + self, + path, + mode="rb", + block_size=None, + cache_options=None, + autocommit=True, + **kwargs, + ): + path = self._strip_protocol(path) + block_size = block_size or self.blocksize + return FTPFile( + self, + path, + mode=mode, + block_size=block_size, + tempdir=self.tempdir, + autocommit=autocommit, + cache_options=cache_options, + ) + + def _rm(self, path): + path = self._strip_protocol(path) + self.ftp.delete(path) + self.invalidate_cache(self._parent(path)) + + def rm(self, path, recursive=False, maxdepth=None): + paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth) + for p in reversed(paths): + if self.isfile(p): + self.rm_file(p) + else: + self.rmdir(p) + + def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None: + path = self._strip_protocol(path) + parent = self._parent(path) + if parent != self.root_marker and not self.exists(parent) and create_parents: + self.mkdir(parent, create_parents=create_parents) + + self.ftp.mkd(path) + self.invalidate_cache(self._parent(path)) + + def makedirs(self, path: str, exist_ok: bool = False) -> None: + path = self._strip_protocol(path) + if self.exists(path): + # NB: "/" does not "exist" as it has no directory entry + if not exist_ok: + raise FileExistsError(f"{path} exists without `exist_ok`") + # exists_ok=True -> no-op + else: + self.mkdir(path, create_parents=True) + + def rmdir(self, path): + path = self._strip_protocol(path) + self.ftp.rmd(path) + self.invalidate_cache(self._parent(path)) + + def mv(self, path1, path2, **kwargs): + path1 = self._strip_protocol(path1) + path2 = self._strip_protocol(path2) + self.ftp.rename(path1, path2) + self.invalidate_cache(self._parent(path1)) + self.invalidate_cache(self._parent(path2)) + + def __del__(self): + self.ftp.close() + + def invalidate_cache(self, path=None): + if path is None: + self.dircache.clear() + else: + self.dircache.pop(path, None) + super().invalidate_cache(path) + + +class TransferDone(Exception): + """Internal exception to break out of transfer""" + + pass + + +class FTPFile(AbstractBufferedFile): + """Interact with a remote FTP file with read/write buffering""" + + def __init__( + self, + fs, + path, + mode="rb", + block_size="default", + autocommit=True, + cache_type="readahead", + cache_options=None, + **kwargs, + ): + super().__init__( + fs, + path, + mode=mode, + block_size=block_size, + autocommit=autocommit, + cache_type=cache_type, + cache_options=cache_options, + **kwargs, + ) + if not autocommit: + self.target = self.path + self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())]) + + def commit(self): + self.fs.mv(self.path, self.target) + + def discard(self): + self.fs.rm(self.path) + + def _fetch_range(self, start, end): + """Get bytes between given byte limits + + Implemented by raising an exception in the fetch callback when the + number of bytes received reaches the requested amount. + + Will fail if the server does not respect the REST command on + retrieve requests. + """ + out = [] + total = [0] + + def callback(x): + total[0] += len(x) + if total[0] > end - start: + out.append(x[: (end - start) - total[0]]) + if end < self.size: + raise TransferDone + else: + out.append(x) + + if total[0] == end - start and end < self.size: + raise TransferDone + + try: + self.fs.ftp.retrbinary( + f"RETR {self.path}", + blocksize=self.blocksize, + rest=start, + callback=callback, + ) + except TransferDone: + try: + # stop transfer, we got enough bytes for this block + self.fs.ftp.abort() + self.fs.ftp.getmultiline() + except Error: + self.fs._connect() + + return b"".join(out) + + def _upload_chunk(self, final=False): + self.buffer.seek(0) + self.fs.ftp.storbinary( + f"STOR {self.path}", self.buffer, blocksize=self.blocksize, rest=self.offset + ) + return True + + +def _mlsd2(ftp, path="."): + """ + Fall back to using `dir` instead of `mlsd` if not supported. + + This parses a Linux style `ls -l` response to `dir`, but the response may + be platform dependent. + + Parameters + ---------- + ftp: ftplib.FTP + path: str + Expects to be given path, but defaults to ".". + """ + lines = [] + minfo = [] + ftp.dir(path, lines.append) + for line in lines: + split_line = line.split() + if len(split_line) < 9: + continue + this = ( + split_line[-1], + { + "modify": " ".join(split_line[5:8]), + "unix.owner": split_line[2], + "unix.group": split_line[3], + "unix.mode": split_line[0], + "size": split_line[4], + }, + ) + if this[1]["unix.mode"][0] == "d": + this[1]["type"] = "dir" + else: + this[1]["type"] = "file" + minfo.append(this) + return minfo diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/git.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/git.py new file mode 100644 index 0000000000000000000000000000000000000000..7b9d3539a013935e9a5652605cdd1c21cee0f0ee --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/git.py @@ -0,0 +1,115 @@ +import os + +import pygit2 + +from fsspec.spec import AbstractFileSystem + +from .memory import MemoryFile + + +class GitFileSystem(AbstractFileSystem): + """Browse the files of a local git repo at any hash/tag/branch + + (experimental backend) + """ + + root_marker = "" + cachable = True + + def __init__(self, path=None, fo=None, ref=None, **kwargs): + """ + + Parameters + ---------- + path: str (optional) + Local location of the repo (uses current directory if not given). + May be deprecated in favour of ``fo``. When used with a higher + level function such as fsspec.open(), may be of the form + "git://[path-to-repo[:]][ref@]path/to/file" (but the actual + file path should not contain "@" or ":"). + fo: str (optional) + Same as ``path``, but passed as part of a chained URL. This one + takes precedence if both are given. + ref: str (optional) + Reference to work with, could be a hash, tag or branch name. Defaults + to current working tree. Note that ``ls`` and ``open`` also take hash, + so this becomes the default for those operations + kwargs + """ + super().__init__(**kwargs) + self.repo = pygit2.Repository(fo or path or os.getcwd()) + self.ref = ref or "master" + + @classmethod + def _strip_protocol(cls, path): + path = super()._strip_protocol(path).lstrip("/") + if ":" in path: + path = path.split(":", 1)[1] + if "@" in path: + path = path.split("@", 1)[1] + return path.lstrip("/") + + def _path_to_object(self, path, ref): + comm, ref = self.repo.resolve_refish(ref or self.ref) + parts = path.split("/") + tree = comm.tree + for part in parts: + if part and isinstance(tree, pygit2.Tree): + if part not in tree: + raise FileNotFoundError(path) + tree = tree[part] + return tree + + @staticmethod + def _get_kwargs_from_urls(path): + if path.startswith("git://"): + path = path[6:] + out = {} + if ":" in path: + out["path"], path = path.split(":", 1) + if "@" in path: + out["ref"], path = path.split("@", 1) + return out + + @staticmethod + def _object_to_info(obj, path=None): + # obj.name and obj.filemode are None for the root tree! + is_dir = isinstance(obj, pygit2.Tree) + return { + "type": "directory" if is_dir else "file", + "name": ( + "/".join([path, obj.name or ""]).lstrip("/") if path else obj.name + ), + "hex": str(obj.id), + "mode": "100644" if obj.filemode is None else f"{obj.filemode:o}", + "size": 0 if is_dir else obj.size, + } + + def ls(self, path, detail=True, ref=None, **kwargs): + tree = self._path_to_object(self._strip_protocol(path), ref) + return [ + GitFileSystem._object_to_info(obj, path) + if detail + else GitFileSystem._object_to_info(obj, path)["name"] + for obj in (tree if isinstance(tree, pygit2.Tree) else [tree]) + ] + + def info(self, path, ref=None, **kwargs): + tree = self._path_to_object(self._strip_protocol(path), ref) + return GitFileSystem._object_to_info(tree, path) + + def ukey(self, path, ref=None): + return self.info(path, ref=ref)["hex"] + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + ref=None, + **kwargs, + ): + obj = self._path_to_object(path, ref or self.ref) + return MemoryFile(data=obj.data) diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/github.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/github.py new file mode 100644 index 0000000000000000000000000000000000000000..3650b8ebaa4eae3caa75a5290305fefe0a80d30b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/github.py @@ -0,0 +1,239 @@ +import requests + +import fsspec + +from ..spec import AbstractFileSystem +from ..utils import infer_storage_options +from .memory import MemoryFile + +# TODO: add GIST backend, would be very similar + + +class GithubFileSystem(AbstractFileSystem): + """Interface to files in github + + An instance of this class provides the files residing within a remote github + repository. You may specify a point in the repos history, by SHA, branch + or tag (default is current master). + + Given that code files tend to be small, and that github does not support + retrieving partial content, we always fetch whole files. + + When using fsspec.open, allows URIs of the form: + + - "github://path/file", in which case you must specify org, repo and + may specify sha in the extra args + - 'github://org:repo@/precip/catalog.yml', where the org and repo are + part of the URI + - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included + + ``sha`` can be the full or abbreviated hex of the commit you want to fetch + from, or a branch or tag name (so long as it doesn't contain special characters + like "/", "?", which would have to be HTTP-encoded). + + For authorised access, you must provide username and token, which can be made + at https://github.com/settings/tokens + """ + + url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}" + rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}" + protocol = "github" + timeout = (60, 60) # connect, read timeouts + + def __init__( + self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs + ): + super().__init__(**kwargs) + self.org = org + self.repo = repo + if (username is None) ^ (token is None): + raise ValueError("Auth required both username and token") + self.username = username + self.token = token + if timeout is not None: + self.timeout = timeout + if sha is None: + # look up default branch (not necessarily "master") + u = "https://api.github.com/repos/{org}/{repo}" + r = requests.get( + u.format(org=org, repo=repo), timeout=self.timeout, **self.kw + ) + r.raise_for_status() + sha = r.json()["default_branch"] + + self.root = sha + self.ls("") + + @property + def kw(self): + if self.username: + return {"auth": (self.username, self.token)} + return {} + + @classmethod + def repos(cls, org_or_user, is_org=True): + """List repo names for given org or user + + This may become the top level of the FS + + Parameters + ---------- + org_or_user: str + Name of the github org or user to query + is_org: bool (default True) + Whether the name is an organisation (True) or user (False) + + Returns + ------- + List of string + """ + r = requests.get( + f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos", + timeout=cls.timeout, + ) + r.raise_for_status() + return [repo["name"] for repo in r.json()] + + @property + def tags(self): + """Names of tags in the repo""" + r = requests.get( + f"https://api.github.com/repos/{self.org}/{self.repo}/tags", + timeout=self.timeout, + **self.kw, + ) + r.raise_for_status() + return [t["name"] for t in r.json()] + + @property + def branches(self): + """Names of branches in the repo""" + r = requests.get( + f"https://api.github.com/repos/{self.org}/{self.repo}/branches", + timeout=self.timeout, + **self.kw, + ) + r.raise_for_status() + return [t["name"] for t in r.json()] + + @property + def refs(self): + """Named references, tags and branches""" + return {"tags": self.tags, "branches": self.branches} + + def ls(self, path, detail=False, sha=None, _sha=None, **kwargs): + """List files at given path + + Parameters + ---------- + path: str + Location to list, relative to repo root + detail: bool + If True, returns list of dicts, one per file; if False, returns + list of full filenames only + sha: str (optional) + List at the given point in the repo history, branch or tag name or commit + SHA + _sha: str (optional) + List this specific tree object (used internally to descend into trees) + """ + path = self._strip_protocol(path) + if path == "": + _sha = sha or self.root + if _sha is None: + parts = path.rstrip("/").split("/") + so_far = "" + _sha = sha or self.root + for part in parts: + out = self.ls(so_far, True, sha=sha, _sha=_sha) + so_far += "/" + part if so_far else part + out = [o for o in out if o["name"] == so_far] + if not out: + raise FileNotFoundError(path) + out = out[0] + if out["type"] == "file": + if detail: + return [out] + else: + return path + _sha = out["sha"] + if path not in self.dircache or sha not in [self.root, None]: + r = requests.get( + self.url.format(org=self.org, repo=self.repo, sha=_sha), + timeout=self.timeout, + **self.kw, + ) + if r.status_code == 404: + raise FileNotFoundError(path) + r.raise_for_status() + types = {"blob": "file", "tree": "directory"} + out = [ + { + "name": path + "/" + f["path"] if path else f["path"], + "mode": f["mode"], + "type": types[f["type"]], + "size": f.get("size", 0), + "sha": f["sha"], + } + for f in r.json()["tree"] + if f["type"] in types + ] + if sha in [self.root, None]: + self.dircache[path] = out + else: + out = self.dircache[path] + if detail: + return out + else: + return sorted([f["name"] for f in out]) + + def invalidate_cache(self, path=None): + self.dircache.clear() + + @classmethod + def _strip_protocol(cls, path): + opts = infer_storage_options(path) + if "username" not in opts: + return super()._strip_protocol(path) + return opts["path"].lstrip("/") + + @staticmethod + def _get_kwargs_from_urls(path): + opts = infer_storage_options(path) + if "username" not in opts: + return {} + out = {"org": opts["username"], "repo": opts["password"]} + if opts["host"]: + out["sha"] = opts["host"] + return out + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + sha=None, + **kwargs, + ): + if mode != "rb": + raise NotImplementedError + url = self.rurl.format( + org=self.org, repo=self.repo, path=path, sha=sha or self.root + ) + r = requests.get(url, timeout=self.timeout, **self.kw) + if r.status_code == 404: + raise FileNotFoundError(path) + r.raise_for_status() + return MemoryFile(None, None, r.content) + + def cat(self, path, recursive=False, on_error="raise", **kwargs): + paths = self.expand_path(path, recursive=recursive) + urls = [ + self.rurl.format(org=self.org, repo=self.repo, path=u, sha=self.root) + for u, sh in paths + ] + fs = fsspec.filesystem("http") + data = fs.cat(urls, on_error="return") + return {u: v for ((k, v), u) in zip(data.items(), urls)} diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/http.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/http.py new file mode 100644 index 0000000000000000000000000000000000000000..9e5d2020353742e2348cd3d998dbe2650f316c6c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/http.py @@ -0,0 +1,856 @@ +import asyncio +import io +import logging +import re +import weakref +from copy import copy +from urllib.parse import urlparse + +import aiohttp +import yarl + +from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper +from fsspec.callbacks import DEFAULT_CALLBACK +from fsspec.exceptions import FSTimeoutError +from fsspec.spec import AbstractBufferedFile +from fsspec.utils import ( + DEFAULT_BLOCK_SIZE, + glob_translate, + isfilelike, + nullcontext, + tokenize, +) + +from ..caching import AllBytes + +# https://stackoverflow.com/a/15926317/3821154 +ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P[^"']+)""") +ex2 = re.compile(r"""(?Phttp[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""") +logger = logging.getLogger("fsspec.http") + + +async def get_client(**kwargs): + return aiohttp.ClientSession(**kwargs) + + +class HTTPFileSystem(AsyncFileSystem): + """ + Simple File-System for fetching data via HTTP(S) + + ``ls()`` is implemented by loading the parent page and doing a regex + match on the result. If simple_link=True, anything of the form + "http(s)://server.com/stuff?thing=other"; otherwise only links within + HTML href tags will be used. + """ + + sep = "/" + + def __init__( + self, + simple_links=True, + block_size=None, + same_scheme=True, + size_policy=None, + cache_type="bytes", + cache_options=None, + asynchronous=False, + loop=None, + client_kwargs=None, + get_client=get_client, + encoded=False, + **storage_options, + ): + """ + NB: if this is called async, you must await set_client + + Parameters + ---------- + block_size: int + Blocks to read bytes; if 0, will default to raw requests file-like + objects instead of HTTPFile instances + simple_links: bool + If True, will consider both HTML tags and anything that looks + like a URL; if False, will consider only the former. + same_scheme: True + When doing ls/glob, if this is True, only consider paths that have + http/https matching the input URLs. + size_policy: this argument is deprecated + client_kwargs: dict + Passed to aiohttp.ClientSession, see + https://docs.aiohttp.org/en/stable/client_reference.html + For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}`` + get_client: Callable[..., aiohttp.ClientSession] + A callable which takes keyword arguments and constructs + an aiohttp.ClientSession. It's state will be managed by + the HTTPFileSystem class. + storage_options: key-value + Any other parameters passed on to requests + cache_type, cache_options: defaults used in open + """ + super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options) + self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE + self.simple_links = simple_links + self.same_schema = same_scheme + self.cache_type = cache_type + self.cache_options = cache_options + self.client_kwargs = client_kwargs or {} + self.get_client = get_client + self.encoded = encoded + self.kwargs = storage_options + self._session = None + + # Clean caching-related parameters from `storage_options` + # before propagating them as `request_options` through `self.kwargs`. + # TODO: Maybe rename `self.kwargs` to `self.request_options` to make + # it clearer. + request_options = copy(storage_options) + self.use_listings_cache = request_options.pop("use_listings_cache", False) + request_options.pop("listings_expiry_time", None) + request_options.pop("max_paths", None) + request_options.pop("skip_instance_cache", None) + self.kwargs = request_options + + @property + def fsid(self): + return "http" + + def encode_url(self, url): + return yarl.URL(url, encoded=self.encoded) + + @staticmethod + def close_session(loop, session): + if loop is not None and loop.is_running(): + try: + sync(loop, session.close, timeout=0.1) + return + except (TimeoutError, FSTimeoutError, NotImplementedError): + pass + connector = getattr(session, "_connector", None) + if connector is not None: + # close after loop is dead + connector._close() + + async def set_session(self): + if self._session is None: + self._session = await self.get_client(loop=self.loop, **self.client_kwargs) + if not self.asynchronous: + weakref.finalize(self, self.close_session, self.loop, self._session) + return self._session + + @classmethod + def _strip_protocol(cls, path): + """For HTTP, we always want to keep the full URL""" + return path + + @classmethod + def _parent(cls, path): + # override, since _strip_protocol is different for URLs + par = super()._parent(path) + if len(par) > 7: # "http://..." + return par + return "" + + async def _ls_real(self, url, detail=True, **kwargs): + # ignoring URL-encoded arguments + kw = self.kwargs.copy() + kw.update(kwargs) + logger.debug(url) + session = await self.set_session() + async with session.get(self.encode_url(url), **self.kwargs) as r: + self._raise_not_found_for_status(r, url) + try: + text = await r.text() + if self.simple_links: + links = ex2.findall(text) + [u[2] for u in ex.findall(text)] + else: + links = [u[2] for u in ex.findall(text)] + except UnicodeDecodeError: + links = [] # binary, not HTML + out = set() + parts = urlparse(url) + for l in links: + if isinstance(l, tuple): + l = l[1] + if l.startswith("/") and len(l) > 1: + # absolute URL on this server + l = f"{parts.scheme}://{parts.netloc}{l}" + if l.startswith("http"): + if self.same_schema and l.startswith(url.rstrip("/") + "/"): + out.add(l) + elif l.replace("https", "http").startswith( + url.replace("https", "http").rstrip("/") + "/" + ): + # allowed to cross http <-> https + out.add(l) + else: + if l not in ["..", "../"]: + # Ignore FTP-like "parent" + out.add("/".join([url.rstrip("/"), l.lstrip("/")])) + if not out and url.endswith("/"): + out = await self._ls_real(url.rstrip("/"), detail=False) + if detail: + return [ + { + "name": u, + "size": None, + "type": "directory" if u.endswith("/") else "file", + } + for u in out + ] + else: + return sorted(out) + + async def _ls(self, url, detail=True, **kwargs): + if self.use_listings_cache and url in self.dircache: + out = self.dircache[url] + else: + out = await self._ls_real(url, detail=detail, **kwargs) + self.dircache[url] = out + return out + + ls = sync_wrapper(_ls) + + def _raise_not_found_for_status(self, response, url): + """ + Raises FileNotFoundError for 404s, otherwise uses raise_for_status. + """ + if response.status == 404: + raise FileNotFoundError(url) + response.raise_for_status() + + async def _cat_file(self, url, start=None, end=None, **kwargs): + kw = self.kwargs.copy() + kw.update(kwargs) + logger.debug(url) + + if start is not None or end is not None: + if start == end: + return b"" + headers = kw.pop("headers", {}).copy() + + headers["Range"] = await self._process_limits(url, start, end) + kw["headers"] = headers + session = await self.set_session() + async with session.get(self.encode_url(url), **kw) as r: + out = await r.read() + self._raise_not_found_for_status(r, url) + return out + + async def _get_file( + self, rpath, lpath, chunk_size=5 * 2**20, callback=DEFAULT_CALLBACK, **kwargs + ): + kw = self.kwargs.copy() + kw.update(kwargs) + logger.debug(rpath) + session = await self.set_session() + async with session.get(self.encode_url(rpath), **kw) as r: + try: + size = int(r.headers["content-length"]) + except (ValueError, KeyError): + size = None + + callback.set_size(size) + self._raise_not_found_for_status(r, rpath) + if isfilelike(lpath): + outfile = lpath + else: + outfile = open(lpath, "wb") # noqa: ASYNC101, ASYNC230 + + try: + chunk = True + while chunk: + chunk = await r.content.read(chunk_size) + outfile.write(chunk) + callback.relative_update(len(chunk)) + finally: + if not isfilelike(lpath): + outfile.close() + + async def _put_file( + self, + lpath, + rpath, + chunk_size=5 * 2**20, + callback=DEFAULT_CALLBACK, + method="post", + mode="overwrite", + **kwargs, + ): + if mode != "overwrite": + raise NotImplementedError("Exclusive write") + + async def gen_chunks(): + # Support passing arbitrary file-like objects + # and use them instead of streams. + if isinstance(lpath, io.IOBase): + context = nullcontext(lpath) + use_seek = False # might not support seeking + else: + context = open(lpath, "rb") # noqa: ASYNC101, ASYNC230 + use_seek = True + + with context as f: + if use_seek: + callback.set_size(f.seek(0, 2)) + f.seek(0) + else: + callback.set_size(getattr(f, "size", None)) + + chunk = f.read(chunk_size) + while chunk: + yield chunk + callback.relative_update(len(chunk)) + chunk = f.read(chunk_size) + + kw = self.kwargs.copy() + kw.update(kwargs) + session = await self.set_session() + + method = method.lower() + if method not in ("post", "put"): + raise ValueError( + f"method has to be either 'post' or 'put', not: {method!r}" + ) + + meth = getattr(session, method) + async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp: + self._raise_not_found_for_status(resp, rpath) + + async def _exists(self, path, **kwargs): + kw = self.kwargs.copy() + kw.update(kwargs) + try: + logger.debug(path) + session = await self.set_session() + r = await session.get(self.encode_url(path), **kw) + async with r: + return r.status < 400 + except aiohttp.ClientError: + return False + + async def _isfile(self, path, **kwargs): + return await self._exists(path, **kwargs) + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=None, # XXX: This differs from the base class. + cache_type=None, + cache_options=None, + size=None, + **kwargs, + ): + """Make a file-like object + + Parameters + ---------- + path: str + Full URL with protocol + mode: string + must be "rb" + block_size: int or None + Bytes to download in one request; use instance value if None. If + zero, will return a streaming Requests file-like instance. + kwargs: key-value + Any other parameters, passed to requests calls + """ + if mode != "rb": + raise NotImplementedError + block_size = block_size if block_size is not None else self.block_size + kw = self.kwargs.copy() + kw["asynchronous"] = self.asynchronous + kw.update(kwargs) + info = {} + size = size or info.update(self.info(path, **kwargs)) or info["size"] + session = sync(self.loop, self.set_session) + if block_size and size and info.get("partial", True): + return HTTPFile( + self, + path, + session=session, + block_size=block_size, + mode=mode, + size=size, + cache_type=cache_type or self.cache_type, + cache_options=cache_options or self.cache_options, + loop=self.loop, + **kw, + ) + else: + return HTTPStreamFile( + self, + path, + mode=mode, + loop=self.loop, + session=session, + **kw, + ) + + async def open_async(self, path, mode="rb", size=None, **kwargs): + session = await self.set_session() + if size is None: + try: + size = (await self._info(path, **kwargs))["size"] + except FileNotFoundError: + pass + return AsyncStreamFile( + self, + path, + loop=self.loop, + session=session, + size=size, + **kwargs, + ) + + def ukey(self, url): + """Unique identifier; assume HTTP files are static, unchanging""" + return tokenize(url, self.kwargs, self.protocol) + + async def _info(self, url, **kwargs): + """Get info of URL + + Tries to access location via HEAD, and then GET methods, but does + not fetch the data. + + It is possible that the server does not supply any size information, in + which case size will be given as None (and certain operations on the + corresponding file will not work). + """ + info = {} + session = await self.set_session() + + for policy in ["head", "get"]: + try: + info.update( + await _file_info( + self.encode_url(url), + size_policy=policy, + session=session, + **self.kwargs, + **kwargs, + ) + ) + if info.get("size") is not None: + break + except Exception as exc: + if policy == "get": + # If get failed, then raise a FileNotFoundError + raise FileNotFoundError(url) from exc + logger.debug("", exc_info=exc) + + return {"name": url, "size": None, **info, "type": "file"} + + async def _glob(self, path, maxdepth=None, **kwargs): + """ + Find files by glob-matching. + + This implementation is idntical to the one in AbstractFileSystem, + but "?" is not considered as a character for globbing, because it is + so common in URLs, often identifying the "query" part. + """ + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + import re + + ends_with_slash = path.endswith("/") # _strip_protocol strips trailing slash + path = self._strip_protocol(path) + append_slash_to_dirname = ends_with_slash or path.endswith(("/**", "/*")) + idx_star = path.find("*") if path.find("*") >= 0 else len(path) + idx_brace = path.find("[") if path.find("[") >= 0 else len(path) + + min_idx = min(idx_star, idx_brace) + + detail = kwargs.pop("detail", False) + + if not has_magic(path): + if await self._exists(path, **kwargs): + if not detail: + return [path] + else: + return {path: await self._info(path, **kwargs)} + else: + if not detail: + return [] # glob of non-existent returns empty + else: + return {} + elif "/" in path[:min_idx]: + min_idx = path[:min_idx].rindex("/") + root = path[: min_idx + 1] + depth = path[min_idx + 1 :].count("/") + 1 + else: + root = "" + depth = path[min_idx + 1 :].count("/") + 1 + + if "**" in path: + if maxdepth is not None: + idx_double_stars = path.find("**") + depth_double_stars = path[idx_double_stars:].count("/") + 1 + depth = depth - depth_double_stars + maxdepth + else: + depth = None + + allpaths = await self._find( + root, maxdepth=depth, withdirs=True, detail=True, **kwargs + ) + + pattern = glob_translate(path + ("/" if ends_with_slash else "")) + pattern = re.compile(pattern) + + out = { + ( + p.rstrip("/") + if not append_slash_to_dirname + and info["type"] == "directory" + and p.endswith("/") + else p + ): info + for p, info in sorted(allpaths.items()) + if pattern.match(p.rstrip("/")) + } + + if detail: + return out + else: + return list(out) + + async def _isdir(self, path): + # override, since all URLs are (also) files + try: + return bool(await self._ls(path)) + except (FileNotFoundError, ValueError): + return False + + +class HTTPFile(AbstractBufferedFile): + """ + A file-like object pointing to a remote HTTP(S) resource + + Supports only reading, with read-ahead of a predetermined block-size. + + In the case that the server does not supply the filesize, only reading of + the complete file in one go is supported. + + Parameters + ---------- + url: str + Full URL of the remote resource, including the protocol + session: aiohttp.ClientSession or None + All calls will be made within this session, to avoid restarting + connections where the server allows this + block_size: int or None + The amount of read-ahead to do, in bytes. Default is 5MB, or the value + configured for the FileSystem creating this file + size: None or int + If given, this is the size of the file in bytes, and we don't attempt + to call the server to find the value. + kwargs: all other key-values are passed to requests calls. + """ + + def __init__( + self, + fs, + url, + session=None, + block_size=None, + mode="rb", + cache_type="bytes", + cache_options=None, + size=None, + loop=None, + asynchronous=False, + **kwargs, + ): + if mode != "rb": + raise NotImplementedError("File mode not supported") + self.asynchronous = asynchronous + self.loop = loop + self.url = url + self.session = session + self.details = {"name": url, "size": size, "type": "file"} + super().__init__( + fs=fs, + path=url, + mode=mode, + block_size=block_size, + cache_type=cache_type, + cache_options=cache_options, + **kwargs, + ) + + def read(self, length=-1): + """Read bytes from file + + Parameters + ---------- + length: int + Read up to this many bytes. If negative, read all content to end of + file. If the server has not supplied the filesize, attempting to + read only part of the data will raise a ValueError. + """ + if ( + (length < 0 and self.loc == 0) # explicit read all + # but not when the size is known and fits into a block anyways + and not (self.size is not None and self.size <= self.blocksize) + ): + self._fetch_all() + if self.size is None: + if length < 0: + self._fetch_all() + else: + length = min(self.size - self.loc, length) + return super().read(length) + + async def async_fetch_all(self): + """Read whole file in one shot, without caching + + This is only called when position is still at zero, + and read() is called without a byte-count. + """ + logger.debug(f"Fetch all for {self}") + if not isinstance(self.cache, AllBytes): + r = await self.session.get(self.fs.encode_url(self.url), **self.kwargs) + async with r: + r.raise_for_status() + out = await r.read() + self.cache = AllBytes( + size=len(out), fetcher=None, blocksize=None, data=out + ) + self.size = len(out) + + _fetch_all = sync_wrapper(async_fetch_all) + + def _parse_content_range(self, headers): + """Parse the Content-Range header""" + s = headers.get("Content-Range", "") + m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s) + if not m: + return None, None, None + + if m[1] == "*": + start = end = None + else: + start, end = [int(x) for x in m[1].split("-")] + total = None if m[2] == "*" else int(m[2]) + return start, end, total + + async def async_fetch_range(self, start, end): + """Download a block of data + + The expectation is that the server returns only the requested bytes, + with HTTP code 206. If this is not the case, we first check the headers, + and then stream the output - if the data size is bigger than we + requested, an exception is raised. + """ + logger.debug(f"Fetch range for {self}: {start}-{end}") + kwargs = self.kwargs.copy() + headers = kwargs.pop("headers", {}).copy() + headers["Range"] = f"bytes={start}-{end - 1}" + logger.debug(f"{self.url} : {headers['Range']}") + r = await self.session.get( + self.fs.encode_url(self.url), headers=headers, **kwargs + ) + async with r: + if r.status == 416: + # range request outside file + return b"" + r.raise_for_status() + + # If the server has handled the range request, it should reply + # with status 206 (partial content). But we'll guess that a suitable + # Content-Range header or a Content-Length no more than the + # requested range also mean we have got the desired range. + response_is_range = ( + r.status == 206 + or self._parse_content_range(r.headers)[0] == start + or int(r.headers.get("Content-Length", end + 1)) <= end - start + ) + + if response_is_range: + # partial content, as expected + out = await r.read() + elif start > 0: + raise ValueError( + "The HTTP server doesn't appear to support range requests. " + "Only reading this file from the beginning is supported. " + "Open with block_size=0 for a streaming file interface." + ) + else: + # Response is not a range, but we want the start of the file, + # so we can read the required amount anyway. + cl = 0 + out = [] + while True: + chunk = await r.content.read(2**20) + # data size unknown, let's read until we have enough + if chunk: + out.append(chunk) + cl += len(chunk) + if cl > end - start: + break + else: + break + out = b"".join(out)[: end - start] + return out + + _fetch_range = sync_wrapper(async_fetch_range) + + +magic_check = re.compile("([*[])") + + +def has_magic(s): + match = magic_check.search(s) + return match is not None + + +class HTTPStreamFile(AbstractBufferedFile): + def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs): + self.asynchronous = kwargs.pop("asynchronous", False) + self.url = url + self.loop = loop + self.session = session + if mode != "rb": + raise ValueError + self.details = {"name": url, "size": None} + super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs) + + async def cor(): + r = await self.session.get(self.fs.encode_url(url), **kwargs).__aenter__() + self.fs._raise_not_found_for_status(r, url) + return r + + self.r = sync(self.loop, cor) + self.loop = fs.loop + + def seek(self, loc, whence=0): + if loc == 0 and whence == 1: + return + if loc == self.loc and whence == 0: + return + raise ValueError("Cannot seek streaming HTTP file") + + async def _read(self, num=-1): + out = await self.r.content.read(num) + self.loc += len(out) + return out + + read = sync_wrapper(_read) + + async def _close(self): + self.r.close() + + def close(self): + asyncio.run_coroutine_threadsafe(self._close(), self.loop) + super().close() + + +class AsyncStreamFile(AbstractAsyncStreamedFile): + def __init__( + self, fs, url, mode="rb", loop=None, session=None, size=None, **kwargs + ): + self.url = url + self.session = session + self.r = None + if mode != "rb": + raise ValueError + self.details = {"name": url, "size": None} + self.kwargs = kwargs + super().__init__(fs=fs, path=url, mode=mode, cache_type="none") + self.size = size + + async def read(self, num=-1): + if self.r is None: + r = await self.session.get( + self.fs.encode_url(self.url), **self.kwargs + ).__aenter__() + self.fs._raise_not_found_for_status(r, self.url) + self.r = r + out = await self.r.content.read(num) + self.loc += len(out) + return out + + async def close(self): + if self.r is not None: + self.r.close() + self.r = None + await super().close() + + +async def get_range(session, url, start, end, file=None, **kwargs): + # explicit get a range when we know it must be safe + kwargs = kwargs.copy() + headers = kwargs.pop("headers", {}).copy() + headers["Range"] = f"bytes={start}-{end - 1}" + r = await session.get(url, headers=headers, **kwargs) + r.raise_for_status() + async with r: + out = await r.read() + if file: + with open(file, "r+b") as f: # noqa: ASYNC101, ASYNC230 + f.seek(start) + f.write(out) + else: + return out + + +async def _file_info(url, session, size_policy="head", **kwargs): + """Call HEAD on the server to get details about the file (size/checksum etc.) + + Default operation is to explicitly allow redirects and use encoding + 'identity' (no compression) to get the true size of the target. + """ + logger.debug("Retrieve file size for %s", url) + kwargs = kwargs.copy() + ar = kwargs.pop("allow_redirects", True) + head = kwargs.get("headers", {}).copy() + head["Accept-Encoding"] = "identity" + kwargs["headers"] = head + + info = {} + if size_policy == "head": + r = await session.head(url, allow_redirects=ar, **kwargs) + elif size_policy == "get": + r = await session.get(url, allow_redirects=ar, **kwargs) + else: + raise TypeError(f'size_policy must be "head" or "get", got {size_policy}') + async with r: + r.raise_for_status() + + if "Content-Length" in r.headers: + # Some servers may choose to ignore Accept-Encoding and return + # compressed content, in which case the returned size is unreliable. + if "Content-Encoding" not in r.headers or r.headers["Content-Encoding"] in [ + "identity", + "", + ]: + info["size"] = int(r.headers["Content-Length"]) + elif "Content-Range" in r.headers: + info["size"] = int(r.headers["Content-Range"].split("/")[1]) + + if "Content-Type" in r.headers: + info["mimetype"] = r.headers["Content-Type"].partition(";")[0] + + if r.headers.get("Accept-Ranges") == "none": + # Some servers may explicitly discourage partial content requests, but + # the lack of "Accept-Ranges" does not always indicate they would fail + info["partial"] = False + + info["url"] = str(r.url) + + for checksum_field in ["ETag", "Content-MD5", "Digest"]: + if r.headers.get(checksum_field): + info[checksum_field] = r.headers[checksum_field] + + return info + + +async def _file_size(url, session=None, *args, **kwargs): + if session is None: + session = await get_client() + info = await _file_info(url, session=session, *args, **kwargs) + return info.get("size") + + +file_size = sync_wrapper(_file_size) diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/libarchive.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/libarchive.py new file mode 100644 index 0000000000000000000000000000000000000000..eb6f145352e1989e0477e259be02d8d7f4d729e2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/libarchive.py @@ -0,0 +1,213 @@ +from contextlib import contextmanager +from ctypes import ( + CFUNCTYPE, + POINTER, + c_int, + c_longlong, + c_void_p, + cast, + create_string_buffer, +) + +import libarchive +import libarchive.ffi as ffi + +from fsspec import open_files +from fsspec.archive import AbstractArchiveFileSystem +from fsspec.implementations.memory import MemoryFile +from fsspec.utils import DEFAULT_BLOCK_SIZE + +# Libarchive requires seekable files or memory only for certain archive +# types. However, since we read the directory first to cache the contents +# and also allow random access to any file, the file-like object needs +# to be seekable no matter what. + +# Seek call-backs (not provided in the libarchive python wrapper) +SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int) +read_set_seek_callback = ffi.ffi( + "read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int +) +new_api = hasattr(ffi, "NO_OPEN_CB") + + +@contextmanager +def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size): + """Read an archive from a seekable file-like object. + + The `file` object must support the standard `readinto` and 'seek' methods. + """ + buf = create_string_buffer(block_size) + buf_p = cast(buf, c_void_p) + + def read_func(archive_p, context, ptrptr): + # readinto the buffer, returns number of bytes read + length = file.readinto(buf) + # write the address of the buffer into the pointer + ptrptr = cast(ptrptr, POINTER(c_void_p)) + ptrptr[0] = buf_p + # tell libarchive how much data was written into the buffer + return length + + def seek_func(archive_p, context, offset, whence): + file.seek(offset, whence) + # tell libarchvie the current position + return file.tell() + + read_cb = ffi.READ_CALLBACK(read_func) + seek_cb = SEEK_CALLBACK(seek_func) + + if new_api: + open_cb = ffi.NO_OPEN_CB + close_cb = ffi.NO_CLOSE_CB + else: + open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB) + close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB) + + with libarchive.read.new_archive_read(format_name, filter_name) as archive_p: + read_set_seek_callback(archive_p, seek_cb) + ffi.read_open(archive_p, None, open_cb, read_cb, close_cb) + yield libarchive.read.ArchiveRead(archive_p) + + +class LibArchiveFileSystem(AbstractArchiveFileSystem): + """Compressed archives as a file-system (read-only) + + Supports the following formats: + tar, pax , cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar + Microsoft CAB, 7-Zip, WARC + + See the libarchive documentation for further restrictions. + https://www.libarchive.org/ + + Keeps file object open while instance lives. It only works in seekable + file-like objects. In case the filesystem does not support this kind of + file object, it is recommended to cache locally. + + This class is pickleable, but not necessarily thread-safe (depends on the + platform). See libarchive documentation for details. + """ + + root_marker = "" + protocol = "libarchive" + cachable = False + + def __init__( + self, + fo="", + mode="r", + target_protocol=None, + target_options=None, + block_size=DEFAULT_BLOCK_SIZE, + **kwargs, + ): + """ + Parameters + ---------- + fo: str or file-like + Contains ZIP, and must exist. If a str, will fetch file using + :meth:`~fsspec.open_files`, which must return one file exactly. + mode: str + Currently, only 'r' accepted + target_protocol: str (optional) + If ``fo`` is a string, this value can be used to override the + FS protocol inferred from a URL + target_options: dict (optional) + Kwargs passed when instantiating the target FS, if ``fo`` is + a string. + """ + super().__init__(self, **kwargs) + if mode != "r": + raise ValueError("Only read from archive files accepted") + if isinstance(fo, str): + files = open_files(fo, protocol=target_protocol, **(target_options or {})) + if len(files) != 1: + raise ValueError( + f'Path "{fo}" did not resolve to exactly one file: "{files}"' + ) + fo = files[0] + self.of = fo + self.fo = fo.__enter__() # the whole instance is a context + self.block_size = block_size + self.dir_cache = None + + @contextmanager + def _open_archive(self): + self.fo.seek(0) + with custom_reader(self.fo, block_size=self.block_size) as arc: + yield arc + + @classmethod + def _strip_protocol(cls, path): + # file paths are always relative to the archive root + return super()._strip_protocol(path).lstrip("/") + + def _get_dirs(self): + fields = { + "name": "pathname", + "size": "size", + "created": "ctime", + "mode": "mode", + "uid": "uid", + "gid": "gid", + "mtime": "mtime", + } + + if self.dir_cache is not None: + return + + self.dir_cache = {} + list_names = [] + with self._open_archive() as arc: + for entry in arc: + if not entry.isdir and not entry.isfile: + # Skip symbolic links, fifo entries, etc. + continue + self.dir_cache.update( + { + dirname: {"name": dirname, "size": 0, "type": "directory"} + for dirname in self._all_dirnames(set(entry.name)) + } + ) + f = {key: getattr(entry, fields[key]) for key in fields} + f["type"] = "directory" if entry.isdir else "file" + list_names.append(entry.name) + + self.dir_cache[f["name"]] = f + # libarchive does not seem to return an entry for the directories (at least + # not in all formats), so get the directories names from the files names + self.dir_cache.update( + { + dirname: {"name": dirname, "size": 0, "type": "directory"} + for dirname in self._all_dirnames(list_names) + } + ) + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + path = self._strip_protocol(path) + if mode != "rb": + raise NotImplementedError + + data = bytes() + with self._open_archive() as arc: + for entry in arc: + if entry.pathname != path: + continue + + if entry.size == 0: + # empty file, so there are no blocks + break + + for block in entry.get_blocks(entry.size): + data = block + break + else: + raise ValueError + return MemoryFile(fs=self, path=path, data=data) diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/memory.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/memory.py new file mode 100644 index 0000000000000000000000000000000000000000..c1c526b564861b6a7f4fa604304b2db103926b4c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/memory.py @@ -0,0 +1,312 @@ +from __future__ import annotations + +import logging +from datetime import datetime, timezone +from errno import ENOTEMPTY +from io import BytesIO +from pathlib import PurePath, PureWindowsPath +from typing import Any, ClassVar + +from fsspec import AbstractFileSystem +from fsspec.implementations.local import LocalFileSystem +from fsspec.utils import stringify_path + +logger = logging.getLogger("fsspec.memoryfs") + + +class MemoryFileSystem(AbstractFileSystem): + """A filesystem based on a dict of BytesIO objects + + This is a global filesystem so instances of this class all point to the same + in memory filesystem. + """ + + store: ClassVar[dict[str, Any]] = {} # global, do not overwrite! + pseudo_dirs = [""] # global, do not overwrite! + protocol = "memory" + root_marker = "/" + + @classmethod + def _strip_protocol(cls, path): + if isinstance(path, PurePath): + if isinstance(path, PureWindowsPath): + return LocalFileSystem._strip_protocol(path) + else: + path = stringify_path(path) + + if path.startswith("memory://"): + path = path[len("memory://") :] + if "::" in path or "://" in path: + return path.rstrip("/") + path = path.lstrip("/").rstrip("/") + return "/" + path if path else "" + + def ls(self, path, detail=True, **kwargs): + path = self._strip_protocol(path) + if path in self.store: + # there is a key with this exact name + if not detail: + return [path] + return [ + { + "name": path, + "size": self.store[path].size, + "type": "file", + "created": self.store[path].created.timestamp(), + } + ] + paths = set() + starter = path + "/" + out = [] + for p2 in tuple(self.store): + if p2.startswith(starter): + if "/" not in p2[len(starter) :]: + # exact child + out.append( + { + "name": p2, + "size": self.store[p2].size, + "type": "file", + "created": self.store[p2].created.timestamp(), + } + ) + elif len(p2) > len(starter): + # implied child directory + ppath = starter + p2[len(starter) :].split("/", 1)[0] + if ppath not in paths: + out = out or [] + out.append( + { + "name": ppath, + "size": 0, + "type": "directory", + } + ) + paths.add(ppath) + for p2 in self.pseudo_dirs: + if p2.startswith(starter): + if "/" not in p2[len(starter) :]: + # exact child pdir + if p2 not in paths: + out.append({"name": p2, "size": 0, "type": "directory"}) + paths.add(p2) + else: + # directory implied by deeper pdir + ppath = starter + p2[len(starter) :].split("/", 1)[0] + if ppath not in paths: + out.append({"name": ppath, "size": 0, "type": "directory"}) + paths.add(ppath) + if not out: + if path in self.pseudo_dirs: + # empty dir + return [] + raise FileNotFoundError(path) + if detail: + return out + return sorted([f["name"] for f in out]) + + def mkdir(self, path, create_parents=True, **kwargs): + path = self._strip_protocol(path) + if path in self.store or path in self.pseudo_dirs: + raise FileExistsError(path) + if self._parent(path).strip("/") and self.isfile(self._parent(path)): + raise NotADirectoryError(self._parent(path)) + if create_parents and self._parent(path).strip("/"): + try: + self.mkdir(self._parent(path), create_parents, **kwargs) + except FileExistsError: + pass + if path and path not in self.pseudo_dirs: + self.pseudo_dirs.append(path) + + def makedirs(self, path, exist_ok=False): + try: + self.mkdir(path, create_parents=True) + except FileExistsError: + if not exist_ok: + raise + + def pipe_file(self, path, value, mode="overwrite", **kwargs): + """Set the bytes of given file + + Avoids copies of the data if possible + """ + mode = "xb" if mode == "create" else "wb" + self.open(path, mode=mode, data=value) + + def rmdir(self, path): + path = self._strip_protocol(path) + if path == "": + # silently avoid deleting FS root + return + if path in self.pseudo_dirs: + if not self.ls(path): + self.pseudo_dirs.remove(path) + else: + raise OSError(ENOTEMPTY, "Directory not empty", path) + else: + raise FileNotFoundError(path) + + def info(self, path, **kwargs): + logger.debug("info: %s", path) + path = self._strip_protocol(path) + if path in self.pseudo_dirs or any( + p.startswith(path + "/") for p in list(self.store) + self.pseudo_dirs + ): + return { + "name": path, + "size": 0, + "type": "directory", + } + elif path in self.store: + filelike = self.store[path] + return { + "name": path, + "size": filelike.size, + "type": "file", + "created": getattr(filelike, "created", None), + } + else: + raise FileNotFoundError(path) + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + path = self._strip_protocol(path) + if "x" in mode and self.exists(path): + raise FileExistsError + if path in self.pseudo_dirs: + raise IsADirectoryError(path) + parent = path + while len(parent) > 1: + parent = self._parent(parent) + if self.isfile(parent): + raise FileExistsError(parent) + if mode in ["rb", "ab", "r+b"]: + if path in self.store: + f = self.store[path] + if mode == "ab": + # position at the end of file + f.seek(0, 2) + else: + # position at the beginning of file + f.seek(0) + return f + else: + raise FileNotFoundError(path) + elif mode in {"wb", "xb"}: + if mode == "xb" and self.exists(path): + raise FileExistsError + m = MemoryFile(self, path, kwargs.get("data")) + if not self._intrans: + m.commit() + return m + else: + name = self.__class__.__name__ + raise ValueError(f"unsupported file mode for {name}: {mode!r}") + + def cp_file(self, path1, path2, **kwargs): + path1 = self._strip_protocol(path1) + path2 = self._strip_protocol(path2) + if self.isfile(path1): + self.store[path2] = MemoryFile( + self, path2, self.store[path1].getvalue() + ) # implicit copy + elif self.isdir(path1): + if path2 not in self.pseudo_dirs: + self.pseudo_dirs.append(path2) + else: + raise FileNotFoundError(path1) + + def cat_file(self, path, start=None, end=None, **kwargs): + logger.debug("cat: %s", path) + path = self._strip_protocol(path) + try: + return bytes(self.store[path].getbuffer()[start:end]) + except KeyError as e: + raise FileNotFoundError(path) from e + + def _rm(self, path): + path = self._strip_protocol(path) + try: + del self.store[path] + except KeyError as e: + raise FileNotFoundError(path) from e + + def modified(self, path): + path = self._strip_protocol(path) + try: + return self.store[path].modified + except KeyError as e: + raise FileNotFoundError(path) from e + + def created(self, path): + path = self._strip_protocol(path) + try: + return self.store[path].created + except KeyError as e: + raise FileNotFoundError(path) from e + + def isfile(self, path): + path = self._strip_protocol(path) + return path in self.store + + def rm(self, path, recursive=False, maxdepth=None): + if isinstance(path, str): + path = self._strip_protocol(path) + else: + path = [self._strip_protocol(p) for p in path] + paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth) + for p in reversed(paths): + if self.isfile(p): + self.rm_file(p) + # If the expanded path doesn't exist, it is only because the expanded + # path was a directory that does not exist in self.pseudo_dirs. This + # is possible if you directly create files without making the + # directories first. + elif not self.exists(p): + continue + else: + self.rmdir(p) + + +class MemoryFile(BytesIO): + """A BytesIO which can't close and works as a context manager + + Can initialise with data. Each path should only be active once at any moment. + + No need to provide fs, path if auto-committing (default) + """ + + def __init__(self, fs=None, path=None, data=None): + logger.debug("open file %s", path) + self.fs = fs + self.path = path + self.created = datetime.now(tz=timezone.utc) + self.modified = datetime.now(tz=timezone.utc) + if data: + super().__init__(data) + self.seek(0) + + @property + def size(self): + return self.getbuffer().nbytes + + def __enter__(self): + return self + + def close(self): + pass + + def discard(self): + pass + + def commit(self): + self.fs.store[self.path] = self + self.modified = datetime.now(tz=timezone.utc) diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/smb.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/smb.py new file mode 100644 index 0000000000000000000000000000000000000000..db6b3f5c3702de90cf121ccca49f3ca2b580df9f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/smb.py @@ -0,0 +1,416 @@ +""" +This module contains SMBFileSystem class responsible for handling access to +Windows Samba network shares by using package smbprotocol +""" + +import datetime +import re +import uuid +from stat import S_ISDIR, S_ISLNK + +import smbclient +import smbprotocol.exceptions + +from .. import AbstractFileSystem +from ..utils import infer_storage_options + +# ! pylint: disable=bad-continuation + + +class SMBFileSystem(AbstractFileSystem): + """Allow reading and writing to Windows and Samba network shares. + + When using `fsspec.open()` for getting a file-like object the URI + should be specified as this format: + ``smb://workgroup;user:password@server:port/share/folder/file.csv``. + + Example:: + + >>> import fsspec + >>> with fsspec.open( + ... 'smb://myuser:mypassword@myserver.com/' 'share/folder/file.csv' + ... ) as smbfile: + ... df = pd.read_csv(smbfile, sep='|', header=None) + + Note that you need to pass in a valid hostname or IP address for the host + component of the URL. Do not use the Windows/NetBIOS machine name for the + host component. + + The first component of the path in the URL points to the name of the shared + folder. Subsequent path components will point to the directory/folder/file. + + The URL components ``workgroup`` , ``user``, ``password`` and ``port`` may be + optional. + + .. note:: + + For working this source require `smbprotocol`_ to be installed, e.g.:: + + $ pip install smbprotocol + # or + # pip install smbprotocol[kerberos] + + .. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements + + Note: if using this with the ``open`` or ``open_files``, with full URLs, + there is no way to tell if a path is relative, so all paths are assumed + to be absolute. + """ + + protocol = "smb" + + # pylint: disable=too-many-arguments + def __init__( + self, + host, + port=None, + username=None, + password=None, + timeout=60, + encrypt=None, + share_access=None, + register_session_retries=4, + register_session_retry_wait=1, + register_session_retry_factor=10, + auto_mkdir=False, + **kwargs, + ): + """ + You can use _get_kwargs_from_urls to get some kwargs from + a reasonable SMB url. + + Authentication will be anonymous or integrated if username/password are not + given. + + Parameters + ---------- + host: str + The remote server name/ip to connect to + port: int or None + Port to connect with. Usually 445, sometimes 139. + username: str or None + Username to connect with. Required if Kerberos auth is not being used. + password: str or None + User's password on the server, if using username + timeout: int + Connection timeout in seconds + encrypt: bool + Whether to force encryption or not, once this has been set to True + the session cannot be changed back to False. + share_access: str or None + Specifies the default access applied to file open operations + performed with this file system object. + This affects whether other processes can concurrently open a handle + to the same file. + + - None (the default): exclusively locks the file until closed. + - 'r': Allow other handles to be opened with read access. + - 'w': Allow other handles to be opened with write access. + - 'd': Allow other handles to be opened with delete access. + register_session_retries: int + Number of retries to register a session with the server. Retries are not performed + for authentication errors, as they are considered as invalid credentials and not network + issues. If set to negative value, no register attempts will be performed. + register_session_retry_wait: int + Time in seconds to wait between each retry. Number must be non-negative. + register_session_retry_factor: int + Base factor for the wait time between each retry. The wait time + is calculated using exponential function. For factor=1 all wait times + will be equal to `register_session_retry_wait`. For any number of retries, + the last wait time will be equal to `register_session_retry_wait` and for retries>1 + the first wait time will be equal to `register_session_retry_wait / factor`. + Number must be equal to or greater than 1. Optimal factor is 10. + auto_mkdir: bool + Whether, when opening a file, the directory containing it should + be created (if it doesn't already exist). This is assumed by pyarrow + and zarr-python code. + """ + super().__init__(**kwargs) + self.host = host + self.port = port + self.username = username + self.password = password + self.timeout = timeout + self.encrypt = encrypt + self.temppath = kwargs.pop("temppath", "") + self.share_access = share_access + self.register_session_retries = register_session_retries + if register_session_retry_wait < 0: + raise ValueError( + "register_session_retry_wait must be a non-negative integer" + ) + self.register_session_retry_wait = register_session_retry_wait + if register_session_retry_factor < 1: + raise ValueError( + "register_session_retry_factor must be a positive " + "integer equal to or greater than 1" + ) + self.register_session_retry_factor = register_session_retry_factor + self.auto_mkdir = auto_mkdir + self._connect() + + @property + def _port(self): + return 445 if self.port is None else self.port + + def _connect(self): + import time + + if self.register_session_retries <= -1: + return + + retried_errors = [] + + wait_time = self.register_session_retry_wait + n_waits = ( + self.register_session_retries - 1 + ) # -1 = No wait time after the last retry + factor = self.register_session_retry_factor + + # Generate wait times for each retry attempt. + # Wait times are calculated using exponential function. For factor=1 all wait times + # will be equal to `wait`. For any number of retries the last wait time will be + # equal to `wait` and for retries>2 the first wait time will be equal to `wait / factor`. + wait_times = iter( + factor ** (n / n_waits - 1) * wait_time for n in range(0, n_waits + 1) + ) + + for attempt in range(self.register_session_retries + 1): + try: + smbclient.register_session( + self.host, + username=self.username, + password=self.password, + port=self._port, + encrypt=self.encrypt, + connection_timeout=self.timeout, + ) + return + except ( + smbprotocol.exceptions.SMBAuthenticationError, + smbprotocol.exceptions.LogonFailure, + ): + # These exceptions should not be repeated, as they clearly indicate + # that the credentials are invalid and not a network issue. + raise + except ValueError as exc: + if re.findall(r"\[Errno -\d+]", str(exc)): + # This exception is raised by the smbprotocol.transport:Tcp.connect + # and originates from socket.gaierror (OSError). These exceptions might + # be raised due to network instability. We will retry to connect. + retried_errors.append(exc) + else: + # All another ValueError exceptions should be raised, as they are not + # related to network issues. + raise + except Exception as exc: + # Save the exception and retry to connect. This except might be dropped + # in the future, once all exceptions suited for retry are identified. + retried_errors.append(exc) + + if attempt < self.register_session_retries: + time.sleep(next(wait_times)) + + # Raise last exception to inform user about the connection issues. + # Note: Should we use ExceptionGroup to raise all exceptions? + raise retried_errors[-1] + + @classmethod + def _strip_protocol(cls, path): + return infer_storage_options(path)["path"] + + @staticmethod + def _get_kwargs_from_urls(path): + # smb://workgroup;user:password@host:port/share/folder/file.csv + out = infer_storage_options(path) + out.pop("path", None) + out.pop("protocol", None) + return out + + def mkdir(self, path, create_parents=True, **kwargs): + wpath = _as_unc_path(self.host, path) + if create_parents: + smbclient.makedirs(wpath, exist_ok=False, port=self._port, **kwargs) + else: + smbclient.mkdir(wpath, port=self._port, **kwargs) + + def makedirs(self, path, exist_ok=False): + if _share_has_path(path): + wpath = _as_unc_path(self.host, path) + smbclient.makedirs(wpath, exist_ok=exist_ok, port=self._port) + + def rmdir(self, path): + if _share_has_path(path): + wpath = _as_unc_path(self.host, path) + smbclient.rmdir(wpath, port=self._port) + + def info(self, path, **kwargs): + wpath = _as_unc_path(self.host, path) + stats = smbclient.stat(wpath, port=self._port, **kwargs) + if S_ISDIR(stats.st_mode): + stype = "directory" + elif S_ISLNK(stats.st_mode): + stype = "link" + else: + stype = "file" + res = { + "name": path + "/" if stype == "directory" else path, + "size": stats.st_size, + "type": stype, + "uid": stats.st_uid, + "gid": stats.st_gid, + "time": stats.st_atime, + "mtime": stats.st_mtime, + } + return res + + def created(self, path): + """Return the created timestamp of a file as a datetime.datetime""" + wpath = _as_unc_path(self.host, path) + stats = smbclient.stat(wpath, port=self._port) + return datetime.datetime.fromtimestamp(stats.st_ctime, tz=datetime.timezone.utc) + + def modified(self, path): + """Return the modified timestamp of a file as a datetime.datetime""" + wpath = _as_unc_path(self.host, path) + stats = smbclient.stat(wpath, port=self._port) + return datetime.datetime.fromtimestamp(stats.st_mtime, tz=datetime.timezone.utc) + + def ls(self, path, detail=True, **kwargs): + unc = _as_unc_path(self.host, path) + listed = smbclient.listdir(unc, port=self._port, **kwargs) + dirs = ["/".join([path.rstrip("/"), p]) for p in listed] + if detail: + dirs = [self.info(d) for d in dirs] + return dirs + + # pylint: disable=too-many-arguments + def _open( + self, + path, + mode="rb", + block_size=-1, + autocommit=True, + cache_options=None, + **kwargs, + ): + """ + block_size: int or None + If 0, no buffering, 1, line buffering, >1, buffer that many bytes + + Notes + ----- + By specifying 'share_access' in 'kwargs' it is possible to override the + default shared access setting applied in the constructor of this object. + """ + if self.auto_mkdir and "w" in mode: + self.makedirs(self._parent(path), exist_ok=True) + bls = block_size if block_size is not None and block_size >= 0 else -1 + wpath = _as_unc_path(self.host, path) + share_access = kwargs.pop("share_access", self.share_access) + if "w" in mode and autocommit is False: + temp = _as_temp_path(self.host, path, self.temppath) + return SMBFileOpener( + wpath, temp, mode, port=self._port, block_size=bls, **kwargs + ) + return smbclient.open_file( + wpath, + mode, + buffering=bls, + share_access=share_access, + port=self._port, + **kwargs, + ) + + def copy(self, path1, path2, **kwargs): + """Copy within two locations in the same filesystem""" + wpath1 = _as_unc_path(self.host, path1) + wpath2 = _as_unc_path(self.host, path2) + if self.auto_mkdir: + self.makedirs(self._parent(path2), exist_ok=True) + smbclient.copyfile(wpath1, wpath2, port=self._port, **kwargs) + + def _rm(self, path): + if _share_has_path(path): + wpath = _as_unc_path(self.host, path) + stats = smbclient.stat(wpath, port=self._port) + if S_ISDIR(stats.st_mode): + smbclient.rmdir(wpath, port=self._port) + else: + smbclient.remove(wpath, port=self._port) + + def mv(self, path1, path2, recursive=None, maxdepth=None, **kwargs): + wpath1 = _as_unc_path(self.host, path1) + wpath2 = _as_unc_path(self.host, path2) + smbclient.rename(wpath1, wpath2, port=self._port, **kwargs) + + +def _as_unc_path(host, path): + rpath = path.replace("/", "\\") + unc = f"\\\\{host}{rpath}" + return unc + + +def _as_temp_path(host, path, temppath): + share = path.split("/")[1] + temp_file = f"/{share}{temppath}/{uuid.uuid4()}" + unc = _as_unc_path(host, temp_file) + return unc + + +def _share_has_path(path): + parts = path.count("/") + if path.endswith("/"): + return parts > 2 + return parts > 1 + + +class SMBFileOpener: + """writes to remote temporary file, move on commit""" + + def __init__(self, path, temp, mode, port=445, block_size=-1, **kwargs): + self.path = path + self.temp = temp + self.mode = mode + self.block_size = block_size + self.kwargs = kwargs + self.smbfile = None + self._incontext = False + self.port = port + self._open() + + def _open(self): + if self.smbfile is None or self.smbfile.closed: + self.smbfile = smbclient.open_file( + self.temp, + self.mode, + port=self.port, + buffering=self.block_size, + **self.kwargs, + ) + + def commit(self): + """Move temp file to definitive on success.""" + # TODO: use transaction support in SMB protocol + smbclient.replace(self.temp, self.path, port=self.port) + + def discard(self): + """Remove the temp file on failure.""" + smbclient.remove(self.temp, port=self.port) + + def __fspath__(self): + return self.path + + def __iter__(self): + return self.smbfile.__iter__() + + def __getattr__(self, item): + return getattr(self.smbfile, item) + + def __enter__(self): + self._incontext = True + return self.smbfile.__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + self._incontext = False + self.smbfile.__exit__(exc_type, exc_value, traceback) diff --git a/.venv/lib/python3.11/site-packages/fsspec/implementations/zip.py b/.venv/lib/python3.11/site-packages/fsspec/implementations/zip.py new file mode 100644 index 0000000000000000000000000000000000000000..6db3ae27806106a19a366886ab4b183f85c1cb1a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/implementations/zip.py @@ -0,0 +1,177 @@ +import os +import zipfile + +import fsspec +from fsspec.archive import AbstractArchiveFileSystem + + +class ZipFileSystem(AbstractArchiveFileSystem): + """Read/Write contents of ZIP archive as a file-system + + Keeps file object open while instance lives. + + This class is pickleable, but not necessarily thread-safe + """ + + root_marker = "" + protocol = "zip" + cachable = False + + def __init__( + self, + fo="", + mode="r", + target_protocol=None, + target_options=None, + compression=zipfile.ZIP_STORED, + allowZip64=True, + compresslevel=None, + **kwargs, + ): + """ + Parameters + ---------- + fo: str or file-like + Contains ZIP, and must exist. If a str, will fetch file using + :meth:`~fsspec.open_files`, which must return one file exactly. + mode: str + Accept: "r", "w", "a" + target_protocol: str (optional) + If ``fo`` is a string, this value can be used to override the + FS protocol inferred from a URL + target_options: dict (optional) + Kwargs passed when instantiating the target FS, if ``fo`` is + a string. + compression, allowZip64, compresslevel: passed to ZipFile + Only relevant when creating a ZIP + """ + super().__init__(self, **kwargs) + if mode not in set("rwa"): + raise ValueError(f"mode '{mode}' no understood") + self.mode = mode + if isinstance(fo, (str, os.PathLike)): + if mode == "a": + m = "r+b" + else: + m = mode + "b" + fo = fsspec.open( + fo, mode=m, protocol=target_protocol, **(target_options or {}) + ) + self.force_zip_64 = allowZip64 + self.of = fo + self.fo = fo.__enter__() # the whole instance is a context + self.zip = zipfile.ZipFile( + self.fo, + mode=mode, + compression=compression, + allowZip64=allowZip64, + compresslevel=compresslevel, + ) + self.dir_cache = None + + @classmethod + def _strip_protocol(cls, path): + # zip file paths are always relative to the archive root + return super()._strip_protocol(path).lstrip("/") + + def __del__(self): + if hasattr(self, "zip"): + self.close() + del self.zip + + def close(self): + """Commits any write changes to the file. Done on ``del`` too.""" + self.zip.close() + + def _get_dirs(self): + if self.dir_cache is None or self.mode in set("wa"): + # when writing, dir_cache is always in the ZipFile's attributes, + # not read from the file. + files = self.zip.infolist() + self.dir_cache = { + dirname.rstrip("/"): { + "name": dirname.rstrip("/"), + "size": 0, + "type": "directory", + } + for dirname in self._all_dirnames(self.zip.namelist()) + } + for z in files: + f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__} + f.update( + { + "name": z.filename.rstrip("/"), + "size": z.file_size, + "type": ("directory" if z.is_dir() else "file"), + } + ) + self.dir_cache[f["name"]] = f + + def pipe_file(self, path, value, **kwargs): + # override upstream, because we know the exact file size in this case + self.zip.writestr(path, value, **kwargs) + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + path = self._strip_protocol(path) + if "r" in mode and self.mode in set("wa"): + if self.exists(path): + raise OSError("ZipFS can only be open for reading or writing, not both") + raise FileNotFoundError(path) + if "r" in self.mode and "w" in mode: + raise OSError("ZipFS can only be open for reading or writing, not both") + out = self.zip.open(path, mode.strip("b"), force_zip64=self.force_zip_64) + if "r" in mode: + info = self.info(path) + out.size = info["size"] + out.name = info["name"] + return out + + def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs): + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + # Remove the leading slash, as the zip file paths are always + # given without a leading slash + path = path.lstrip("/") + path_parts = list(filter(lambda s: bool(s), path.split("/"))) + + def _matching_starts(file_path): + file_parts = filter(lambda s: bool(s), file_path.split("/")) + return all(a == b for a, b in zip(path_parts, file_parts)) + + self._get_dirs() + + result = {} + # To match posix find, if an exact file name is given, we should + # return only that file + if path in self.dir_cache and self.dir_cache[path]["type"] == "file": + result[path] = self.dir_cache[path] + return result if detail else [path] + + for file_path, file_info in self.dir_cache.items(): + if not (path == "" or _matching_starts(file_path)): + continue + + if file_info["type"] == "directory": + if withdirs: + if file_path not in result: + result[file_path.strip("/")] = file_info + continue + + if file_path not in result: + result[file_path] = file_info if detail else None + + if maxdepth: + path_depth = path.count("/") + result = { + k: v for k, v in result.items() if k.count("/") - path_depth < maxdepth + } + return result if detail else sorted(result) diff --git a/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..793007bdfc68044137b264f308174a6f99ff0ae8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cef4d7a942cb7a8187c52579e04a949ed9277495 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e33bbbcb22c89965e66abbb76e7f937c3702ab6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/mv.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/mv.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d96787fc410572f96066fc584512f18d1e0c937a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/mv.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/open.cpython-311.pyc b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/open.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63ad541a0e5ee7a7b6ce04d2989c406cab9f6044 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/open.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/common.py b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/common.py new file mode 100644 index 0000000000000000000000000000000000000000..22e7c4140404ab2a8928689721419cf05c2760b9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/common.py @@ -0,0 +1,175 @@ +GLOB_EDGE_CASES_TESTS = { + "argnames": ("path", "recursive", "maxdepth", "expected"), + "argvalues": [ + ("fil?1", False, None, ["file1"]), + ("fil?1", True, None, ["file1"]), + ("file[1-2]", False, None, ["file1", "file2"]), + ("file[1-2]", True, None, ["file1", "file2"]), + ("*", False, None, ["file1", "file2"]), + ( + "*", + True, + None, + [ + "file1", + "file2", + "subdir0/subfile1", + "subdir0/subfile2", + "subdir0/nesteddir/nestedfile", + "subdir1/subfile1", + "subdir1/subfile2", + "subdir1/nesteddir/nestedfile", + ], + ), + ("*", True, 1, ["file1", "file2"]), + ( + "*", + True, + 2, + [ + "file1", + "file2", + "subdir0/subfile1", + "subdir0/subfile2", + "subdir1/subfile1", + "subdir1/subfile2", + ], + ), + ("*1", False, None, ["file1"]), + ( + "*1", + True, + None, + [ + "file1", + "subdir1/subfile1", + "subdir1/subfile2", + "subdir1/nesteddir/nestedfile", + ], + ), + ("*1", True, 2, ["file1", "subdir1/subfile1", "subdir1/subfile2"]), + ( + "**", + False, + None, + [ + "file1", + "file2", + "subdir0/subfile1", + "subdir0/subfile2", + "subdir0/nesteddir/nestedfile", + "subdir1/subfile1", + "subdir1/subfile2", + "subdir1/nesteddir/nestedfile", + ], + ), + ( + "**", + True, + None, + [ + "file1", + "file2", + "subdir0/subfile1", + "subdir0/subfile2", + "subdir0/nesteddir/nestedfile", + "subdir1/subfile1", + "subdir1/subfile2", + "subdir1/nesteddir/nestedfile", + ], + ), + ("**", True, 1, ["file1", "file2"]), + ( + "**", + True, + 2, + [ + "file1", + "file2", + "subdir0/subfile1", + "subdir0/subfile2", + "subdir0/nesteddir/nestedfile", + "subdir1/subfile1", + "subdir1/subfile2", + "subdir1/nesteddir/nestedfile", + ], + ), + ( + "**", + False, + 2, + [ + "file1", + "file2", + "subdir0/subfile1", + "subdir0/subfile2", + "subdir1/subfile1", + "subdir1/subfile2", + ], + ), + ("**/*1", False, None, ["file1", "subdir0/subfile1", "subdir1/subfile1"]), + ( + "**/*1", + True, + None, + [ + "file1", + "subdir0/subfile1", + "subdir1/subfile1", + "subdir1/subfile2", + "subdir1/nesteddir/nestedfile", + ], + ), + ("**/*1", True, 1, ["file1"]), + ( + "**/*1", + True, + 2, + ["file1", "subdir0/subfile1", "subdir1/subfile1", "subdir1/subfile2"], + ), + ("**/*1", False, 2, ["file1", "subdir0/subfile1", "subdir1/subfile1"]), + ("**/subdir0", False, None, []), + ("**/subdir0", True, None, ["subfile1", "subfile2", "nesteddir/nestedfile"]), + ("**/subdir0/nested*", False, 2, []), + ("**/subdir0/nested*", True, 2, ["nestedfile"]), + ("subdir[1-2]", False, None, []), + ("subdir[1-2]", True, None, ["subfile1", "subfile2", "nesteddir/nestedfile"]), + ("subdir[1-2]", True, 2, ["subfile1", "subfile2"]), + ("subdir[0-1]", False, None, []), + ( + "subdir[0-1]", + True, + None, + [ + "subdir0/subfile1", + "subdir0/subfile2", + "subdir0/nesteddir/nestedfile", + "subdir1/subfile1", + "subdir1/subfile2", + "subdir1/nesteddir/nestedfile", + ], + ), + ( + "subdir[0-1]/*fil[e]*", + False, + None, + [ + "subdir0/subfile1", + "subdir0/subfile2", + "subdir1/subfile1", + "subdir1/subfile2", + ], + ), + ( + "subdir[0-1]/*fil[e]*", + True, + None, + [ + "subdir0/subfile1", + "subdir0/subfile2", + "subdir1/subfile1", + "subdir1/subfile2", + ], + ), + ], +} diff --git a/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/copy.py b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/copy.py new file mode 100644 index 0000000000000000000000000000000000000000..e39e57e5f7d52bfda8ab5e2398b04cc2303630a0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/copy.py @@ -0,0 +1,557 @@ +from hashlib import md5 +from itertools import product + +import pytest + +from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS + + +class AbstractCopyTests: + def test_copy_file_to_existing_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + fs_target, + supports_empty_directories, + ): + # Copy scenario 1a + source = fs_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + if not supports_empty_directories: + # Force target directory to exist by adding a dummy file + fs.touch(fs_join(target, "dummy")) + assert fs.isdir(target) + + target_file2 = fs_join(target, "file2") + target_subfile1 = fs_join(target, "subfile1") + + # Copy from source directory + fs.cp(fs_join(source, "file2"), target) + assert fs.isfile(target_file2) + + # Copy from sub directory + fs.cp(fs_join(source, "subdir", "subfile1"), target) + assert fs.isfile(target_subfile1) + + # Remove copied files + fs.rm([target_file2, target_subfile1]) + assert not fs.exists(target_file2) + assert not fs.exists(target_subfile1) + + # Repeat with trailing slash on target + fs.cp(fs_join(source, "file2"), target + "/") + assert fs.isdir(target) + assert fs.isfile(target_file2) + + fs.cp(fs_join(source, "subdir", "subfile1"), target + "/") + assert fs.isfile(target_subfile1) + + def test_copy_file_to_new_directory( + self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target + ): + # Copy scenario 1b + source = fs_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + + fs.cp( + fs_join(source, "subdir", "subfile1"), fs_join(target, "newdir/") + ) # Note trailing slash + assert fs.isdir(target) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + + def test_copy_file_to_file_in_existing_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + fs_target, + supports_empty_directories, + ): + # Copy scenario 1c + source = fs_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + if not supports_empty_directories: + # Force target directory to exist by adding a dummy file + fs.touch(fs_join(target, "dummy")) + assert fs.isdir(target) + + fs.cp(fs_join(source, "subdir", "subfile1"), fs_join(target, "newfile")) + assert fs.isfile(fs_join(target, "newfile")) + + def test_copy_file_to_file_in_new_directory( + self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target + ): + # Copy scenario 1d + source = fs_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + + fs.cp( + fs_join(source, "subdir", "subfile1"), fs_join(target, "newdir", "newfile") + ) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "newfile")) + + def test_copy_directory_to_existing_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + fs_target, + supports_empty_directories, + ): + # Copy scenario 1e + source = fs_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + if not supports_empty_directories: + # Force target directory to exist by adding a dummy file + dummy = fs_join(target, "dummy") + fs.touch(dummy) + assert fs.isdir(target) + + for source_slash, target_slash in zip([False, True], [False, True]): + s = fs_join(source, "subdir") + if source_slash: + s += "/" + t = target + "/" if target_slash else target + + # Without recursive does nothing + fs.cp(s, t) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + # With recursive + fs.cp(s, t, recursive=True) + if source_slash: + assert fs.isfile(fs_join(target, "subfile1")) + assert fs.isfile(fs_join(target, "subfile2")) + assert fs.isdir(fs_join(target, "nesteddir")) + assert fs.isfile(fs_join(target, "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm( + [ + fs_join(target, "subfile1"), + fs_join(target, "subfile2"), + fs_join(target, "nesteddir"), + ], + recursive=True, + ) + else: + assert fs.isdir(fs_join(target, "subdir")) + assert fs.isfile(fs_join(target, "subdir", "subfile1")) + assert fs.isfile(fs_join(target, "subdir", "subfile2")) + assert fs.isdir(fs_join(target, "subdir", "nesteddir")) + assert fs.isfile(fs_join(target, "subdir", "nesteddir", "nestedfile")) + + fs.rm(fs_join(target, "subdir"), recursive=True) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + # Limit recursive by maxdepth + fs.cp(s, t, recursive=True, maxdepth=1) + if source_slash: + assert fs.isfile(fs_join(target, "subfile1")) + assert fs.isfile(fs_join(target, "subfile2")) + assert not fs.exists(fs_join(target, "nesteddir")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm( + [ + fs_join(target, "subfile1"), + fs_join(target, "subfile2"), + ], + recursive=True, + ) + else: + assert fs.isdir(fs_join(target, "subdir")) + assert fs.isfile(fs_join(target, "subdir", "subfile1")) + assert fs.isfile(fs_join(target, "subdir", "subfile2")) + assert not fs.exists(fs_join(target, "subdir", "nesteddir")) + + fs.rm(fs_join(target, "subdir"), recursive=True) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + def test_copy_directory_to_new_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + fs_target, + supports_empty_directories, + ): + # Copy scenario 1f + source = fs_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + + for source_slash, target_slash in zip([False, True], [False, True]): + s = fs_join(source, "subdir") + if source_slash: + s += "/" + t = fs_join(target, "newdir") + if target_slash: + t += "/" + + # Without recursive does nothing + fs.cp(s, t) + if supports_empty_directories: + assert fs.ls(target) == [] + else: + with pytest.raises(FileNotFoundError): + fs.ls(target) + + # With recursive + fs.cp(s, t, recursive=True) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + assert fs.isfile(fs_join(target, "newdir", "subfile2")) + assert fs.isdir(fs_join(target, "newdir", "nesteddir")) + assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm(fs_join(target, "newdir"), recursive=True) + assert not fs.exists(fs_join(target, "newdir")) + + # Limit recursive by maxdepth + fs.cp(s, t, recursive=True, maxdepth=1) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + assert fs.isfile(fs_join(target, "newdir", "subfile2")) + assert not fs.exists(fs_join(target, "newdir", "nesteddir")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm(fs_join(target, "newdir"), recursive=True) + assert not fs.exists(fs_join(target, "newdir")) + + def test_copy_glob_to_existing_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + fs_target, + supports_empty_directories, + ): + # Copy scenario 1g + source = fs_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + if not supports_empty_directories: + # Force target directory to exist by adding a dummy file + dummy = fs_join(target, "dummy") + fs.touch(dummy) + assert fs.isdir(target) + + for target_slash in [False, True]: + t = target + "/" if target_slash else target + + # Without recursive + fs.cp(fs_join(source, "subdir", "*"), t) + assert fs.isfile(fs_join(target, "subfile1")) + assert fs.isfile(fs_join(target, "subfile2")) + assert not fs.isdir(fs_join(target, "nesteddir")) + assert not fs.exists(fs_join(target, "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm( + [ + fs_join(target, "subfile1"), + fs_join(target, "subfile2"), + ], + recursive=True, + ) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + # With recursive + for glob, recursive in zip(["*", "**"], [True, False]): + fs.cp(fs_join(source, "subdir", glob), t, recursive=recursive) + assert fs.isfile(fs_join(target, "subfile1")) + assert fs.isfile(fs_join(target, "subfile2")) + assert fs.isdir(fs_join(target, "nesteddir")) + assert fs.isfile(fs_join(target, "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm( + [ + fs_join(target, "subfile1"), + fs_join(target, "subfile2"), + fs_join(target, "nesteddir"), + ], + recursive=True, + ) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + # Limit recursive by maxdepth + fs.cp( + fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1 + ) + assert fs.isfile(fs_join(target, "subfile1")) + assert fs.isfile(fs_join(target, "subfile2")) + assert not fs.exists(fs_join(target, "nesteddir")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm( + [ + fs_join(target, "subfile1"), + fs_join(target, "subfile2"), + ], + recursive=True, + ) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + def test_copy_glob_to_new_directory( + self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target + ): + # Copy scenario 1h + source = fs_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + + for target_slash in [False, True]: + t = fs_join(target, "newdir") + if target_slash: + t += "/" + + # Without recursive + fs.cp(fs_join(source, "subdir", "*"), t) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + assert fs.isfile(fs_join(target, "newdir", "subfile2")) + assert not fs.exists(fs_join(target, "newdir", "nesteddir")) + assert not fs.exists(fs_join(target, "newdir", "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + assert not fs.exists(fs_join(target, "newdir", "subdir")) + + fs.rm(fs_join(target, "newdir"), recursive=True) + assert not fs.exists(fs_join(target, "newdir")) + + # With recursive + for glob, recursive in zip(["*", "**"], [True, False]): + fs.cp(fs_join(source, "subdir", glob), t, recursive=recursive) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + assert fs.isfile(fs_join(target, "newdir", "subfile2")) + assert fs.isdir(fs_join(target, "newdir", "nesteddir")) + assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + assert not fs.exists(fs_join(target, "newdir", "subdir")) + + fs.rm(fs_join(target, "newdir"), recursive=True) + assert not fs.exists(fs_join(target, "newdir")) + + # Limit recursive by maxdepth + fs.cp( + fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1 + ) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + assert fs.isfile(fs_join(target, "newdir", "subfile2")) + assert not fs.exists(fs_join(target, "newdir", "nesteddir")) + assert not fs.exists(fs_join(target, "subdir")) + assert not fs.exists(fs_join(target, "newdir", "subdir")) + + fs.rm(fs_join(target, "newdir"), recursive=True) + assert not fs.exists(fs_join(target, "newdir")) + + @pytest.mark.parametrize( + GLOB_EDGE_CASES_TESTS["argnames"], + GLOB_EDGE_CASES_TESTS["argvalues"], + ) + def test_copy_glob_edge_cases( + self, + path, + recursive, + maxdepth, + expected, + fs, + fs_join, + fs_glob_edge_cases_files, + fs_target, + fs_sanitize_path, + ): + # Copy scenario 1g + source = fs_glob_edge_cases_files + + target = fs_target + + for new_dir, target_slash in product([True, False], [True, False]): + fs.mkdir(target) + + t = fs_join(target, "newdir") if new_dir else target + t = t + "/" if target_slash else t + + fs.copy(fs_join(source, path), t, recursive=recursive, maxdepth=maxdepth) + + output = fs.find(target) + if new_dir: + prefixed_expected = [ + fs_sanitize_path(fs_join(target, "newdir", p)) for p in expected + ] + else: + prefixed_expected = [ + fs_sanitize_path(fs_join(target, p)) for p in expected + ] + assert sorted(output) == sorted(prefixed_expected) + + try: + fs.rm(target, recursive=True) + except FileNotFoundError: + pass + + def test_copy_list_of_files_to_existing_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + fs_target, + supports_empty_directories, + ): + # Copy scenario 2a + source = fs_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + if not supports_empty_directories: + # Force target directory to exist by adding a dummy file + dummy = fs_join(target, "dummy") + fs.touch(dummy) + assert fs.isdir(target) + + source_files = [ + fs_join(source, "file1"), + fs_join(source, "file2"), + fs_join(source, "subdir", "subfile1"), + ] + + for target_slash in [False, True]: + t = target + "/" if target_slash else target + + fs.cp(source_files, t) + assert fs.isfile(fs_join(target, "file1")) + assert fs.isfile(fs_join(target, "file2")) + assert fs.isfile(fs_join(target, "subfile1")) + + fs.rm( + [ + fs_join(target, "file1"), + fs_join(target, "file2"), + fs_join(target, "subfile1"), + ], + recursive=True, + ) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + def test_copy_list_of_files_to_new_directory( + self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target + ): + # Copy scenario 2b + source = fs_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + + source_files = [ + fs_join(source, "file1"), + fs_join(source, "file2"), + fs_join(source, "subdir", "subfile1"), + ] + + fs.cp(source_files, fs_join(target, "newdir") + "/") # Note trailing slash + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "file1")) + assert fs.isfile(fs_join(target, "newdir", "file2")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + + def test_copy_two_files_new_directory( + self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target + ): + # This is a duplicate of test_copy_list_of_files_to_new_directory and + # can eventually be removed. + source = fs_bulk_operations_scenario_0 + + target = fs_target + assert not fs.exists(target) + fs.cp([fs_join(source, "file1"), fs_join(source, "file2")], target) + + assert fs.isdir(target) + assert fs.isfile(fs_join(target, "file1")) + assert fs.isfile(fs_join(target, "file2")) + + def test_copy_directory_without_files_with_same_name_prefix( + self, + fs, + fs_join, + fs_target, + fs_dir_and_file_with_same_name_prefix, + supports_empty_directories, + ): + # Create the test dirs + source = fs_dir_and_file_with_same_name_prefix + target = fs_target + + # Test without glob + fs.cp(fs_join(source, "subdir"), target, recursive=True) + + assert fs.isfile(fs_join(target, "subfile.txt")) + assert not fs.isfile(fs_join(target, "subdir.txt")) + + fs.rm([fs_join(target, "subfile.txt")]) + if supports_empty_directories: + assert fs.ls(target) == [] + else: + assert not fs.exists(target) + + # Test with glob + fs.cp(fs_join(source, "subdir*"), target, recursive=True) + + assert fs.isdir(fs_join(target, "subdir")) + assert fs.isfile(fs_join(target, "subdir", "subfile.txt")) + assert fs.isfile(fs_join(target, "subdir.txt")) + + def test_copy_with_source_and_destination_as_list( + self, fs, fs_target, fs_join, fs_10_files_with_hashed_names + ): + # Create the test dir + source = fs_10_files_with_hashed_names + target = fs_target + + # Create list of files for source and destination + source_files = [] + destination_files = [] + for i in range(10): + hashed_i = md5(str(i).encode("utf-8")).hexdigest() + source_files.append(fs_join(source, f"{hashed_i}.txt")) + destination_files.append(fs_join(target, f"{hashed_i}.txt")) + + # Copy and assert order was kept + fs.copy(path1=source_files, path2=destination_files) + + for i in range(10): + file_content = fs.cat(destination_files[i]).decode("utf-8") + assert file_content == str(i) diff --git a/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/get.py b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/get.py new file mode 100644 index 0000000000000000000000000000000000000000..851ab81ee581e74cac41c64c83ef0af75826d6b0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/get.py @@ -0,0 +1,587 @@ +from hashlib import md5 +from itertools import product + +import pytest + +from fsspec.implementations.local import make_path_posix +from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS + + +class AbstractGetTests: + def test_get_file_to_existing_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1a + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + assert local_fs.isdir(target) + + target_file2 = local_join(target, "file2") + target_subfile1 = local_join(target, "subfile1") + + # Copy from source directory + fs.get(fs_join(source, "file2"), target) + assert local_fs.isfile(target_file2) + + # Copy from sub directory + fs.get(fs_join(source, "subdir", "subfile1"), target) + assert local_fs.isfile(target_subfile1) + + # Remove copied files + local_fs.rm([target_file2, target_subfile1]) + assert not local_fs.exists(target_file2) + assert not local_fs.exists(target_subfile1) + + # Repeat with trailing slash on target + fs.get(fs_join(source, "file2"), target + "/") + assert local_fs.isdir(target) + assert local_fs.isfile(target_file2) + + fs.get(fs_join(source, "subdir", "subfile1"), target + "/") + assert local_fs.isfile(target_subfile1) + + def test_get_file_to_new_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1b + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + fs.get( + fs_join(source, "subdir", "subfile1"), local_join(target, "newdir/") + ) # Note trailing slash + + assert local_fs.isdir(target) + assert local_fs.isdir(local_join(target, "newdir")) + assert local_fs.isfile(local_join(target, "newdir", "subfile1")) + + def test_get_file_to_file_in_existing_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1c + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + fs.get(fs_join(source, "subdir", "subfile1"), local_join(target, "newfile")) + assert local_fs.isfile(local_join(target, "newfile")) + + def test_get_file_to_file_in_new_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1d + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + fs.get( + fs_join(source, "subdir", "subfile1"), + local_join(target, "newdir", "newfile"), + ) + assert local_fs.isdir(local_join(target, "newdir")) + assert local_fs.isfile(local_join(target, "newdir", "newfile")) + + def test_get_directory_to_existing_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1e + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + assert local_fs.isdir(target) + + for source_slash, target_slash in zip([False, True], [False, True]): + s = fs_join(source, "subdir") + if source_slash: + s += "/" + t = target + "/" if target_slash else target + + # Without recursive does nothing + fs.get(s, t) + assert local_fs.ls(target) == [] + + # With recursive + fs.get(s, t, recursive=True) + if source_slash: + assert local_fs.isfile(local_join(target, "subfile1")) + assert local_fs.isfile(local_join(target, "subfile2")) + assert local_fs.isdir(local_join(target, "nesteddir")) + assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile")) + assert not local_fs.exists(local_join(target, "subdir")) + + local_fs.rm( + [ + local_join(target, "subfile1"), + local_join(target, "subfile2"), + local_join(target, "nesteddir"), + ], + recursive=True, + ) + else: + assert local_fs.isdir(local_join(target, "subdir")) + assert local_fs.isfile(local_join(target, "subdir", "subfile1")) + assert local_fs.isfile(local_join(target, "subdir", "subfile2")) + assert local_fs.isdir(local_join(target, "subdir", "nesteddir")) + assert local_fs.isfile( + local_join(target, "subdir", "nesteddir", "nestedfile") + ) + + local_fs.rm(local_join(target, "subdir"), recursive=True) + assert local_fs.ls(target) == [] + + # Limit recursive by maxdepth + fs.get(s, t, recursive=True, maxdepth=1) + if source_slash: + assert local_fs.isfile(local_join(target, "subfile1")) + assert local_fs.isfile(local_join(target, "subfile2")) + assert not local_fs.exists(local_join(target, "nesteddir")) + assert not local_fs.exists(local_join(target, "subdir")) + + local_fs.rm( + [ + local_join(target, "subfile1"), + local_join(target, "subfile2"), + ], + recursive=True, + ) + else: + assert local_fs.isdir(local_join(target, "subdir")) + assert local_fs.isfile(local_join(target, "subdir", "subfile1")) + assert local_fs.isfile(local_join(target, "subdir", "subfile2")) + assert not local_fs.exists(local_join(target, "subdir", "nesteddir")) + + local_fs.rm(local_join(target, "subdir"), recursive=True) + assert local_fs.ls(target) == [] + + def test_get_directory_to_new_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1f + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + for source_slash, target_slash in zip([False, True], [False, True]): + s = fs_join(source, "subdir") + if source_slash: + s += "/" + t = local_join(target, "newdir") + if target_slash: + t += "/" + + # Without recursive does nothing + fs.get(s, t) + assert local_fs.ls(target) == [] + + # With recursive + fs.get(s, t, recursive=True) + assert local_fs.isdir(local_join(target, "newdir")) + assert local_fs.isfile(local_join(target, "newdir", "subfile1")) + assert local_fs.isfile(local_join(target, "newdir", "subfile2")) + assert local_fs.isdir(local_join(target, "newdir", "nesteddir")) + assert local_fs.isfile( + local_join(target, "newdir", "nesteddir", "nestedfile") + ) + assert not local_fs.exists(local_join(target, "subdir")) + + local_fs.rm(local_join(target, "newdir"), recursive=True) + assert local_fs.ls(target) == [] + + # Limit recursive by maxdepth + fs.get(s, t, recursive=True, maxdepth=1) + assert local_fs.isdir(local_join(target, "newdir")) + assert local_fs.isfile(local_join(target, "newdir", "subfile1")) + assert local_fs.isfile(local_join(target, "newdir", "subfile2")) + assert not local_fs.exists(local_join(target, "newdir", "nesteddir")) + assert not local_fs.exists(local_join(target, "subdir")) + + local_fs.rm(local_join(target, "newdir"), recursive=True) + assert not local_fs.exists(local_join(target, "newdir")) + + def test_get_glob_to_existing_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1g + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + for target_slash in [False, True]: + t = target + "/" if target_slash else target + + # Without recursive + fs.get(fs_join(source, "subdir", "*"), t) + assert local_fs.isfile(local_join(target, "subfile1")) + assert local_fs.isfile(local_join(target, "subfile2")) + assert not local_fs.isdir(local_join(target, "nesteddir")) + assert not local_fs.exists(local_join(target, "nesteddir", "nestedfile")) + assert not local_fs.exists(local_join(target, "subdir")) + + local_fs.rm( + [ + local_join(target, "subfile1"), + local_join(target, "subfile2"), + ], + recursive=True, + ) + assert local_fs.ls(target) == [] + + # With recursive + for glob, recursive in zip(["*", "**"], [True, False]): + fs.get(fs_join(source, "subdir", glob), t, recursive=recursive) + assert local_fs.isfile(local_join(target, "subfile1")) + assert local_fs.isfile(local_join(target, "subfile2")) + assert local_fs.isdir(local_join(target, "nesteddir")) + assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile")) + assert not local_fs.exists(local_join(target, "subdir")) + + local_fs.rm( + [ + local_join(target, "subfile1"), + local_join(target, "subfile2"), + local_join(target, "nesteddir"), + ], + recursive=True, + ) + assert local_fs.ls(target) == [] + + # Limit recursive by maxdepth + fs.get( + fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1 + ) + assert local_fs.isfile(local_join(target, "subfile1")) + assert local_fs.isfile(local_join(target, "subfile2")) + assert not local_fs.exists(local_join(target, "nesteddir")) + assert not local_fs.exists(local_join(target, "subdir")) + + local_fs.rm( + [ + local_join(target, "subfile1"), + local_join(target, "subfile2"), + ], + recursive=True, + ) + assert local_fs.ls(target) == [] + + def test_get_glob_to_new_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1h + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + for target_slash in [False, True]: + t = fs_join(target, "newdir") + if target_slash: + t += "/" + + # Without recursive + fs.get(fs_join(source, "subdir", "*"), t) + assert local_fs.isdir(local_join(target, "newdir")) + assert local_fs.isfile(local_join(target, "newdir", "subfile1")) + assert local_fs.isfile(local_join(target, "newdir", "subfile2")) + assert not local_fs.exists(local_join(target, "newdir", "nesteddir")) + assert not local_fs.exists( + local_join(target, "newdir", "nesteddir", "nestedfile") + ) + assert not local_fs.exists(local_join(target, "subdir")) + assert not local_fs.exists(local_join(target, "newdir", "subdir")) + + local_fs.rm(local_join(target, "newdir"), recursive=True) + assert local_fs.ls(target) == [] + + # With recursive + for glob, recursive in zip(["*", "**"], [True, False]): + fs.get(fs_join(source, "subdir", glob), t, recursive=recursive) + assert local_fs.isdir(local_join(target, "newdir")) + assert local_fs.isfile(local_join(target, "newdir", "subfile1")) + assert local_fs.isfile(local_join(target, "newdir", "subfile2")) + assert local_fs.isdir(local_join(target, "newdir", "nesteddir")) + assert local_fs.isfile( + local_join(target, "newdir", "nesteddir", "nestedfile") + ) + assert not local_fs.exists(local_join(target, "subdir")) + assert not local_fs.exists(local_join(target, "newdir", "subdir")) + + local_fs.rm(local_join(target, "newdir"), recursive=True) + assert not local_fs.exists(local_join(target, "newdir")) + + # Limit recursive by maxdepth + fs.get( + fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1 + ) + assert local_fs.isdir(local_join(target, "newdir")) + assert local_fs.isfile(local_join(target, "newdir", "subfile1")) + assert local_fs.isfile(local_join(target, "newdir", "subfile2")) + assert not local_fs.exists(local_join(target, "newdir", "nesteddir")) + assert not local_fs.exists(local_join(target, "subdir")) + assert not local_fs.exists(local_join(target, "newdir", "subdir")) + + local_fs.rm(local_fs.ls(target, detail=False), recursive=True) + assert not local_fs.exists(local_join(target, "newdir")) + + @pytest.mark.parametrize( + GLOB_EDGE_CASES_TESTS["argnames"], + GLOB_EDGE_CASES_TESTS["argvalues"], + ) + def test_get_glob_edge_cases( + self, + path, + recursive, + maxdepth, + expected, + fs, + fs_join, + fs_glob_edge_cases_files, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1g + source = fs_glob_edge_cases_files + + target = local_target + + for new_dir, target_slash in product([True, False], [True, False]): + local_fs.mkdir(target) + + t = local_join(target, "newdir") if new_dir else target + t = t + "/" if target_slash else t + + fs.get(fs_join(source, path), t, recursive=recursive, maxdepth=maxdepth) + + output = local_fs.find(target) + if new_dir: + prefixed_expected = [ + make_path_posix(local_join(target, "newdir", p)) for p in expected + ] + else: + prefixed_expected = [ + make_path_posix(local_join(target, p)) for p in expected + ] + assert sorted(output) == sorted(prefixed_expected) + + try: + local_fs.rm(target, recursive=True) + except FileNotFoundError: + pass + + def test_get_list_of_files_to_existing_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 2a + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + source_files = [ + fs_join(source, "file1"), + fs_join(source, "file2"), + fs_join(source, "subdir", "subfile1"), + ] + + for target_slash in [False, True]: + t = target + "/" if target_slash else target + + fs.get(source_files, t) + assert local_fs.isfile(local_join(target, "file1")) + assert local_fs.isfile(local_join(target, "file2")) + assert local_fs.isfile(local_join(target, "subfile1")) + + local_fs.rm( + [ + local_join(target, "file1"), + local_join(target, "file2"), + local_join(target, "subfile1"), + ], + recursive=True, + ) + assert local_fs.ls(target) == [] + + def test_get_list_of_files_to_new_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 2b + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + source_files = [ + fs_join(source, "file1"), + fs_join(source, "file2"), + fs_join(source, "subdir", "subfile1"), + ] + + fs.get(source_files, local_join(target, "newdir") + "/") # Note trailing slash + assert local_fs.isdir(local_join(target, "newdir")) + assert local_fs.isfile(local_join(target, "newdir", "file1")) + assert local_fs.isfile(local_join(target, "newdir", "file2")) + assert local_fs.isfile(local_join(target, "newdir", "subfile1")) + + def test_get_directory_recursive( + self, fs, fs_join, fs_path, local_fs, local_join, local_target + ): + # https://github.com/fsspec/filesystem_spec/issues/1062 + # Recursive cp/get/put of source directory into non-existent target directory. + src = fs_join(fs_path, "src") + src_file = fs_join(src, "file") + fs.mkdir(src) + fs.touch(src_file) + + target = local_target + + # get without slash + assert not local_fs.exists(target) + for loop in range(2): + fs.get(src, target, recursive=True) + assert local_fs.isdir(target) + + if loop == 0: + assert local_fs.isfile(local_join(target, "file")) + assert not local_fs.exists(local_join(target, "src")) + else: + assert local_fs.isfile(local_join(target, "file")) + assert local_fs.isdir(local_join(target, "src")) + assert local_fs.isfile(local_join(target, "src", "file")) + + local_fs.rm(target, recursive=True) + + # get with slash + assert not local_fs.exists(target) + for loop in range(2): + fs.get(src + "/", target, recursive=True) + assert local_fs.isdir(target) + assert local_fs.isfile(local_join(target, "file")) + assert not local_fs.exists(local_join(target, "src")) + + def test_get_directory_without_files_with_same_name_prefix( + self, + fs, + fs_join, + local_fs, + local_join, + local_target, + fs_dir_and_file_with_same_name_prefix, + ): + # Create the test dirs + source = fs_dir_and_file_with_same_name_prefix + target = local_target + + # Test without glob + fs.get(fs_join(source, "subdir"), target, recursive=True) + + assert local_fs.isfile(local_join(target, "subfile.txt")) + assert not local_fs.isfile(local_join(target, "subdir.txt")) + + local_fs.rm([local_join(target, "subfile.txt")]) + assert local_fs.ls(target) == [] + + # Test with glob + fs.get(fs_join(source, "subdir*"), target, recursive=True) + + assert local_fs.isdir(local_join(target, "subdir")) + assert local_fs.isfile(local_join(target, "subdir", "subfile.txt")) + assert local_fs.isfile(local_join(target, "subdir.txt")) + + def test_get_with_source_and_destination_as_list( + self, + fs, + fs_join, + local_fs, + local_join, + local_target, + fs_10_files_with_hashed_names, + ): + # Create the test dir + source = fs_10_files_with_hashed_names + target = local_target + + # Create list of files for source and destination + source_files = [] + destination_files = [] + for i in range(10): + hashed_i = md5(str(i).encode("utf-8")).hexdigest() + source_files.append(fs_join(source, f"{hashed_i}.txt")) + destination_files.append( + make_path_posix(local_join(target, f"{hashed_i}.txt")) + ) + + # Copy and assert order was kept + fs.get(rpath=source_files, lpath=destination_files) + + for i in range(10): + file_content = local_fs.cat(destination_files[i]).decode("utf-8") + assert file_content == str(i) diff --git a/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/mv.py b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/mv.py new file mode 100644 index 0000000000000000000000000000000000000000..39f6caa3de815e024fa84de2acecc986c823ed29 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/mv.py @@ -0,0 +1,57 @@ +import os + +import pytest + +import fsspec + + +def test_move_raises_error_with_tmpdir(tmpdir): + # Create a file in the temporary directory + source = tmpdir.join("source_file.txt") + source.write("content") + + # Define a destination that simulates a protected or invalid path + destination = tmpdir.join("non_existent_directory/destination_file.txt") + + # Instantiate the filesystem (assuming the local file system interface) + fs = fsspec.filesystem("file") + + # Use the actual file paths as string + with pytest.raises(FileNotFoundError): + fs.mv(str(source), str(destination)) + + +@pytest.mark.parametrize("recursive", (True, False)) +def test_move_raises_error_with_tmpdir_permission(recursive, tmpdir): + # Create a file in the temporary directory + source = tmpdir.join("source_file.txt") + source.write("content") + + # Create a protected directory (non-writable) + protected_dir = tmpdir.mkdir("protected_directory") + protected_path = str(protected_dir) + + # Set the directory to read-only + if os.name == "nt": + os.system(f'icacls "{protected_path}" /deny Everyone:(W)') + else: + os.chmod(protected_path, 0o555) # Sets the directory to read-only + + # Define a destination inside the protected directory + destination = protected_dir.join("destination_file.txt") + + # Instantiate the filesystem (assuming the local file system interface) + fs = fsspec.filesystem("file") + + # Try to move the file to the read-only directory, expecting a permission error + with pytest.raises(PermissionError): + fs.mv(str(source), str(destination), recursive=recursive) + + # Assert the file was not created in the destination + assert not os.path.exists(destination) + + # Cleanup: Restore permissions so the directory can be cleaned up + if os.name == "nt": + os.system(f'icacls "{protected_path}" /remove:d Everyone') + else: + os.chmod(protected_path, 0o755) # Restore write permission for cleanup diff --git a/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/pipe.py b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/pipe.py new file mode 100644 index 0000000000000000000000000000000000000000..8ecca96e9d23ff268a253c48269d5cca451ea270 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/pipe.py @@ -0,0 +1,11 @@ +import pytest + + +class AbstractPipeTests: + def test_pipe_exclusive(self, fs, fs_target): + fs.pipe_file(fs_target, b"data") + assert fs.cat_file(fs_target) == b"data" + with pytest.raises(FileExistsError): + fs.pipe_file(fs_target, b"data", mode="create") + fs.pipe_file(fs_target, b"new data", mode="overwrite") + assert fs.cat_file(fs_target) == b"new data" diff --git a/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/put.py b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/put.py new file mode 100644 index 0000000000000000000000000000000000000000..9fc349977f0384d9fc86126498be5c6ad99a21d3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/put.py @@ -0,0 +1,591 @@ +from hashlib import md5 +from itertools import product + +import pytest + +from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS + + +class AbstractPutTests: + def test_put_file_to_existing_directory( + self, + fs, + fs_join, + fs_target, + local_join, + local_bulk_operations_scenario_0, + supports_empty_directories, + ): + # Copy scenario 1a + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + if not supports_empty_directories: + # Force target directory to exist by adding a dummy file + fs.touch(fs_join(target, "dummy")) + assert fs.isdir(target) + + target_file2 = fs_join(target, "file2") + target_subfile1 = fs_join(target, "subfile1") + + # Copy from source directory + fs.put(local_join(source, "file2"), target) + assert fs.isfile(target_file2) + + # Copy from sub directory + fs.put(local_join(source, "subdir", "subfile1"), target) + assert fs.isfile(target_subfile1) + + # Remove copied files + fs.rm([target_file2, target_subfile1]) + assert not fs.exists(target_file2) + assert not fs.exists(target_subfile1) + + # Repeat with trailing slash on target + fs.put(local_join(source, "file2"), target + "/") + assert fs.isdir(target) + assert fs.isfile(target_file2) + + fs.put(local_join(source, "subdir", "subfile1"), target + "/") + assert fs.isfile(target_subfile1) + + def test_put_file_to_new_directory( + self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0 + ): + # Copy scenario 1b + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + + fs.put( + local_join(source, "subdir", "subfile1"), fs_join(target, "newdir/") + ) # Note trailing slash + assert fs.isdir(target) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + + def test_put_file_to_file_in_existing_directory( + self, + fs, + fs_join, + fs_target, + local_join, + supports_empty_directories, + local_bulk_operations_scenario_0, + ): + # Copy scenario 1c + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + if not supports_empty_directories: + # Force target directory to exist by adding a dummy file + fs.touch(fs_join(target, "dummy")) + assert fs.isdir(target) + + fs.put(local_join(source, "subdir", "subfile1"), fs_join(target, "newfile")) + assert fs.isfile(fs_join(target, "newfile")) + + def test_put_file_to_file_in_new_directory( + self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0 + ): + # Copy scenario 1d + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + + fs.put( + local_join(source, "subdir", "subfile1"), + fs_join(target, "newdir", "newfile"), + ) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "newfile")) + + def test_put_directory_to_existing_directory( + self, + fs, + fs_join, + fs_target, + local_bulk_operations_scenario_0, + supports_empty_directories, + ): + # Copy scenario 1e + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + if not supports_empty_directories: + # Force target directory to exist by adding a dummy file + dummy = fs_join(target, "dummy") + fs.touch(dummy) + assert fs.isdir(target) + + for source_slash, target_slash in zip([False, True], [False, True]): + s = fs_join(source, "subdir") + if source_slash: + s += "/" + t = target + "/" if target_slash else target + + # Without recursive does nothing + fs.put(s, t) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + # With recursive + fs.put(s, t, recursive=True) + if source_slash: + assert fs.isfile(fs_join(target, "subfile1")) + assert fs.isfile(fs_join(target, "subfile2")) + assert fs.isdir(fs_join(target, "nesteddir")) + assert fs.isfile(fs_join(target, "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm( + [ + fs_join(target, "subfile1"), + fs_join(target, "subfile2"), + fs_join(target, "nesteddir"), + ], + recursive=True, + ) + else: + assert fs.isdir(fs_join(target, "subdir")) + assert fs.isfile(fs_join(target, "subdir", "subfile1")) + assert fs.isfile(fs_join(target, "subdir", "subfile2")) + assert fs.isdir(fs_join(target, "subdir", "nesteddir")) + assert fs.isfile(fs_join(target, "subdir", "nesteddir", "nestedfile")) + + fs.rm(fs_join(target, "subdir"), recursive=True) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + # Limit recursive by maxdepth + fs.put(s, t, recursive=True, maxdepth=1) + if source_slash: + assert fs.isfile(fs_join(target, "subfile1")) + assert fs.isfile(fs_join(target, "subfile2")) + assert not fs.exists(fs_join(target, "nesteddir")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm( + [ + fs_join(target, "subfile1"), + fs_join(target, "subfile2"), + ], + recursive=True, + ) + else: + assert fs.isdir(fs_join(target, "subdir")) + assert fs.isfile(fs_join(target, "subdir", "subfile1")) + assert fs.isfile(fs_join(target, "subdir", "subfile2")) + assert not fs.exists(fs_join(target, "subdir", "nesteddir")) + + fs.rm(fs_join(target, "subdir"), recursive=True) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + def test_put_directory_to_new_directory( + self, + fs, + fs_join, + fs_target, + local_bulk_operations_scenario_0, + supports_empty_directories, + ): + # Copy scenario 1f + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + + for source_slash, target_slash in zip([False, True], [False, True]): + s = fs_join(source, "subdir") + if source_slash: + s += "/" + t = fs_join(target, "newdir") + if target_slash: + t += "/" + + # Without recursive does nothing + fs.put(s, t) + if supports_empty_directories: + assert fs.ls(target) == [] + else: + with pytest.raises(FileNotFoundError): + fs.ls(target) + + # With recursive + fs.put(s, t, recursive=True) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + assert fs.isfile(fs_join(target, "newdir", "subfile2")) + assert fs.isdir(fs_join(target, "newdir", "nesteddir")) + assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm(fs_join(target, "newdir"), recursive=True) + assert not fs.exists(fs_join(target, "newdir")) + + # Limit recursive by maxdepth + fs.put(s, t, recursive=True, maxdepth=1) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + assert fs.isfile(fs_join(target, "newdir", "subfile2")) + assert not fs.exists(fs_join(target, "newdir", "nesteddir")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm(fs_join(target, "newdir"), recursive=True) + assert not fs.exists(fs_join(target, "newdir")) + + def test_put_glob_to_existing_directory( + self, + fs, + fs_join, + fs_target, + local_join, + supports_empty_directories, + local_bulk_operations_scenario_0, + ): + # Copy scenario 1g + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + if not supports_empty_directories: + # Force target directory to exist by adding a dummy file + dummy = fs_join(target, "dummy") + fs.touch(dummy) + assert fs.isdir(target) + + for target_slash in [False, True]: + t = target + "/" if target_slash else target + + # Without recursive + fs.put(local_join(source, "subdir", "*"), t) + assert fs.isfile(fs_join(target, "subfile1")) + assert fs.isfile(fs_join(target, "subfile2")) + assert not fs.isdir(fs_join(target, "nesteddir")) + assert not fs.exists(fs_join(target, "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm( + [ + fs_join(target, "subfile1"), + fs_join(target, "subfile2"), + ], + recursive=True, + ) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + # With recursive + for glob, recursive in zip(["*", "**"], [True, False]): + fs.put(local_join(source, "subdir", glob), t, recursive=recursive) + assert fs.isfile(fs_join(target, "subfile1")) + assert fs.isfile(fs_join(target, "subfile2")) + assert fs.isdir(fs_join(target, "nesteddir")) + assert fs.isfile(fs_join(target, "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm( + [ + fs_join(target, "subfile1"), + fs_join(target, "subfile2"), + fs_join(target, "nesteddir"), + ], + recursive=True, + ) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + # Limit recursive by maxdepth + fs.put( + local_join(source, "subdir", glob), + t, + recursive=recursive, + maxdepth=1, + ) + assert fs.isfile(fs_join(target, "subfile1")) + assert fs.isfile(fs_join(target, "subfile2")) + assert not fs.exists(fs_join(target, "nesteddir")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm( + [ + fs_join(target, "subfile1"), + fs_join(target, "subfile2"), + ], + recursive=True, + ) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + def test_put_glob_to_new_directory( + self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0 + ): + # Copy scenario 1h + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + + for target_slash in [False, True]: + t = fs_join(target, "newdir") + if target_slash: + t += "/" + + # Without recursive + fs.put(local_join(source, "subdir", "*"), t) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + assert fs.isfile(fs_join(target, "newdir", "subfile2")) + assert not fs.exists(fs_join(target, "newdir", "nesteddir")) + assert not fs.exists(fs_join(target, "newdir", "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + assert not fs.exists(fs_join(target, "newdir", "subdir")) + + fs.rm(fs_join(target, "newdir"), recursive=True) + assert not fs.exists(fs_join(target, "newdir")) + + # With recursive + for glob, recursive in zip(["*", "**"], [True, False]): + fs.put(local_join(source, "subdir", glob), t, recursive=recursive) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + assert fs.isfile(fs_join(target, "newdir", "subfile2")) + assert fs.isdir(fs_join(target, "newdir", "nesteddir")) + assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + assert not fs.exists(fs_join(target, "newdir", "subdir")) + + fs.rm(fs_join(target, "newdir"), recursive=True) + assert not fs.exists(fs_join(target, "newdir")) + + # Limit recursive by maxdepth + fs.put( + local_join(source, "subdir", glob), + t, + recursive=recursive, + maxdepth=1, + ) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + assert fs.isfile(fs_join(target, "newdir", "subfile2")) + assert not fs.exists(fs_join(target, "newdir", "nesteddir")) + assert not fs.exists(fs_join(target, "subdir")) + assert not fs.exists(fs_join(target, "newdir", "subdir")) + + fs.rm(fs_join(target, "newdir"), recursive=True) + assert not fs.exists(fs_join(target, "newdir")) + + @pytest.mark.parametrize( + GLOB_EDGE_CASES_TESTS["argnames"], + GLOB_EDGE_CASES_TESTS["argvalues"], + ) + def test_put_glob_edge_cases( + self, + path, + recursive, + maxdepth, + expected, + fs, + fs_join, + fs_target, + local_glob_edge_cases_files, + local_join, + fs_sanitize_path, + ): + # Copy scenario 1g + source = local_glob_edge_cases_files + + target = fs_target + + for new_dir, target_slash in product([True, False], [True, False]): + fs.mkdir(target) + + t = fs_join(target, "newdir") if new_dir else target + t = t + "/" if target_slash else t + + fs.put(local_join(source, path), t, recursive=recursive, maxdepth=maxdepth) + + output = fs.find(target) + if new_dir: + prefixed_expected = [ + fs_sanitize_path(fs_join(target, "newdir", p)) for p in expected + ] + else: + prefixed_expected = [ + fs_sanitize_path(fs_join(target, p)) for p in expected + ] + assert sorted(output) == sorted(prefixed_expected) + + try: + fs.rm(target, recursive=True) + except FileNotFoundError: + pass + + def test_put_list_of_files_to_existing_directory( + self, + fs, + fs_join, + fs_target, + local_join, + local_bulk_operations_scenario_0, + supports_empty_directories, + ): + # Copy scenario 2a + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + if not supports_empty_directories: + # Force target directory to exist by adding a dummy file + dummy = fs_join(target, "dummy") + fs.touch(dummy) + assert fs.isdir(target) + + source_files = [ + local_join(source, "file1"), + local_join(source, "file2"), + local_join(source, "subdir", "subfile1"), + ] + + for target_slash in [False, True]: + t = target + "/" if target_slash else target + + fs.put(source_files, t) + assert fs.isfile(fs_join(target, "file1")) + assert fs.isfile(fs_join(target, "file2")) + assert fs.isfile(fs_join(target, "subfile1")) + + fs.rm( + [ + fs_join(target, "file1"), + fs_join(target, "file2"), + fs_join(target, "subfile1"), + ], + recursive=True, + ) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + def test_put_list_of_files_to_new_directory( + self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0 + ): + # Copy scenario 2b + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + + source_files = [ + local_join(source, "file1"), + local_join(source, "file2"), + local_join(source, "subdir", "subfile1"), + ] + + fs.put(source_files, fs_join(target, "newdir") + "/") # Note trailing slash + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "file1")) + assert fs.isfile(fs_join(target, "newdir", "file2")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + + def test_put_directory_recursive( + self, fs, fs_join, fs_target, local_fs, local_join, local_path + ): + # https://github.com/fsspec/filesystem_spec/issues/1062 + # Recursive cp/get/put of source directory into non-existent target directory. + src = local_join(local_path, "src") + src_file = local_join(src, "file") + local_fs.mkdir(src) + local_fs.touch(src_file) + + target = fs_target + + # put without slash + assert not fs.exists(target) + for loop in range(2): + fs.put(src, target, recursive=True) + assert fs.isdir(target) + + if loop == 0: + assert fs.isfile(fs_join(target, "file")) + assert not fs.exists(fs_join(target, "src")) + else: + assert fs.isfile(fs_join(target, "file")) + assert fs.isdir(fs_join(target, "src")) + assert fs.isfile(fs_join(target, "src", "file")) + + fs.rm(target, recursive=True) + + # put with slash + assert not fs.exists(target) + for loop in range(2): + fs.put(src + "/", target, recursive=True) + assert fs.isdir(target) + assert fs.isfile(fs_join(target, "file")) + assert not fs.exists(fs_join(target, "src")) + + def test_put_directory_without_files_with_same_name_prefix( + self, + fs, + fs_join, + fs_target, + local_join, + local_dir_and_file_with_same_name_prefix, + supports_empty_directories, + ): + # Create the test dirs + source = local_dir_and_file_with_same_name_prefix + target = fs_target + + # Test without glob + fs.put(local_join(source, "subdir"), fs_target, recursive=True) + + assert fs.isfile(fs_join(fs_target, "subfile.txt")) + assert not fs.isfile(fs_join(fs_target, "subdir.txt")) + + fs.rm([fs_join(target, "subfile.txt")]) + if supports_empty_directories: + assert fs.ls(target) == [] + else: + assert not fs.exists(target) + + # Test with glob + fs.put(local_join(source, "subdir*"), fs_target, recursive=True) + + assert fs.isdir(fs_join(fs_target, "subdir")) + assert fs.isfile(fs_join(fs_target, "subdir", "subfile.txt")) + assert fs.isfile(fs_join(fs_target, "subdir.txt")) + + def test_copy_with_source_and_destination_as_list( + self, fs, fs_target, fs_join, local_join, local_10_files_with_hashed_names + ): + # Create the test dir + source = local_10_files_with_hashed_names + target = fs_target + + # Create list of files for source and destination + source_files = [] + destination_files = [] + for i in range(10): + hashed_i = md5(str(i).encode("utf-8")).hexdigest() + source_files.append(local_join(source, f"{hashed_i}.txt")) + destination_files.append(fs_join(target, f"{hashed_i}.txt")) + + # Copy and assert order was kept + fs.put(lpath=source_files, rpath=destination_files) + + for i in range(10): + file_content = fs.cat(destination_files[i]).decode("utf-8") + assert file_content == str(i) diff --git a/.venv/lib/python3.11/site-packages/zmq/_future.py b/.venv/lib/python3.11/site-packages/zmq/_future.py new file mode 100644 index 0000000000000000000000000000000000000000..388284e740d67a25ee0305d4d755afbf9e0acac7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/zmq/_future.py @@ -0,0 +1,706 @@ +"""Future-returning APIs for coroutines.""" + +# Copyright (c) PyZMQ Developers. +# Distributed under the terms of the Modified BSD License. +from __future__ import annotations + +import warnings +from asyncio import Future +from collections import deque +from functools import partial +from itertools import chain +from typing import ( + Any, + Awaitable, + Callable, + NamedTuple, + TypeVar, + cast, +) + +import zmq as _zmq +from zmq import EVENTS, POLLIN, POLLOUT + + +class _FutureEvent(NamedTuple): + future: Future + kind: str + kwargs: dict + msg: Any + timer: Any + + +# These are incomplete classes and need a Mixin for compatibility with an eventloop +# defining the following attributes: +# +# _Future +# _READ +# _WRITE +# _default_loop() + + +class _Async: + """Mixin for common async logic""" + + _current_loop: Any = None + _Future: type[Future] + + def _get_loop(self) -> Any: + """Get event loop + + Notice if event loop has changed, + and register init_io_state on activation of a new event loop + """ + if self._current_loop is None: + self._current_loop = self._default_loop() + self._init_io_state(self._current_loop) + return self._current_loop + current_loop = self._default_loop() + if current_loop is not self._current_loop: + # warn? This means a socket is being used in multiple loops! + self._current_loop = current_loop + self._init_io_state(current_loop) + return current_loop + + def _default_loop(self) -> Any: + raise NotImplementedError("Must be implemented in a subclass") + + def _init_io_state(self, loop=None) -> None: + pass + + +class _AsyncPoller(_Async, _zmq.Poller): + """Poller that returns a Future on poll, instead of blocking.""" + + _socket_class: type[_AsyncSocket] + _READ: int + _WRITE: int + raw_sockets: list[Any] + + def _watch_raw_socket(self, loop: Any, socket: Any, evt: int, f: Callable) -> None: + """Schedule callback for a raw socket""" + raise NotImplementedError() + + def _unwatch_raw_sockets(self, loop: Any, *sockets: Any) -> None: + """Unschedule callback for a raw socket""" + raise NotImplementedError() + + def poll(self, timeout=-1) -> Awaitable[list[tuple[Any, int]]]: # type: ignore + """Return a Future for a poll event""" + future = self._Future() + if timeout == 0: + try: + result = super().poll(0) + except Exception as e: + future.set_exception(e) + else: + future.set_result(result) + return future + + loop = self._get_loop() + + # register Future to be called as soon as any event is available on any socket + watcher = self._Future() + + # watch raw sockets: + raw_sockets: list[Any] = [] + + def wake_raw(*args): + if not watcher.done(): + watcher.set_result(None) + + watcher.add_done_callback( + lambda f: self._unwatch_raw_sockets(loop, *raw_sockets) + ) + + wrapped_sockets: list[_AsyncSocket] = [] + + def _clear_wrapper_io(f): + for s in wrapped_sockets: + s._clear_io_state() + + for socket, mask in self.sockets: + if isinstance(socket, _zmq.Socket): + if not isinstance(socket, self._socket_class): + # it's a blocking zmq.Socket, wrap it in async + socket = self._socket_class.from_socket(socket) + wrapped_sockets.append(socket) + if mask & _zmq.POLLIN: + socket._add_recv_event('poll', future=watcher) + if mask & _zmq.POLLOUT: + socket._add_send_event('poll', future=watcher) + else: + raw_sockets.append(socket) + evt = 0 + if mask & _zmq.POLLIN: + evt |= self._READ + if mask & _zmq.POLLOUT: + evt |= self._WRITE + self._watch_raw_socket(loop, socket, evt, wake_raw) + + def on_poll_ready(f): + if future.done(): + return + if watcher.cancelled(): + try: + future.cancel() + except RuntimeError: + # RuntimeError may be called during teardown + pass + return + if watcher.exception(): + future.set_exception(watcher.exception()) + else: + try: + result = super(_AsyncPoller, self).poll(0) + except Exception as e: + future.set_exception(e) + else: + future.set_result(result) + + watcher.add_done_callback(on_poll_ready) + + if wrapped_sockets: + watcher.add_done_callback(_clear_wrapper_io) + + if timeout is not None and timeout > 0: + # schedule cancel to fire on poll timeout, if any + def trigger_timeout(): + if not watcher.done(): + watcher.set_result(None) + + timeout_handle = loop.call_later(1e-3 * timeout, trigger_timeout) + + def cancel_timeout(f): + if hasattr(timeout_handle, 'cancel'): + timeout_handle.cancel() + else: + loop.remove_timeout(timeout_handle) + + future.add_done_callback(cancel_timeout) + + def cancel_watcher(f): + if not watcher.done(): + watcher.cancel() + + future.add_done_callback(cancel_watcher) + + return future + + +class _NoTimer: + @staticmethod + def cancel(): + pass + + +T = TypeVar("T", bound="_AsyncSocket") + + +class _AsyncSocket(_Async, _zmq.Socket[Future]): + # Warning : these class variables are only here to allow to call super().__setattr__. + # They be overridden at instance initialization and not shared in the whole class + _recv_futures = None + _send_futures = None + _state = 0 + _shadow_sock: _zmq.Socket + _poller_class = _AsyncPoller + _fd = None + + def __init__( + self, + context=None, + socket_type=-1, + io_loop=None, + _from_socket: _zmq.Socket | None = None, + **kwargs, + ) -> None: + if isinstance(context, _zmq.Socket): + context, _from_socket = (None, context) + if _from_socket is not None: + super().__init__(shadow=_from_socket.underlying) # type: ignore + self._shadow_sock = _from_socket + else: + super().__init__(context, socket_type, **kwargs) # type: ignore + self._shadow_sock = _zmq.Socket.shadow(self.underlying) + + if io_loop is not None: + warnings.warn( + f"{self.__class__.__name__}(io_loop) argument is deprecated in pyzmq 22.2." + " The currently active loop will always be used.", + DeprecationWarning, + stacklevel=3, + ) + self._recv_futures = deque() + self._send_futures = deque() + self._state = 0 + self._fd = self._shadow_sock.FD + + @classmethod + def from_socket(cls: type[T], socket: _zmq.Socket, io_loop: Any = None) -> T: + """Create an async socket from an existing Socket""" + return cls(_from_socket=socket, io_loop=io_loop) + + def close(self, linger: int | None = None) -> None: + if not self.closed and self._fd is not None: + event_list: list[_FutureEvent] = list( + chain(self._recv_futures or [], self._send_futures or []) + ) + for event in event_list: + if not event.future.done(): + try: + event.future.cancel() + except RuntimeError: + # RuntimeError may be called during teardown + pass + self._clear_io_state() + super().close(linger=linger) + + close.__doc__ = _zmq.Socket.close.__doc__ + + def get(self, key): + result = super().get(key) + if key == EVENTS: + self._schedule_remaining_events(result) + return result + + get.__doc__ = _zmq.Socket.get.__doc__ + + def recv_multipart( + self, flags: int = 0, copy: bool = True, track: bool = False + ) -> Awaitable[list[bytes] | list[_zmq.Frame]]: + """Receive a complete multipart zmq message. + + Returns a Future whose result will be a multipart message. + """ + return self._add_recv_event( + 'recv_multipart', dict(flags=flags, copy=copy, track=track) + ) + + def recv( # type: ignore + self, flags: int = 0, copy: bool = True, track: bool = False + ) -> Awaitable[bytes | _zmq.Frame]: + """Receive a single zmq frame. + + Returns a Future, whose result will be the received frame. + + Recommend using recv_multipart instead. + """ + return self._add_recv_event('recv', dict(flags=flags, copy=copy, track=track)) + + def send_multipart( # type: ignore + self, msg_parts: Any, flags: int = 0, copy: bool = True, track=False, **kwargs + ) -> Awaitable[_zmq.MessageTracker | None]: + """Send a complete multipart zmq message. + + Returns a Future that resolves when sending is complete. + """ + kwargs['flags'] = flags + kwargs['copy'] = copy + kwargs['track'] = track + return self._add_send_event('send_multipart', msg=msg_parts, kwargs=kwargs) + + def send( # type: ignore + self, + data: Any, + flags: int = 0, + copy: bool = True, + track: bool = False, + **kwargs: Any, + ) -> Awaitable[_zmq.MessageTracker | None]: + """Send a single zmq frame. + + Returns a Future that resolves when sending is complete. + + Recommend using send_multipart instead. + """ + kwargs['flags'] = flags + kwargs['copy'] = copy + kwargs['track'] = track + kwargs.update(dict(flags=flags, copy=copy, track=track)) + return self._add_send_event('send', msg=data, kwargs=kwargs) + + def _deserialize(self, recvd, load): + """Deserialize with Futures""" + f = self._Future() + + def _chain(_): + """Chain result through serialization to recvd""" + if f.done(): + # chained future may be cancelled, which means nobody is going to get this result + # if it's an error, that's no big deal (probably zmq.Again), + # but if it's a successful recv, this is a dropped message! + if not recvd.cancelled() and recvd.exception() is None: + warnings.warn( + # is there a useful stacklevel? + # ideally, it would point to where `f.cancel()` was called + f"Future {f} completed while awaiting {recvd}. A message has been dropped!", + RuntimeWarning, + ) + return + if recvd.exception(): + f.set_exception(recvd.exception()) + else: + buf = recvd.result() + try: + loaded = load(buf) + except Exception as e: + f.set_exception(e) + else: + f.set_result(loaded) + + recvd.add_done_callback(_chain) + + def _chain_cancel(_): + """Chain cancellation from f to recvd""" + if recvd.done(): + return + if f.cancelled(): + recvd.cancel() + + f.add_done_callback(_chain_cancel) + + return f + + def poll(self, timeout=None, flags=_zmq.POLLIN) -> Awaitable[int]: # type: ignore + """poll the socket for events + + returns a Future for the poll results. + """ + + if self.closed: + raise _zmq.ZMQError(_zmq.ENOTSUP) + + p = self._poller_class() + p.register(self, flags) + poll_future = cast(Future, p.poll(timeout)) + + future = self._Future() + + def unwrap_result(f): + if future.done(): + return + if poll_future.cancelled(): + try: + future.cancel() + except RuntimeError: + # RuntimeError may be called during teardown + pass + return + if f.exception(): + future.set_exception(poll_future.exception()) + else: + evts = dict(poll_future.result()) + future.set_result(evts.get(self, 0)) + + if poll_future.done(): + # hook up result if already done + unwrap_result(poll_future) + else: + poll_future.add_done_callback(unwrap_result) + + def cancel_poll(future): + """Cancel underlying poll if request has been cancelled""" + if not poll_future.done(): + try: + poll_future.cancel() + except RuntimeError: + # RuntimeError may be called during teardown + pass + + future.add_done_callback(cancel_poll) + + return future + + def _add_timeout(self, future, timeout): + """Add a timeout for a send or recv Future""" + + def future_timeout(): + if future.done(): + # future already resolved, do nothing + return + + # raise EAGAIN + future.set_exception(_zmq.Again()) + + return self._call_later(timeout, future_timeout) + + def _call_later(self, delay, callback): + """Schedule a function to be called later + + Override for different IOLoop implementations + + Tornado and asyncio happen to both have ioloop.call_later + with the same signature. + """ + return self._get_loop().call_later(delay, callback) + + @staticmethod + def _remove_finished_future(future, event_list, event=None): + """Make sure that futures are removed from the event list when they resolve + + Avoids delaying cleanup until the next send/recv event, + which may never come. + """ + # "future" instance is shared between sockets, but each socket has its own event list. + if not event_list: + return + # only unconsumed events (e.g. cancelled calls) + # will be present when this happens + try: + event_list.remove(event) + except ValueError: + # usually this will have been removed by being consumed + return + + def _add_recv_event(self, kind, kwargs=None, future=None): + """Add a recv event, returning the corresponding Future""" + f = future or self._Future() + if kind.startswith('recv') and kwargs.get('flags', 0) & _zmq.DONTWAIT: + # short-circuit non-blocking calls + recv = getattr(self._shadow_sock, kind) + try: + r = recv(**kwargs) + except Exception as e: + f.set_exception(e) + else: + f.set_result(r) + return f + + timer = _NoTimer + if hasattr(_zmq, 'RCVTIMEO'): + timeout_ms = self._shadow_sock.rcvtimeo + if timeout_ms >= 0: + timer = self._add_timeout(f, timeout_ms * 1e-3) + + # we add it to the list of futures before we add the timeout as the + # timeout will remove the future from recv_futures to avoid leaks + _future_event = _FutureEvent(f, kind, kwargs, msg=None, timer=timer) + self._recv_futures.append(_future_event) + + if self._shadow_sock.get(EVENTS) & POLLIN: + # recv immediately, if we can + self._handle_recv() + if self._recv_futures and _future_event in self._recv_futures: + # Don't let the Future sit in _recv_events after it's done + # no need to register this if we've already been handled + # (i.e. immediately-resolved recv) + f.add_done_callback( + partial( + self._remove_finished_future, + event_list=self._recv_futures, + event=_future_event, + ) + ) + self._add_io_state(POLLIN) + return f + + def _add_send_event(self, kind, msg=None, kwargs=None, future=None): + """Add a send event, returning the corresponding Future""" + f = future or self._Future() + # attempt send with DONTWAIT if no futures are waiting + # short-circuit for sends that will resolve immediately + # only call if no send Futures are waiting + if kind in ('send', 'send_multipart') and not self._send_futures: + flags = kwargs.get('flags', 0) + nowait_kwargs = kwargs.copy() + nowait_kwargs['flags'] = flags | _zmq.DONTWAIT + + # short-circuit non-blocking calls + send = getattr(self._shadow_sock, kind) + # track if the send resolved or not + # (EAGAIN if DONTWAIT is not set should proceed with) + finish_early = True + try: + r = send(msg, **nowait_kwargs) + except _zmq.Again as e: + if flags & _zmq.DONTWAIT: + f.set_exception(e) + else: + # EAGAIN raised and DONTWAIT not requested, + # proceed with async send + finish_early = False + except Exception as e: + f.set_exception(e) + else: + f.set_result(r) + + if finish_early: + # short-circuit resolved, return finished Future + # schedule wake for recv if there are any receivers waiting + if self._recv_futures: + self._schedule_remaining_events() + return f + + timer = _NoTimer + if hasattr(_zmq, 'SNDTIMEO'): + timeout_ms = self._shadow_sock.get(_zmq.SNDTIMEO) + if timeout_ms >= 0: + timer = self._add_timeout(f, timeout_ms * 1e-3) + + # we add it to the list of futures before we add the timeout as the + # timeout will remove the future from recv_futures to avoid leaks + _future_event = _FutureEvent(f, kind, kwargs=kwargs, msg=msg, timer=timer) + self._send_futures.append(_future_event) + # Don't let the Future sit in _send_futures after it's done + f.add_done_callback( + partial( + self._remove_finished_future, + event_list=self._send_futures, + event=_future_event, + ) + ) + + self._add_io_state(POLLOUT) + return f + + def _handle_recv(self): + """Handle recv events""" + if not self._shadow_sock.get(EVENTS) & POLLIN: + # event triggered, but state may have been changed between trigger and callback + return + f = None + while self._recv_futures: + f, kind, kwargs, _, timer = self._recv_futures.popleft() + # skip any cancelled futures + if f.done(): + f = None + else: + break + + if not self._recv_futures: + self._drop_io_state(POLLIN) + + if f is None: + return + + timer.cancel() + + if kind == 'poll': + # on poll event, just signal ready, nothing else. + f.set_result(None) + return + elif kind == 'recv_multipart': + recv = self._shadow_sock.recv_multipart + elif kind == 'recv': + recv = self._shadow_sock.recv + else: + raise ValueError(f"Unhandled recv event type: {kind!r}") + + kwargs['flags'] |= _zmq.DONTWAIT + try: + result = recv(**kwargs) + except Exception as e: + f.set_exception(e) + else: + f.set_result(result) + + def _handle_send(self): + if not self._shadow_sock.get(EVENTS) & POLLOUT: + # event triggered, but state may have been changed between trigger and callback + return + f = None + while self._send_futures: + f, kind, kwargs, msg, timer = self._send_futures.popleft() + # skip any cancelled futures + if f.done(): + f = None + else: + break + + if not self._send_futures: + self._drop_io_state(POLLOUT) + + if f is None: + return + + timer.cancel() + + if kind == 'poll': + # on poll event, just signal ready, nothing else. + f.set_result(None) + return + elif kind == 'send_multipart': + send = self._shadow_sock.send_multipart + elif kind == 'send': + send = self._shadow_sock.send + else: + raise ValueError(f"Unhandled send event type: {kind!r}") + + kwargs['flags'] |= _zmq.DONTWAIT + try: + result = send(msg, **kwargs) + except Exception as e: + f.set_exception(e) + else: + f.set_result(result) + + # event masking from ZMQStream + def _handle_events(self, fd=0, events=0): + """Dispatch IO events to _handle_recv, etc.""" + if self._shadow_sock.closed: + return + + zmq_events = self._shadow_sock.get(EVENTS) + if zmq_events & _zmq.POLLIN: + self._handle_recv() + if zmq_events & _zmq.POLLOUT: + self._handle_send() + self._schedule_remaining_events() + + def _schedule_remaining_events(self, events=None): + """Schedule a call to handle_events next loop iteration + + If there are still events to handle. + """ + # edge-triggered handling + # allow passing events in, in case this is triggered by retrieving events, + # so we don't have to retrieve it twice. + if self._state == 0: + # not watching for anything, nothing to schedule + return + if events is None: + events = self._shadow_sock.get(EVENTS) + if events & self._state: + self._call_later(0, self._handle_events) + + def _add_io_state(self, state): + """Add io_state to poller.""" + if self._state != state: + state = self._state = self._state | state + self._update_handler(self._state) + + def _drop_io_state(self, state): + """Stop poller from watching an io_state.""" + if self._state & state: + self._state = self._state & (~state) + self._update_handler(self._state) + + def _update_handler(self, state): + """Update IOLoop handler with state. + + zmq FD is always read-only. + """ + # ensure loop is registered and init_io has been called + # if there are any events to watch for + if state: + self._get_loop() + self._schedule_remaining_events() + + def _init_io_state(self, loop=None): + """initialize the ioloop event handler""" + if loop is None: + loop = self._get_loop() + loop.add_handler(self._shadow_sock, self._handle_events, self._READ) + self._call_later(0, self._handle_events) + + def _clear_io_state(self): + """unregister the ioloop event handler + + called once during close + """ + fd = self._shadow_sock + if self._shadow_sock.closed: + fd = self._fd + if self._current_loop is not None: + self._current_loop.remove_handler(fd) diff --git a/.venv/lib/python3.11/site-packages/zmq/asyncio.py b/.venv/lib/python3.11/site-packages/zmq/asyncio.py new file mode 100644 index 0000000000000000000000000000000000000000..22bbd14d423cc2fb31efa7846681ab4420a2a6fd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/zmq/asyncio.py @@ -0,0 +1,224 @@ +"""AsyncIO support for zmq + +Requires asyncio and Python 3. +""" + +# Copyright (c) PyZMQ Developers. +# Distributed under the terms of the Modified BSD License. +from __future__ import annotations + +import asyncio +import selectors +import sys +import warnings +from asyncio import Future, SelectorEventLoop +from weakref import WeakKeyDictionary + +import zmq as _zmq +from zmq import _future + +# registry of asyncio loop : selector thread +_selectors: WeakKeyDictionary = WeakKeyDictionary() + + +class ProactorSelectorThreadWarning(RuntimeWarning): + """Warning class for notifying about the extra thread spawned by tornado + + We automatically support proactor via tornado's AddThreadSelectorEventLoop""" + + +def _get_selector_windows( + asyncio_loop, +) -> asyncio.AbstractEventLoop: + """Get selector-compatible loop + + Returns an object with ``add_reader`` family of methods, + either the loop itself or a SelectorThread instance. + + Workaround Windows proactor removal of + *reader methods, which we need for zmq sockets. + """ + + if asyncio_loop in _selectors: + return _selectors[asyncio_loop] + + # detect add_reader instead of checking for proactor? + if hasattr(asyncio, "ProactorEventLoop") and isinstance( + asyncio_loop, + asyncio.ProactorEventLoop, # type: ignore + ): + try: + from tornado.platform.asyncio import AddThreadSelectorEventLoop + except ImportError: + raise RuntimeError( + "Proactor event loop does not implement add_reader family of methods required for zmq." + " zmq will work with proactor if tornado >= 6.1 can be found." + " Use `asyncio.set_event_loop_policy(WindowsSelectorEventLoopPolicy())`" + " or install 'tornado>=6.1' to avoid this error." + ) + + warnings.warn( + "Proactor event loop does not implement add_reader family of methods required for zmq." + " Registering an additional selector thread for add_reader support via tornado." + " Use `asyncio.set_event_loop_policy(WindowsSelectorEventLoopPolicy())`" + " to avoid this warning.", + RuntimeWarning, + # stacklevel 5 matches most likely zmq.asyncio.Context().socket() + stacklevel=5, + ) + + selector_loop = _selectors[asyncio_loop] = AddThreadSelectorEventLoop( + asyncio_loop + ) # type: ignore + + # patch loop.close to also close the selector thread + loop_close = asyncio_loop.close + + def _close_selector_and_loop(): + # restore original before calling selector.close, + # which in turn calls eventloop.close! + asyncio_loop.close = loop_close + _selectors.pop(asyncio_loop, None) + selector_loop.close() + + asyncio_loop.close = _close_selector_and_loop # type: ignore # mypy bug - assign a function to method + return selector_loop + else: + return asyncio_loop + + +def _get_selector_noop(loop) -> asyncio.AbstractEventLoop: + """no-op on non-Windows""" + return loop + + +if sys.platform == "win32": + _get_selector = _get_selector_windows +else: + _get_selector = _get_selector_noop + + +class _AsyncIO: + _Future = Future + _WRITE = selectors.EVENT_WRITE + _READ = selectors.EVENT_READ + + def _default_loop(self): + try: + return asyncio.get_running_loop() + except RuntimeError: + warnings.warn( + "No running event loop. zmq.asyncio should be used from within an asyncio loop.", + RuntimeWarning, + stacklevel=4, + ) + # get_event_loop deprecated in 3.10: + return asyncio.get_event_loop() + + +class Poller(_AsyncIO, _future._AsyncPoller): + """Poller returning asyncio.Future for poll results.""" + + def _watch_raw_socket(self, loop, socket, evt, f): + """Schedule callback for a raw socket""" + selector = _get_selector(loop) + if evt & self._READ: + selector.add_reader(socket, lambda *args: f()) + if evt & self._WRITE: + selector.add_writer(socket, lambda *args: f()) + + def _unwatch_raw_sockets(self, loop, *sockets): + """Unschedule callback for a raw socket""" + selector = _get_selector(loop) + for socket in sockets: + selector.remove_reader(socket) + selector.remove_writer(socket) + + +class Socket(_AsyncIO, _future._AsyncSocket): + """Socket returning asyncio Futures for send/recv/poll methods.""" + + _poller_class = Poller + + def _get_selector(self, io_loop=None): + if io_loop is None: + io_loop = self._get_loop() + return _get_selector(io_loop) + + def _init_io_state(self, io_loop=None): + """initialize the ioloop event handler""" + self._get_selector(io_loop).add_reader( + self._fd, lambda: self._handle_events(0, 0) + ) + + def _clear_io_state(self): + """clear any ioloop event handler + + called once at close + """ + loop = self._current_loop + if loop and not loop.is_closed() and self._fd != -1: + self._get_selector(loop).remove_reader(self._fd) + + +Poller._socket_class = Socket + + +class Context(_zmq.Context[Socket]): + """Context for creating asyncio-compatible Sockets""" + + _socket_class = Socket + + # avoid sharing instance with base Context class + _instance = None + + # overload with no changes to satisfy pyright + def __init__( + self: Context, + io_threads: int | _zmq.Context = 1, + shadow: _zmq.Context | int = 0, + ) -> None: + super().__init__(io_threads, shadow) # type: ignore + + +class ZMQEventLoop(SelectorEventLoop): + """DEPRECATED: AsyncIO eventloop using zmq_poll. + + pyzmq sockets should work with any asyncio event loop as of pyzmq 17. + """ + + def __init__(self, selector=None): + _deprecated() + return super().__init__(selector) + + +_loop = None + + +def _deprecated(): + if _deprecated.called: # type: ignore + return + _deprecated.called = True # type: ignore + + warnings.warn( + "ZMQEventLoop and zmq.asyncio.install are deprecated in pyzmq 17. Special eventloop integration is no longer needed.", + DeprecationWarning, + stacklevel=3, + ) + + +_deprecated.called = False # type: ignore + + +def install(): + """DEPRECATED: No longer needed in pyzmq 17""" + _deprecated() + + +__all__ = [ + "Context", + "Socket", + "Poller", + "ZMQEventLoop", + "install", +] diff --git a/.venv/lib/python3.11/site-packages/zmq/decorators.py b/.venv/lib/python3.11/site-packages/zmq/decorators.py new file mode 100644 index 0000000000000000000000000000000000000000..7cd80ebc764fb0b83146821ea85af3ce4aad8196 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/zmq/decorators.py @@ -0,0 +1,190 @@ +"""Decorators for running functions with context/sockets. + +.. versionadded:: 15.3 + +Like using Contexts and Sockets as context managers, but with decorator syntax. +Context and sockets are closed at the end of the function. + +For example:: + + from zmq.decorators import context, socket + + @context() + @socket(zmq.PUSH) + def work(ctx, push): + ... +""" + +from __future__ import annotations + +# Copyright (c) PyZMQ Developers. +# Distributed under the terms of the Modified BSD License. + +__all__ = ( + 'context', + 'socket', +) + +from functools import wraps + +import zmq + + +class _Decorator: + '''The mini decorator factory''' + + def __init__(self, target=None): + self._target = target + + def __call__(self, *dec_args, **dec_kwargs): + """ + The main logic of decorator + + Here is how those arguments works:: + + @out_decorator(*dec_args, *dec_kwargs) + def func(*wrap_args, **wrap_kwargs): + ... + + And in the ``wrapper``, we simply create ``self.target`` instance via + ``with``:: + + target = self.get_target(*args, **kwargs) + with target(*dec_args, **dec_kwargs) as obj: + ... + + """ + kw_name, dec_args, dec_kwargs = self.process_decorator_args( + *dec_args, **dec_kwargs + ) + + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + target = self.get_target(*args, **kwargs) + + with target(*dec_args, **dec_kwargs) as obj: + # insert our object into args + if kw_name and kw_name not in kwargs: + kwargs[kw_name] = obj + elif kw_name and kw_name in kwargs: + raise TypeError( + f"{func.__name__}() got multiple values for" + f" argument '{kw_name}'" + ) + else: + args = args + (obj,) + + return func(*args, **kwargs) + + return wrapper + + return decorator + + def get_target(self, *args, **kwargs): + """Return the target function + + Allows modifying args/kwargs to be passed. + """ + return self._target + + def process_decorator_args(self, *args, **kwargs): + """Process args passed to the decorator. + + args not consumed by the decorator will be passed to the target factory + (Context/Socket constructor). + """ + kw_name = None + + if isinstance(kwargs.get('name'), str): + kw_name = kwargs.pop('name') + elif len(args) >= 1 and isinstance(args[0], str): + kw_name = args[0] + args = args[1:] + + return kw_name, args, kwargs + + +class _ContextDecorator(_Decorator): + """Decorator subclass for Contexts""" + + def __init__(self): + super().__init__(zmq.Context) + + +class _SocketDecorator(_Decorator): + """Decorator subclass for sockets + + Gets the context from other args. + """ + + def process_decorator_args(self, *args, **kwargs): + """Also grab context_name out of kwargs""" + kw_name, args, kwargs = super().process_decorator_args(*args, **kwargs) + self.context_name = kwargs.pop('context_name', 'context') + return kw_name, args, kwargs + + def get_target(self, *args, **kwargs): + """Get context, based on call-time args""" + context = self._get_context(*args, **kwargs) + return context.socket + + def _get_context(self, *args, **kwargs): + """ + Find the ``zmq.Context`` from ``args`` and ``kwargs`` at call time. + + First, if there is an keyword argument named ``context`` and it is a + ``zmq.Context`` instance , we will take it. + + Second, we check all the ``args``, take the first ``zmq.Context`` + instance. + + Finally, we will provide default Context -- ``zmq.Context.instance`` + + :return: a ``zmq.Context`` instance + """ + if self.context_name in kwargs: + ctx = kwargs[self.context_name] + + if isinstance(ctx, zmq.Context): + return ctx + + for arg in args: + if isinstance(arg, zmq.Context): + return arg + # not specified by any decorator + return zmq.Context.instance() + + +def context(*args, **kwargs): + """Decorator for adding a Context to a function. + + Usage:: + + @context() + def foo(ctx): + ... + + .. versionadded:: 15.3 + + :param str name: the keyword argument passed to decorated function + """ + return _ContextDecorator()(*args, **kwargs) + + +def socket(*args, **kwargs): + """Decorator for adding a socket to a function. + + Usage:: + + @socket(zmq.PUSH) + def foo(push): + ... + + .. versionadded:: 15.3 + + :param str name: the keyword argument passed to decorated function + :param str context_name: the keyword only argument to identify context + object + """ + return _SocketDecorator()(*args, **kwargs) diff --git a/.venv/lib/python3.11/site-packages/zmq/error.py b/.venv/lib/python3.11/site-packages/zmq/error.py new file mode 100644 index 0000000000000000000000000000000000000000..ddfa46353d27c38708addc95c438d40f00bed3a7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/zmq/error.py @@ -0,0 +1,212 @@ +"""0MQ Error classes and functions.""" + +# Copyright (C) PyZMQ Developers +# Distributed under the terms of the Modified BSD License. +from __future__ import annotations + +from errno import EINTR + + +class ZMQBaseError(Exception): + """Base exception class for 0MQ errors in Python.""" + + +class ZMQError(ZMQBaseError): + """Wrap an errno style error. + + Parameters + ---------- + errno : int + The ZMQ errno or None. If None, then ``zmq_errno()`` is called and + used. + msg : str + Description of the error or None. + """ + + errno: int | None = None + strerror: str + + def __init__(self, errno: int | None = None, msg: str | None = None): + """Wrap an errno style error. + + Parameters + ---------- + errno : int + The ZMQ errno or None. If None, then ``zmq_errno()`` is called and + used. + msg : string + Description of the error or None. + """ + from zmq.backend import strerror, zmq_errno + + if errno is None: + errno = zmq_errno() + if isinstance(errno, int): + self.errno = errno + if msg is None: + self.strerror = strerror(errno) + else: + self.strerror = msg + else: + if msg is None: + self.strerror = str(errno) + else: + self.strerror = msg + # flush signals, because there could be a SIGINT + # waiting to pounce, resulting in uncaught exceptions. + # Doing this here means getting SIGINT during a blocking + # libzmq call will raise a *catchable* KeyboardInterrupt + # PyErr_CheckSignals() + + def __str__(self) -> str: + return self.strerror + + def __repr__(self) -> str: + return f"{self.__class__.__name__}('{str(self)}')" + + +class ZMQBindError(ZMQBaseError): + """An error for ``Socket.bind_to_random_port()``. + + See Also + -------- + .Socket.bind_to_random_port + """ + + +class NotDone(ZMQBaseError): + """Raised when timeout is reached while waiting for 0MQ to finish with a Message + + See Also + -------- + .MessageTracker.wait : object for tracking when ZeroMQ is done + """ + + +class ContextTerminated(ZMQError): + """Wrapper for zmq.ETERM + + .. versionadded:: 13.0 + """ + + def __init__(self, errno="ignored", msg="ignored"): + from zmq import ETERM + + super().__init__(ETERM) + + +class Again(ZMQError): + """Wrapper for zmq.EAGAIN + + .. versionadded:: 13.0 + """ + + def __init__(self, errno="ignored", msg="ignored"): + from zmq import EAGAIN + + super().__init__(EAGAIN) + + +class InterruptedSystemCall(ZMQError, InterruptedError): + """Wrapper for EINTR + + This exception should be caught internally in pyzmq + to retry system calls, and not propagate to the user. + + .. versionadded:: 14.7 + """ + + errno = EINTR + strerror: str + + def __init__(self, errno="ignored", msg="ignored"): + super().__init__(EINTR) + + def __str__(self): + s = super().__str__() + return s + ": This call should have been retried. Please report this to pyzmq." + + +def _check_rc(rc, errno=None, error_without_errno=True): + """internal utility for checking zmq return condition + + and raising the appropriate Exception class + """ + if rc == -1: + if errno is None: + from zmq.backend import zmq_errno + + errno = zmq_errno() + if errno == 0 and not error_without_errno: + return + from zmq import EAGAIN, ETERM + + if errno == EINTR: + raise InterruptedSystemCall(errno) + elif errno == EAGAIN: + raise Again(errno) + elif errno == ETERM: + raise ContextTerminated(errno) + else: + raise ZMQError(errno) + + +_zmq_version_info = None +_zmq_version = None + + +class ZMQVersionError(NotImplementedError): + """Raised when a feature is not provided by the linked version of libzmq. + + .. versionadded:: 14.2 + """ + + min_version = None + + def __init__(self, min_version: str, msg: str = "Feature"): + global _zmq_version + if _zmq_version is None: + from zmq import zmq_version + + _zmq_version = zmq_version() + self.msg = msg + self.min_version = min_version + self.version = _zmq_version + + def __repr__(self): + return f"ZMQVersionError('{str(self)}')" + + def __str__(self): + return f"{self.msg} requires libzmq >= {self.min_version}, have {self.version}" + + +def _check_version( + min_version_info: tuple[int] | tuple[int, int] | tuple[int, int, int], + msg: str = "Feature", +): + """Check for libzmq + + raises ZMQVersionError if current zmq version is not at least min_version + + min_version_info is a tuple of integers, and will be compared against zmq.zmq_version_info(). + """ + global _zmq_version_info + if _zmq_version_info is None: + from zmq import zmq_version_info + + _zmq_version_info = zmq_version_info() + if _zmq_version_info < min_version_info: + min_version = ".".join(str(v) for v in min_version_info) + raise ZMQVersionError(min_version, msg) + + +__all__ = [ + "ZMQBaseError", + "ZMQBindError", + "ZMQError", + "NotDone", + "ContextTerminated", + "InterruptedSystemCall", + "Again", + "ZMQVersionError", +] diff --git a/.venv/lib/python3.11/site-packages/zmq/py.typed b/.venv/lib/python3.11/site-packages/zmq/py.typed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391