Spaces:

unitxt
/

metric

Running

App Files Files Community

Elron committed on Mar 24, 2024

Commit

72ea1b4

verified ·

1 Parent(s): 50db311

Upload loaders.py with huggingface_hub

Browse files

Files changed (1) hide show

loaders.py +38 -7

loaders.py CHANGED Viewed

@@ -203,8 +203,9 @@ class LoadCSV(Loader):
     files: Dict[str, str]
     chunksize: int = 1000
     _cache = InternalField(default_factory=dict)
-    loader_limit: int = None
     streaming: bool = True
     def stream_csv(self, file):
         if self.get_limit() is not None:
@@ -214,7 +215,7 @@ class LoadCSV(Loader):
             chunksize = self.chunksize
         row_count = 0
-        for chunk in pd.read_csv(file, chunksize=chunksize):
             for _, row in chunk.iterrows():
                 if self.get_limit() is not None and row_count >= self.get_limit():
                     return
@@ -225,9 +226,9 @@ class LoadCSV(Loader):
         if file not in self._cache:
             if self.get_limit() is not None:
                 self.log_limited_loading()
-                self._cache[file] = pd.read_csv(file, nrows=self.get_limit()).to_dict(
-                    "records"
-                )
             else:
                 self._cache[file] = pd.read_csv(file).to_dict("records")
@@ -250,11 +251,41 @@ class LoadCSV(Loader):
         )
 class MissingKaggleCredentialsError(ValueError):
     """Signal that Kaggle credentials are missing (a ``ValueError`` subclass).
     NOTE(review): the raise site is not visible here; confirm the exact trigger.
     """
-# TODO write how to obtain kaggle credentials
 class LoadFromKaggle(Loader):
     url: str
     _requirements_list: List[str] = ["opendatasets"]
@@ -375,7 +406,7 @@ class LoadFromIBMCloud(Loader):
         local_dir = os.path.join(
             self.cache_dir,
             self.bucket_name,
-            self.data_dir,
             f"loader_limit_{self.get_limit()}",
         )
         if not os.path.exists(local_dir):

     files: Dict[str, str]
     chunksize: int = 1000
     _cache = InternalField(default_factory=dict)
+    loader_limit: Optional[int] = None
     streaming: bool = True
+    sep: str = ","
     def stream_csv(self, file):
         if self.get_limit() is not None:
             chunksize = self.chunksize
         row_count = 0
+        for chunk in pd.read_csv(file, chunksize=chunksize, sep=self.sep):
             for _, row in chunk.iterrows():
                 if self.get_limit() is not None and row_count >= self.get_limit():
                     return
         if file not in self._cache:
             if self.get_limit() is not None:
                 self.log_limited_loading()
+                self._cache[file] = pd.read_csv(
+                    file, nrows=self.get_limit(), sep=self.sep
+                ).to_dict("records")
             else:
                 self._cache[file] = pd.read_csv(file).to_dict("records")
         )
+class LoadFromSklearn(Loader):
+    dataset_name: str
+    splits: List[str] = ["train", "test"]
+    _requirements_list: List[str] = ["sklearn", "pandas"]
+    def verify(self):
+        super().verify()
+        if self.streaming:
+            raise NotImplementedError("LoadFromSklearn cannot load with streaming.")
+    def prepare(self):
+        super().prepare()
+        from sklearn import datasets as sklearn_datatasets
+        self.downloader = getattr(sklearn_datatasets, f"fetch_{self.dataset_name}")
+    def process(self):
+        with TemporaryDirectory() as temp_directory:
+            for split in self.splits:
+                split_data = self.downloader(subset=split)
+                targets = [split_data["target_names"][t] for t in split_data["target"]]
+                df = pd.DataFrame([split_data["data"], targets]).T
+                df.columns = ["data", "target"]
+                df.to_csv(os.path.join(temp_directory, f"{split}.csv"), index=None)
+            dataset = hf_load_dataset(temp_directory, streaming=False)
+        return MultiStream.from_iterables(dataset)
 class MissingKaggleCredentialsError(ValueError):
     """Signal that Kaggle credentials are missing (a ``ValueError`` subclass).
     NOTE(review): the raise site is not visible here; confirm the exact trigger.
     """
 class LoadFromKaggle(Loader):
     url: str
     _requirements_list: List[str] = ["opendatasets"]
         local_dir = os.path.join(
             self.cache_dir,
             self.bucket_name,
+            self.data_dir or "",  # data_dir can be None
             f"loader_limit_{self.get_limit()}",
         )
         if not os.path.exists(local_dir):