Spaces:

PeacebinfLow
/

mindseye-lab-space

Sleeping

App Files Community

PeacebinfLow committed on Jan 10

Commit

63f2753

verified ·

1 Parent(s): 5ca6b08

Update src/dataset_client.py

Browse files

Files changed (1) hide show

src/dataset_client.py +75 -32

src/dataset_client.py CHANGED Viewed

@@ -1,50 +1,93 @@
 import requests
 from typing import Dict, List, Optional
 class DatasetAPIClient:
-    """
-    Lightweight wrapper around HuggingFace Datasets Server API.
-    Docs-ish endpoints:
-      /splits, /first-rows, /rows, /search, /filter
-    """
     BASE_URL = "https://datasets-server.huggingface.co"
-    def __init__(self, dataset: str):
         self.dataset = dataset
-    def list_splits(self) -> List[str]:
         url = f"{self.BASE_URL}/splits"
         params = {"dataset": self.dataset}
-        r = requests.get(url, params=params, timeout=30)
-        r.raise_for_status()
-        data = r.json()
-        splits = data.get("splits", [])
-        return [s.get("split") for s in splits if "split" in s]
-    def get_first_rows(self, split: str = "train", limit: int = 100, config: str = "default") -> Dict:
         url = f"{self.BASE_URL}/first-rows"
-        params = {"dataset": self.dataset, "config": config, "split": split}
-        r = requests.get(url, params=params, timeout=30)
-        r.raise_for_status()
-        return r.json()
-    def get_rows(self, split: str = "train", offset: int = 0, length: int = 25, config: str = "default") -> Dict:
         url = f"{self.BASE_URL}/rows"
-        params = {"dataset": self.dataset, "config": config, "split": split, "offset": offset, "length": length}
-        r = requests.get(url, params=params, timeout=30)
-        r.raise_for_status()
-        return r.json()
-    def search_dataset(self, query: str, split: str = "train", config: str = "default") -> Dict:
         url = f"{self.BASE_URL}/search"
-        params = {"dataset": self.dataset, "config": config, "split": split, "query": query}
-        r = requests.get(url, params=params, timeout=30)
-        r.raise_for_status()
-        return r.json()
-    def filter_dataset(self, where: str, split: str = "train", config: str = "default") -> Dict:
         url = f"{self.BASE_URL}/filter"
-        params = {"dataset": self.dataset, "config": config, "split": split, "where": where}
-        r = requests.get(url, params=params, timeout=30)
-        r.raise_for_status()
-        return r.json()

 import requests
 from typing import Dict, List, Optional
 class DatasetAPIClient:
+    """Client for HuggingFace Datasets Server API"""
     BASE_URL = "https://datasets-server.huggingface.co"
+    def __init__(
+        self,
+        dataset: str = "PeacebinfLow/mindseye-lab-ledger",
+        config: str = "default",
+        timeout: int = 15,
+    ):
         self.dataset = dataset
+        self.config = config
+        self.timeout = timeout
+        self.session = requests.Session()
+        self.session.headers.update({"User-Agent": "MindsEye-Lab/1.0"})
+    def list_splits(self) -> Dict:
         url = f"{self.BASE_URL}/splits"
         params = {"dataset": self.dataset}
+        try:
+            r = self.session.get(url, params=params, timeout=self.timeout)
+            r.raise_for_status()
+            data = r.json()
+            return {
+                "splits": [s.get("split") for s in data.get("splits", []) if s.get("split")],
+                "raw": data,
+            }
+        except Exception as e:
+            return {"error": str(e), "splits": []}
+    def get_first_rows(self, split: str = "train", limit: int = 100) -> Dict:
         url = f"{self.BASE_URL}/first-rows"
+        params = {"dataset": self.dataset, "config": self.config, "split": split}
+        try:
+            r = self.session.get(url, params=params, timeout=self.timeout)
+            r.raise_for_status()
+            data = r.json()
+            # NOTE: `limit` is not forwarded to /first-rows; datasets-server decides how many rows come back, so the payload is returned unchanged
+            return data
+        except Exception as e:
+            return {"error": str(e), "rows": []}
+    def get_rows(self, split: str = "train", offset: int = 0, length: int = 25) -> Dict:
         url = f"{self.BASE_URL}/rows"
+        params = {
+            "dataset": self.dataset,
+            "config": self.config,
+            "split": split,
+            "offset": offset,
+            "length": length,
+        }
+        try:
+            r = self.session.get(url, params=params, timeout=self.timeout)
+            r.raise_for_status()
+            return r.json()
+        except Exception as e:
+            return {"error": str(e), "rows": []}
+    def search_dataset(self, query: str, split: str = "train") -> Dict:
         url = f"{self.BASE_URL}/search"
+        params = {
+            "dataset": self.dataset,
+            "config": self.config,
+            "split": split,
+            "query": query,
+        }
+        try:
+            r = self.session.get(url, params=params, timeout=self.timeout)
+            r.raise_for_status()
+            return r.json()
+        except Exception as e:
+            return {"error": str(e), "rows": []}
+    def filter_dataset(self, where: str, split: str = "train") -> Dict:
         url = f"{self.BASE_URL}/filter"
+        params = {
+            "dataset": self.dataset,
+            "config": self.config,
+            "split": split,
+            "where": where,
+        }
+        try:
+            r = self.session.get(url, params=params, timeout=self.timeout)
+            r.raise_for_status()
+            return r.json()
+        except Exception as e:
+            return {"error": str(e), "rows": []}