PeacebinfLow committed on
Commit
c17899d
·
verified ·
1 Parent(s): 02290e3

Create dataset_client.py

Browse files
Files changed (1) hide show
  1. src/dataset_client.py +50 -0
src/dataset_client.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from typing import Dict, List, Optional
3
+
4
class DatasetAPIClient:
    """
    Lightweight wrapper around the HuggingFace Datasets Server API.

    Docs-ish endpoints:
    /splits, /first-rows, /rows, /search, /filter

    Every public method issues one HTTP GET against ``BASE_URL`` and returns
    the decoded JSON body. Non-success HTTP statuses are surfaced through
    ``raise_for_status()`` on the response object.
    """

    BASE_URL = "https://datasets-server.huggingface.co"
    # Seconds to wait for a response unless the caller overrides it.
    DEFAULT_TIMEOUT = 30

    def __init__(
        self,
        dataset: str,
        timeout: float = DEFAULT_TIMEOUT,
        session: Optional["requests.Session"] = None,
    ):
        """
        Args:
            dataset: Dataset identifier sent as the ``dataset`` query
                parameter on every request.
            timeout: Per-request timeout in seconds (defaults to 30).
            session: Optional object exposing a ``requests``-compatible
                ``get(url, params=..., timeout=...)`` method, e.g. a
                ``requests.Session`` for connection reuse. When omitted,
                the module-level ``requests.get`` is used.
        """
        self.dataset = dataset
        self.timeout = timeout
        self._session = session

    def _get(self, endpoint: str, params: Dict) -> Dict:
        """Send a GET to ``BASE_URL/<endpoint>`` and return the parsed JSON body."""
        # Resolve the transport lazily so an injected session takes precedence
        # and the default behaviour stays a plain ``requests.get`` call.
        http = self._session if self._session is not None else requests
        response = http.get(
            f"{self.BASE_URL}/{endpoint}", params=params, timeout=self.timeout
        )
        response.raise_for_status()
        return response.json()

    def _split_params(self, split: str, config: str, **extra) -> Dict:
        """Build the query parameters shared by all split-scoped endpoints."""
        return {"dataset": self.dataset, "config": config, "split": split, **extra}

    def list_splits(self) -> List[str]:
        """
        Return the split names reported by ``/splits`` for this dataset.

        Entries without a ``"split"`` key are skipped. No de-duplication is
        performed, so a name appears once per entry returned by the server.
        """
        data = self._get("splits", {"dataset": self.dataset})
        return [entry["split"] for entry in data.get("splits", []) if "split" in entry]

    def get_first_rows(self, split: str = "train", limit: int = 100, config: str = "default") -> Dict:
        """
        Fetch the ``/first-rows`` payload for one split.

        Args:
            split: Split name to query.
            limit: Maximum number of entries kept in the payload's ``"rows"``
                list. The request itself carries no limit parameter, so the
                cap is applied client-side; ``None`` disables it and negative
                values are treated as 0.
            config: Configuration (subset) name to query.

        Returns:
            The decoded JSON payload, with ``"rows"`` truncated to ``limit``
            when that key holds a list.
        """
        data = self._get("first-rows", self._split_params(split, config))
        if limit is None or not isinstance(data, dict):
            return data
        rows = data.get("rows")
        if not isinstance(rows, list):
            return data
        # Copy instead of mutating so the decoded payload object is left intact.
        trimmed = dict(data)
        trimmed["rows"] = rows[: max(limit, 0)]
        return trimmed

    def get_rows(self, split: str = "train", offset: int = 0, length: int = 25, config: str = "default") -> Dict:
        """
        Fetch a slice of rows from ``/rows``.

        ``offset`` and ``length`` are forwarded unchanged as query parameters;
        the decoded JSON payload is returned as-is.
        """
        return self._get(
            "rows", self._split_params(split, config, offset=offset, length=length)
        )

    def search_dataset(self, query: str, split: str = "train", config: str = "default") -> Dict:
        """
        Query ``/search`` for one split, forwarding ``query`` unchanged.

        Returns the decoded JSON payload as-is.
        """
        return self._get("search", self._split_params(split, config, query=query))

    def filter_dataset(self, where: str, split: str = "train", config: str = "default") -> Dict:
        """
        Query ``/filter`` for one split, forwarding ``where`` unchanged.

        Returns the decoded JSON payload as-is.
        """
        return self._get("filter", self._split_params(split, config, where=where))