PeacebinfLow committed on
Commit
63f2753
·
verified ·
1 Parent(s): 5ca6b08

Update src/dataset_client.py

Browse files
Files changed (1) hide show
  1. src/dataset_client.py +75 -32
src/dataset_client.py CHANGED
@@ -1,50 +1,93 @@
1
  import requests
2
  from typing import Dict, List, Optional
3
 
 
4
  class DatasetAPIClient:
5
- """
6
- Lightweight wrapper around HuggingFace Datasets Server API.
7
- Docs-ish endpoints:
8
- /splits, /first-rows, /rows, /search, /filter
9
- """
10
  BASE_URL = "https://datasets-server.huggingface.co"
11
 
12
- def __init__(self, dataset: str):
 
 
 
 
 
13
  self.dataset = dataset
 
 
 
 
 
14
 
15
- def list_splits(self) -> List[str]:
16
  url = f"{self.BASE_URL}/splits"
17
  params = {"dataset": self.dataset}
18
- r = requests.get(url, params=params, timeout=30)
19
- r.raise_for_status()
20
- data = r.json()
21
- splits = data.get("splits", [])
22
- return [s.get("split") for s in splits if "split" in s]
 
 
 
 
 
23
 
24
- def get_first_rows(self, split: str = "train", limit: int = 100, config: str = "default") -> Dict:
25
  url = f"{self.BASE_URL}/first-rows"
26
- params = {"dataset": self.dataset, "config": config, "split": split}
27
- r = requests.get(url, params=params, timeout=30)
28
- r.raise_for_status()
29
- return r.json()
 
 
 
 
 
30
 
31
- def get_rows(self, split: str = "train", offset: int = 0, length: int = 25, config: str = "default") -> Dict:
32
  url = f"{self.BASE_URL}/rows"
33
- params = {"dataset": self.dataset, "config": config, "split": split, "offset": offset, "length": length}
34
- r = requests.get(url, params=params, timeout=30)
35
- r.raise_for_status()
36
- return r.json()
 
 
 
 
 
 
 
 
 
37
 
38
- def search_dataset(self, query: str, split: str = "train", config: str = "default") -> Dict:
39
  url = f"{self.BASE_URL}/search"
40
- params = {"dataset": self.dataset, "config": config, "split": split, "query": query}
41
- r = requests.get(url, params=params, timeout=30)
42
- r.raise_for_status()
43
- return r.json()
 
 
 
 
 
 
 
 
44
 
45
- def filter_dataset(self, where: str, split: str = "train", config: str = "default") -> Dict:
46
  url = f"{self.BASE_URL}/filter"
47
- params = {"dataset": self.dataset, "config": config, "split": split, "where": where}
48
- r = requests.get(url, params=params, timeout=30)
49
- r.raise_for_status()
50
- return r.json()
 
 
 
 
 
 
 
 
 
1
  import requests
2
  from typing import Dict, List, Optional
3
 
4
+
5
  class DatasetAPIClient:
6
+ """Client for HuggingFace Datasets Server API"""
7
+
 
 
 
8
  BASE_URL = "https://datasets-server.huggingface.co"
9
 
10
+ def __init__(
11
+ self,
12
+ dataset: str = "PeacebinfLow/mindseye-lab-ledger",
13
+ config: str = "default",
14
+ timeout: int = 15,
15
+ ):
16
  self.dataset = dataset
17
+ self.config = config
18
+ self.timeout = timeout
19
+
20
+ self.session = requests.Session()
21
+ self.session.headers.update({"User-Agent": "MindsEye-Lab/1.0"})
22
 
23
+ def list_splits(self) -> Dict:
24
  url = f"{self.BASE_URL}/splits"
25
  params = {"dataset": self.dataset}
26
+ try:
27
+ r = self.session.get(url, params=params, timeout=self.timeout)
28
+ r.raise_for_status()
29
+ data = r.json()
30
+ return {
31
+ "splits": [s.get("split") for s in data.get("splits", []) if s.get("split")],
32
+ "raw": data,
33
+ }
34
+ except Exception as e:
35
+ return {"error": str(e), "splits": []}
36
 
37
+ def get_first_rows(self, split: str = "train", limit: int = 100) -> Dict:
38
  url = f"{self.BASE_URL}/first-rows"
39
+ params = {"dataset": self.dataset, "config": self.config, "split": split}
40
+ try:
41
+ r = self.session.get(url, params=params, timeout=self.timeout)
42
+ r.raise_for_status()
43
+ data = r.json()
44
+ # datasets-server controls limit in response; keep stable output
45
+ return data
46
+ except Exception as e:
47
+ return {"error": str(e), "rows": []}
48
 
49
+ def get_rows(self, split: str = "train", offset: int = 0, length: int = 25) -> Dict:
50
  url = f"{self.BASE_URL}/rows"
51
+ params = {
52
+ "dataset": self.dataset,
53
+ "config": self.config,
54
+ "split": split,
55
+ "offset": offset,
56
+ "length": length,
57
+ }
58
+ try:
59
+ r = self.session.get(url, params=params, timeout=self.timeout)
60
+ r.raise_for_status()
61
+ return r.json()
62
+ except Exception as e:
63
+ return {"error": str(e), "rows": []}
64
 
65
+ def search_dataset(self, query: str, split: str = "train") -> Dict:
66
  url = f"{self.BASE_URL}/search"
67
+ params = {
68
+ "dataset": self.dataset,
69
+ "config": self.config,
70
+ "split": split,
71
+ "query": query,
72
+ }
73
+ try:
74
+ r = self.session.get(url, params=params, timeout=self.timeout)
75
+ r.raise_for_status()
76
+ return r.json()
77
+ except Exception as e:
78
+ return {"error": str(e), "rows": []}
79
 
80
+ def filter_dataset(self, where: str, split: str = "train") -> Dict:
81
  url = f"{self.BASE_URL}/filter"
82
+ params = {
83
+ "dataset": self.dataset,
84
+ "config": self.config,
85
+ "split": split,
86
+ "where": where,
87
+ }
88
+ try:
89
+ r = self.session.get(url, params=params, timeout=self.timeout)
90
+ r.raise_for_status()
91
+ return r.json()
92
+ except Exception as e:
93
+ return {"error": str(e), "rows": []}