PeacebinfLow committed on
Commit
cf6c27e
·
verified ·
1 Parent(s): 69bbb9e

Update src/dataset_client.py

Browse files
Files changed (1) hide show
  1. src/dataset_client.py +76 -62
src/dataset_client.py CHANGED
@@ -1,9 +1,10 @@
 
1
  import requests
2
- from typing import Dict, List, Optional
3
 
4
 
5
  class DatasetAPIClient:
6
- """Client for HuggingFace Datasets Server API"""
7
 
8
  BASE_URL = "https://datasets-server.huggingface.co"
9
 
@@ -11,83 +12,96 @@ def __init__(
11
  self,
12
  dataset: str = "PeacebinfLow/mindseye-lab-ledger",
13
  config: str = "default",
14
- timeout: int = 15,
 
15
  ):
16
  self.dataset = dataset
17
  self.config = config
18
  self.timeout = timeout
19
 
 
 
 
 
 
 
 
 
20
  self.session = requests.Session()
21
  self.session.headers.update({"User-Agent": "MindsEye-Lab/1.0"})
22
 
23
- def list_splits(self) -> Dict:
24
- url = f"{self.BASE_URL}/splits"
25
- params = {"dataset": self.dataset}
 
 
 
26
  try:
27
  r = self.session.get(url, params=params, timeout=self.timeout)
28
- r.raise_for_status()
29
- data = r.json()
 
 
 
 
 
 
 
 
 
 
30
  return {
31
- "splits": [s.get("split") for s in data.get("splits", []) if s.get("split")],
32
- "raw": data,
 
 
33
  }
34
- except Exception as e:
35
- return {"error": str(e), "splits": []}
 
36
 
37
  def get_first_rows(self, split: str = "train", limit: int = 100) -> Dict:
38
- url = f"{self.BASE_URL}/first-rows"
39
- params = {"dataset": self.dataset, "config": self.config, "split": split}
40
- try:
41
- r = self.session.get(url, params=params, timeout=self.timeout)
42
- r.raise_for_status()
43
- data = r.json()
44
- # datasets-server controls limit in response; keep stable output
45
- return data
46
- except Exception as e:
47
- return {"error": str(e), "rows": []}
48
 
49
  def get_rows(self, split: str = "train", offset: int = 0, length: int = 25) -> Dict:
50
- url = f"{self.BASE_URL}/rows"
51
- params = {
52
- "dataset": self.dataset,
53
- "config": self.config,
54
- "split": split,
55
- "offset": offset,
56
- "length": length,
57
- }
58
- try:
59
- r = self.session.get(url, params=params, timeout=self.timeout)
60
- r.raise_for_status()
61
- return r.json()
62
- except Exception as e:
63
- return {"error": str(e), "rows": []}
64
 
65
  def search_dataset(self, query: str, split: str = "train") -> Dict:
66
- url = f"{self.BASE_URL}/search"
67
- params = {
68
- "dataset": self.dataset,
69
- "config": self.config,
70
- "split": split,
71
- "query": query,
72
- }
73
- try:
74
- r = self.session.get(url, params=params, timeout=self.timeout)
75
- r.raise_for_status()
76
- return r.json()
77
- except Exception as e:
78
- return {"error": str(e), "rows": []}
79
 
80
  def filter_dataset(self, where: str, split: str = "train") -> Dict:
81
- url = f"{self.BASE_URL}/filter"
82
- params = {
83
- "dataset": self.dataset,
84
- "config": self.config,
85
- "split": split,
86
- "where": where,
87
- }
88
- try:
89
- r = self.session.get(url, params=params, timeout=self.timeout)
90
- r.raise_for_status()
91
- return r.json()
92
- except Exception as e:
93
- return {"error": str(e), "rows": []}
 
1
+ import os
2
  import requests
3
+ from typing import Dict, Optional
4
 
5
 
6
  class DatasetAPIClient:
7
+ """Client for HuggingFace Datasets Server API (supports gated datasets via HF_TOKEN)."""
8
 
9
  BASE_URL = "https://datasets-server.huggingface.co"
10
 
 
12
  self,
13
  dataset: str = "PeacebinfLow/mindseye-lab-ledger",
14
  config: str = "default",
15
+ timeout: int = 20,
16
+ token: Optional[str] = None,
17
  ):
18
  self.dataset = dataset
19
  self.config = config
20
  self.timeout = timeout
21
 
22
+ # Prefer explicit token arg, otherwise env
23
+ self.token = (
24
+ token
25
+ or os.getenv("HF_TOKEN")
26
+ or os.getenv("HUGGINGFACEHUB_API_TOKEN")
27
+ or os.getenv("HUGGINGFACE_TOKEN")
28
+ )
29
+
30
  self.session = requests.Session()
31
  self.session.headers.update({"User-Agent": "MindsEye-Lab/1.0"})
32
 
33
+ # Attach auth header if token exists
34
+ if self.token:
35
+ self.session.headers.update({"Authorization": f"Bearer {self.token}"})
36
+
37
+ def _get(self, path: str, params: Dict) -> Dict:
38
+ url = f"{self.BASE_URL}{path}"
39
  try:
40
  r = self.session.get(url, params=params, timeout=self.timeout)
41
+ # If dataset is gated/private, you may see 401/403/404-ish responses
42
+ # Return diagnostic info to surface the real cause in the UI.
43
+ if not r.ok:
44
+ return {
45
+ "error": "datasets-server request failed",
46
+ "status_code": r.status_code,
47
+ "url": r.url,
48
+ "response_text": (r.text[:2000] if r.text else ""),
49
+ "hint": "If this dataset is gated/private, set HF_TOKEN in Space Secrets and ensure your account has access.",
50
+ }
51
+ return r.json()
52
+ except Exception as e:
53
  return {
54
+ "error": str(e),
55
+ "url": url,
56
+ "params": params,
57
+ "hint": "Network or timeout issue. Try again or increase timeout.",
58
  }
59
+
60
+ def list_splits(self) -> Dict:
61
+ return self._get("/splits", {"dataset": self.dataset})
62
 
63
  def get_first_rows(self, split: str = "train", limit: int = 100) -> Dict:
64
+ # NOTE: datasets-server first-rows doesn't use 'limit' param the same way across versions,
65
+ # but we keep it here for your app logic.
66
+ return self._get(
67
+ "/first-rows",
68
+ {
69
+ "dataset": self.dataset,
70
+ "config": self.config,
71
+ "split": split,
72
+ },
73
+ )
74
 
75
  def get_rows(self, split: str = "train", offset: int = 0, length: int = 25) -> Dict:
76
+ return self._get(
77
+ "/rows",
78
+ {
79
+ "dataset": self.dataset,
80
+ "config": self.config,
81
+ "split": split,
82
+ "offset": offset,
83
+ "length": length,
84
+ },
85
+ )
 
 
 
 
86
 
87
  def search_dataset(self, query: str, split: str = "train") -> Dict:
88
+ return self._get(
89
+ "/search",
90
+ {
91
+ "dataset": self.dataset,
92
+ "config": self.config,
93
+ "split": split,
94
+ "query": query,
95
+ },
96
+ )
 
 
 
 
97
 
98
  def filter_dataset(self, where: str, split: str = "train") -> Dict:
99
+ return self._get(
100
+ "/filter",
101
+ {
102
+ "dataset": self.dataset,
103
+ "config": self.config,
104
+ "split": split,
105
+ "where": where,
106
+ },
107
+ )