Spaces:
Sleeping
Sleeping
| import os | |
| import requests | |
| from typing import Dict, Optional | |
class DatasetAPIClient:
    """Client for HuggingFace Datasets Server API (supports gated datasets via HF_TOKEN).

    Every public method returns a dict. On success it is the decoded JSON
    body of the response; on failure it is a diagnostic dict containing an
    ``"error"`` key plus a ``"hint"`` meant to be shown to the user, so
    callers can render the result without wrapping calls in ``try``.
    """

    BASE_URL = "https://datasets-server.huggingface.co"

    def __init__(
        self,
        dataset: str = "PeacebinfLow/mindseye-lab-ledger",
        config: str = "default",
        timeout: int = 20,
        token: Optional[str] = None,
    ):
        """Store the target dataset/config and prepare an HTTP session.

        Args:
            dataset: Dataset identifier sent as the ``dataset`` query parameter.
            config: Value sent as the ``config`` query parameter.
            timeout: Timeout passed to every ``session.get`` call.
            token: Access token. When omitted, the first non-empty value of
                the ``HF_TOKEN``, ``HUGGINGFACEHUB_API_TOKEN`` and
                ``HUGGINGFACE_TOKEN`` environment variables is used.
        """
        self.dataset = dataset
        self.config = config
        self.timeout = timeout
        # Prefer explicit token arg, otherwise env
        self.token = (
            token
            or os.getenv("HF_TOKEN")
            or os.getenv("HUGGINGFACEHUB_API_TOKEN")
            or os.getenv("HUGGINGFACE_TOKEN")
        )
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": "MindsEye-Lab/1.0"})
        # Attach auth header if token exists
        if self.token:
            self.session.headers.update({"Authorization": f"Bearer {self.token}"})

    def _get(self, path: str, params: Dict) -> Dict:
        """Issue a GET against ``BASE_URL + path`` and return a dict.

        Args:
            path: Endpoint path appended verbatim to ``BASE_URL``.
            params: Query parameters forwarded to ``session.get``.

        Returns:
            The decoded JSON body when the response is OK and parses as JSON.
            Otherwise a diagnostic dict with an ``"error"`` and ``"hint"`` key;
            this method does not raise for transport, HTTP-status or
            JSON-decoding failures.
        """
        url = f"{self.BASE_URL}{path}"
        try:
            r = self.session.get(url, params=params, timeout=self.timeout)
        except Exception as e:
            # Deliberately broad: callers display the returned dict and rely
            # on this method never raising for a failed request.
            return {
                "error": str(e),
                "url": url,
                "params": params,
                "hint": "Network or timeout issue. Try again or increase timeout.",
            }

        # If dataset is gated/private, you may see 401/403/404-ish responses
        # Return diagnostic info to surface the real cause in the UI.
        if not r.ok:
            return {
                "error": "datasets-server request failed",
                "status_code": r.status_code,
                "url": r.url,
                "response_text": (r.text[:2000] if r.text else ""),
                "hint": "If this dataset is gated/private, set HF_TOKEN in Space Secrets and ensure your account has access.",
            }

        try:
            return r.json()
        except ValueError as e:
            # The request itself succeeded, so reporting this as a network
            # problem would be misleading; include the body excerpt instead.
            return {
                "error": f"datasets-server returned a non-JSON response: {e}",
                "status_code": r.status_code,
                "url": r.url,
                "response_text": (r.text[:2000] if r.text else ""),
                "hint": "The server responded, but the body was not valid JSON. Try again later.",
            }

    def list_splits(self) -> Dict:
        """Query ``/splits`` for the configured dataset."""
        return self._get("/splits", {"dataset": self.dataset})

    def get_first_rows(self, split: str = "train", limit: int = 100) -> Dict:
        """Query ``/first-rows`` and keep at most ``limit`` rows.

        Args:
            split: Value sent as the ``split`` query parameter.
            limit: Maximum number of entries kept in the response's ``"rows"``
                list. It is applied client-side and is not sent to the
                server. Negative or non-integer values disable truncation.

        Returns:
            The ``/first-rows`` payload (with ``"rows"`` shortened when it
            holds more than ``limit`` entries), or the diagnostic dict
            produced by ``_get`` unchanged.
        """
        payload = self._get(
            "/first-rows",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
            },
        )
        rows = payload.get("rows") if isinstance(payload, dict) else None
        if (
            isinstance(rows, list)
            and isinstance(limit, int)
            and 0 <= limit < len(rows)
        ):
            # Return a copy so the decoded payload itself is left untouched.
            return {**payload, "rows": rows[:limit]}
        return payload

    def get_rows(self, split: str = "train", offset: int = 0, length: int = 25) -> Dict:
        """Query ``/rows`` for a slice described by ``offset`` and ``length``."""
        return self._get(
            "/rows",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
                "offset": offset,
                "length": length,
            },
        )

    def search_dataset(self, query: str, split: str = "train") -> Dict:
        """Query ``/search``, sending ``query`` as the ``query`` parameter."""
        return self._get(
            "/search",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
                "query": query,
            },
        )

    def filter_dataset(self, where: str, split: str = "train") -> Dict:
        """Query ``/filter``, sending ``where`` as the ``where`` parameter."""
        return self._get(
            "/filter",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
                "where": where,
            },
        )