import os
from typing import Any, Dict, Optional

import requests


class DatasetAPIClient:
    """Client for HuggingFace Datasets Server API (supports gated datasets via HF_TOKEN).

    Every request method returns the decoded JSON payload on success. On
    failure it returns a diagnostic dict containing an ``"error"`` key instead
    of raising, so the caller can surface the cause directly (e.g. in a UI).
    """

    BASE_URL = "https://datasets-server.huggingface.co"

    def __init__(
        self,
        dataset: str = "PeacebinfLow/mindseye-lab-ledger",
        config: str = "default",
        timeout: int = 20,
        token: Optional[str] = None,
    ):
        """Store the request defaults and prepare a shared HTTP session.

        Args:
            dataset: Dataset identifier sent as the ``dataset`` query parameter.
            config: Value sent as the ``config`` query parameter.
            timeout: Timeout passed to every ``session.get`` call.
            token: Access token. When omitted or empty, the environment
                variables ``HF_TOKEN``, ``HUGGINGFACEHUB_API_TOKEN`` and
                ``HUGGINGFACE_TOKEN`` are consulted in that order.
        """
        self.dataset = dataset
        self.config = config
        self.timeout = timeout

        # Prefer explicit token arg, otherwise env
        self.token = (
            token
            or os.getenv("HF_TOKEN")
            or os.getenv("HUGGINGFACEHUB_API_TOKEN")
            or os.getenv("HUGGINGFACE_TOKEN")
        )

        self.session = requests.Session()
        self.session.headers.update({"User-Agent": "MindsEye-Lab/1.0"})

        # Attach auth header if token exists
        if self.token:
            self.session.headers.update({"Authorization": f"Bearer {self.token}"})

    def _get(self, path: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """Send a GET request to ``BASE_URL + path`` and decode the JSON reply.

        Args:
            path: Endpoint path appended to ``BASE_URL`` (e.g. ``"/rows"``).
            params: Query-string parameters for the request.

        Returns:
            The decoded JSON body on success; otherwise a diagnostic dict
            with an ``"error"`` key describing what went wrong.
        """
        url = f"{self.BASE_URL}{path}"

        # Deliberately broad: callers expect a diagnostic dict, never an
        # exception, whatever goes wrong while sending the request.
        try:
            r = self.session.get(url, params=params, timeout=self.timeout)
        except Exception as e:
            return {
                "error": str(e),
                "url": url,
                "params": params,
                "hint": "Network or timeout issue. Try again or increase timeout.",
            }

        # If dataset is gated/private, you may see 401/403/404-ish responses.
        # Return diagnostic info to surface the real cause in the UI.
        if not r.ok:
            return {
                "error": "datasets-server request failed",
                "status_code": r.status_code,
                "url": r.url,
                "response_text": (r.text[:2000] if r.text else ""),
                "hint": "If this dataset is gated/private, set HF_TOKEN in Space Secrets and ensure your account has access.",
            }

        # A successful status with an undecodable body is not a network
        # problem, so report it separately with the evidence attached.
        try:
            return r.json()
        except ValueError as e:
            return {
                "error": str(e),
                "status_code": r.status_code,
                "url": r.url,
                "params": params,
                "response_text": (r.text[:2000] if r.text else ""),
                "hint": "Server response was not valid JSON. Try again later.",
            }

    def list_splits(self) -> Dict[str, Any]:
        """Query the ``/splits`` endpoint for the configured dataset."""
        return self._get("/splits", {"dataset": self.dataset})

    def get_first_rows(self, split: str = "train", limit: int = 100) -> Dict[str, Any]:
        """Query the ``/first-rows`` endpoint for one split.

        Args:
            split: Value sent as the ``split`` query parameter.
            limit: Maximum number of entries to keep in the payload's
                ``"rows"`` list. ``None`` or a negative value disables the cap.

        Returns:
            The endpoint payload (or the diagnostic error dict from ``_get``),
            with ``"rows"`` cut down to ``limit`` entries when present.
        """
        # NOTE: 'limit' is not forwarded to the server (the original note says
        # first-rows does not treat it consistently across versions); it is
        # applied client-side to the returned rows instead.
        result = self._get(
            "/first-rows",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
            },
        )

        rows = result.get("rows") if isinstance(result, dict) else None
        if isinstance(rows, list) and limit is not None and limit >= 0:
            # Build a new dict rather than mutating the decoded payload.
            result = {**result, "rows": rows[:limit]}
        return result

    def get_rows(self, split: str = "train", offset: int = 0, length: int = 25) -> Dict[str, Any]:
        """Query the ``/rows`` endpoint for a slice of one split.

        Args:
            split: Value sent as the ``split`` query parameter.
            offset: Value sent as the ``offset`` query parameter.
            length: Value sent as the ``length`` query parameter.
        """
        return self._get(
            "/rows",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
                "offset": offset,
                "length": length,
            },
        )

    def search_dataset(self, query: str, split: str = "train") -> Dict[str, Any]:
        """Query the ``/search`` endpoint, sending ``query`` for one split."""
        return self._get(
            "/search",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
                "query": query,
            },
        )

    def filter_dataset(self, where: str, split: str = "train") -> Dict[str, Any]:
        """Query the ``/filter`` endpoint, sending ``where`` for one split."""
        return self._get(
            "/filter",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
                "where": where,
            },
        )