# mindseye-lab-space / src/dataset_client.py
# Page-header residue from the hosting site: "PeacebinfLow's picture"
# Commit: "Update src/dataset_client.py" (cf6c27e, verified)
import os
import requests
from typing import Dict, Optional
class DatasetAPIClient:
    """Client for the Hugging Face Datasets Server API.

    Gated or private datasets are supported by sending a bearer token, taken
    from the ``token`` argument or from the ``HF_TOKEN``,
    ``HUGGINGFACEHUB_API_TOKEN`` or ``HUGGINGFACE_TOKEN`` environment
    variables (checked in that order).

    Every request method returns the decoded JSON payload on success. On
    failure it returns a diagnostic dict containing an ``"error"`` key instead
    of raising, so callers (e.g. a UI) can display the cause directly.

    The client owns a ``requests.Session``; call :meth:`close` or use the
    client as a context manager to release its pooled connections.
    """

    BASE_URL = "https://datasets-server.huggingface.co"

    def __init__(
        self,
        dataset: str = "PeacebinfLow/mindseye-lab-ledger",
        config: str = "default",
        timeout: int = 20,
        token: Optional[str] = None,
    ):
        """Create a client bound to one dataset/config.

        Args:
            dataset: Dataset repository id sent as the ``dataset`` parameter.
            config: Dataset configuration name sent as the ``config`` parameter.
            timeout: Per-request timeout passed to ``requests``.
            token: Access token. When omitted (or empty), the environment
                variables listed in the class docstring are consulted.
        """
        self.dataset = dataset
        self.config = config
        self.timeout = timeout

        # Prefer the explicit token argument, otherwise fall back to the env.
        self.token = (
            token
            or os.getenv("HF_TOKEN")
            or os.getenv("HUGGINGFACEHUB_API_TOKEN")
            or os.getenv("HUGGINGFACE_TOKEN")
        )

        self.session = requests.Session()
        self.session.headers.update({"User-Agent": "MindsEye-Lab/1.0"})

        # Attach the auth header only when a token was actually found.
        if self.token:
            self.session.headers.update({"Authorization": f"Bearer {self.token}"})

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self.session.close()

    def __enter__(self) -> "DatasetAPIClient":
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the session on leaving the ``with`` block; never suppresses errors."""
        self.close()

    def _get(self, path: str, params: Dict) -> Dict:
        """Issue a GET request against ``BASE_URL + path`` and decode the JSON body.

        Args:
            path: Endpoint path beginning with ``/`` (e.g. ``"/rows"``).
            params: Query-string parameters for the request.

        Returns:
            The decoded JSON payload on success. Otherwise a diagnostic dict
            with an ``"error"`` key plus context (``"url"``, and either
            ``"params"`` or ``"status_code"``/``"response_text"``) and a
            human-readable ``"hint"``.
        """
        url = f"{self.BASE_URL}{path}"

        # Deliberately broad: this is the boundary towards the caller, which
        # expects a diagnostic dict rather than an exception.
        try:
            r = self.session.get(url, params=params, timeout=self.timeout)
        except Exception as e:
            return {
                "error": str(e),
                "url": url,
                "params": params,
                "hint": "Network or timeout issue. Try again or increase timeout.",
            }

        # If the dataset is gated/private you may see 401/403/404-ish responses.
        # Return diagnostic info to surface the real cause in the UI.
        if not r.ok:
            return {
                "error": "datasets-server request failed",
                "status_code": r.status_code,
                "url": r.url,
                "response_text": (r.text[:2000] if r.text else ""),
                "hint": "If this dataset is gated/private, set HF_TOKEN in Space Secrets and ensure your account has access.",
            }

        # A successful status with an undecodable body is not a network
        # problem, so report it separately and keep the body snippet.
        try:
            return r.json()
        except ValueError as e:
            return {
                "error": f"datasets-server returned a non-JSON response: {e}",
                "status_code": r.status_code,
                "url": r.url,
                "response_text": (r.text[:2000] if r.text else ""),
                "hint": "The request succeeded but the response body was not valid JSON. Try again later.",
            }

    def list_splits(self) -> Dict:
        """Query the ``/splits`` endpoint for this client's dataset."""
        return self._get("/splits", {"dataset": self.dataset})

    def get_first_rows(self, split: str = "train", limit: int = 100) -> Dict:
        """Query the ``/first-rows`` endpoint for ``split``.

        Args:
            split: Split name to query.
            limit: Maximum number of entries to keep in the response's
                ``"rows"`` list. No ``limit`` parameter is sent to the
                endpoint, so the cap is applied client-side. Negative or
                non-integer values leave the response untouched.

        Returns:
            The endpoint's payload (with ``"rows"`` capped to ``limit``), or
            the diagnostic dict produced by :meth:`_get`.
        """
        data = self._get(
            "/first-rows",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
            },
        )

        rows = data.get("rows") if isinstance(data, dict) else None
        if (
            isinstance(rows, list)
            and isinstance(limit, int)
            and 0 <= limit < len(rows)
        ):
            # Copy instead of mutating so the parsed payload stays intact.
            data = {**data, "rows": rows[:limit]}
        return data

    def get_rows(self, split: str = "train", offset: int = 0, length: int = 25) -> Dict:
        """Query the ``/rows`` endpoint for a slice of ``split``.

        Args:
            split: Split name to query.
            offset: Sent as the ``offset`` query parameter.
            length: Sent as the ``length`` query parameter.
        """
        return self._get(
            "/rows",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
                "offset": offset,
                "length": length,
            },
        )

    def search_dataset(self, query: str, split: str = "train") -> Dict:
        """Query the ``/search`` endpoint, sending ``query`` for ``split``."""
        return self._get(
            "/search",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
                "query": query,
            },
        )

    def filter_dataset(self, where: str, split: str = "train") -> Dict:
        """Query the ``/filter`` endpoint, sending ``where`` for ``split``."""
        return self._get(
            "/filter",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
                "where": where,
            },
        )