Spaces:
Sleeping
Sleeping
File size: 3,553 Bytes
cf6c27e c17899d cf6c27e c17899d 63f2753 c17899d cf6c27e 63f2753 c17899d 63f2753 cf6c27e 63f2753 c17899d 63f2753 cf6c27e 63f2753 c17899d cf6c27e 63f2753 cf6c27e 63f2753 cf6c27e 63f2753 cf6c27e c17899d 63f2753 cf6c27e c17899d 63f2753 cf6c27e c17899d 63f2753 cf6c27e c17899d 63f2753 cf6c27e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 | import os
import requests
from typing import Dict, Optional
class DatasetAPIClient:
    """Client for HuggingFace Datasets Server API (supports gated datasets via HF_TOKEN).

    Request failures (HTTP errors, network errors, undecodable bodies) are not
    raised to the caller; they are reported as a dict containing an "error" key
    plus diagnostic fields, so a UI can display the real cause.
    """

    BASE_URL = "https://datasets-server.huggingface.co"

    def __init__(
        self,
        dataset: str = "PeacebinfLow/mindseye-lab-ledger",
        config: str = "default",
        timeout: int = 20,
        token: Optional[str] = None,
    ):
        """Store the request settings and prepare a reusable HTTP session.

        Args:
            dataset: Dataset identifier sent as the ``dataset`` query parameter.
            config: Dataset configuration sent as the ``config`` query parameter.
            timeout: Timeout passed to every request made by this client.
            token: Access token. When falsy, the environment variables
                ``HF_TOKEN``, ``HUGGINGFACEHUB_API_TOKEN`` and
                ``HUGGINGFACE_TOKEN`` are consulted in that order.
        """
        self.dataset = dataset
        self.config = config
        self.timeout = timeout

        # Prefer explicit token arg, otherwise env
        self.token = (
            token
            or os.getenv("HF_TOKEN")
            or os.getenv("HUGGINGFACEHUB_API_TOKEN")
            or os.getenv("HUGGINGFACE_TOKEN")
        )

        self.session = requests.Session()
        self.session.headers.update({"User-Agent": "MindsEye-Lab/1.0"})

        # Attach auth header if token exists
        if self.token:
            self.session.headers.update({"Authorization": f"Bearer {self.token}"})

    def _get(self, path: str, params: Dict) -> Dict:
        """Issue a GET request against ``BASE_URL + path`` and decode the JSON body.

        Args:
            path: Endpoint path appended to ``BASE_URL`` (for example ``"/rows"``).
            params: Query parameters for the request.

        Returns:
            The decoded JSON payload on success. Otherwise a diagnostic dict
            with an "error" key: one shape for non-OK HTTP statuses, one for
            bodies that are not valid JSON, and one for any exception raised
            while performing the request.
        """
        url = f"{self.BASE_URL}{path}"
        try:
            response = self.session.get(url, params=params, timeout=self.timeout)

            # If dataset is gated/private, you may see 401/403/404-ish responses
            # Return diagnostic info to surface the real cause in the UI.
            if not response.ok:
                return {
                    "error": "datasets-server request failed",
                    "status_code": response.status_code,
                    "url": response.url,
                    "response_text": (response.text[:2000] if response.text else ""),
                    "hint": "If this dataset is gated/private, set HF_TOKEN in Space Secrets and ensure your account has access.",
                }

            try:
                return response.json()
            except ValueError as exc:
                # A body that cannot be decoded is not a network problem, so
                # report it separately and keep the status code and an excerpt
                # of the body for diagnosis.
                return {
                    "error": f"datasets-server returned a non-JSON response: {exc}",
                    "status_code": response.status_code,
                    "url": response.url,
                    "response_text": (response.text[:2000] if response.text else ""),
                    "hint": "The server replied, but the body was not valid JSON. Try again later.",
                }
        except Exception as exc:
            # Deliberately broad: this is the boundary where any failure is
            # converted into a diagnostic dict instead of propagating.
            return {
                "error": str(exc),
                "url": url,
                "params": params,
                "hint": "Network or timeout issue. Try again or increase timeout.",
            }

    def list_splits(self) -> Dict:
        """Query the ``/splits`` endpoint for this client's dataset."""
        return self._get("/splits", {"dataset": self.dataset})

    def get_first_rows(self, split: str = "train", limit: int = 100) -> Dict:
        """Query the ``/first-rows`` endpoint, returning at most ``limit`` rows.

        Args:
            split: Split name sent as the ``split`` query parameter.
            limit: Maximum number of entries to keep in the payload's "rows"
                list. It is not sent to the server; the cap is applied to the
                decoded response. Negative or non-integer values disable it.

        Returns:
            The payload from ``_get``. When it is a dict whose "rows" value is
            a list longer than ``limit``, a shallow copy with that list
            truncated is returned; error dicts and other payloads pass through
            unchanged.
        """
        # NOTE: datasets-server first-rows doesn't use 'limit' param the same way across versions,
        # so it is not part of the request; it is honored on the response below.
        payload = self._get(
            "/first-rows",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
            },
        )

        rows = payload.get("rows") if isinstance(payload, dict) else None
        if isinstance(rows, list) and isinstance(limit, int) and 0 <= limit < len(rows):
            return {**payload, "rows": rows[:limit]}
        return payload

    def get_rows(self, split: str = "train", offset: int = 0, length: int = 25) -> Dict:
        """Query the ``/rows`` endpoint with the given ``offset`` and ``length``."""
        return self._get(
            "/rows",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
                "offset": offset,
                "length": length,
            },
        )

    def search_dataset(self, query: str, split: str = "train") -> Dict:
        """Query the ``/search`` endpoint, sending ``query`` as the ``query`` parameter."""
        return self._get(
            "/search",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
                "query": query,
            },
        )

    def filter_dataset(self, where: str, split: str = "train") -> Dict:
        """Query the ``/filter`` endpoint, sending ``where`` as the ``where`` parameter."""
        return self._get(
            "/filter",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
                "where": where,
            },
        )
|