Spaces:

PeacebinfLow
/

mindseye-lab-space

Sleeping

App Files Files Community

mindseye-lab-space / src /dataset_client.py

PeacebinfLow

Update src/dataset_client.py

cf6c27e verified about 1 month ago

raw

history blame contribute delete

3.55 kB

	import os
	import requests
	from typing import Dict, Optional


	class DatasetAPIClient:
	    """Client for HuggingFace Datasets Server API (supports gated datasets via HF_TOKEN).

	    Every public query method returns the parsed JSON body on success.  On any
	    failure (HTTP error status, non-JSON body, network error) it returns a
	    diagnostic dict containing an ``"error"`` key instead of raising, so the
	    caller can surface the cause directly in the UI.

	    The client can be used as a context manager to release the underlying
	    HTTP session when done.
	    """

	    BASE_URL = "https://datasets-server.huggingface.co"

	    def __init__(
	        self,
	        dataset: str = "PeacebinfLow/mindseye-lab-ledger",
	        config: str = "default",
	        timeout: int = 20,
	        token: Optional[str] = None,
	    ):
	        """Create a client bound to one dataset/config.

	        Args:
	            dataset: Dataset repository id sent as the ``dataset`` parameter.
	            config: Dataset configuration sent as the ``config`` parameter.
	            timeout: Per-request timeout passed to the HTTP session.
	            token: Access token. When omitted, the environment variables
	                ``HF_TOKEN``, ``HUGGINGFACEHUB_API_TOKEN`` and
	                ``HUGGINGFACE_TOKEN`` are tried in that order.
	        """
	        self.dataset = dataset
	        self.config = config
	        self.timeout = timeout

	        # Prefer explicit token arg, otherwise env
	        raw_token = (
	            token
	            or os.getenv("HF_TOKEN")
	            or os.getenv("HUGGINGFACEHUB_API_TOKEN")
	            or os.getenv("HUGGINGFACE_TOKEN")
	        )
	        # Secrets pasted with stray whitespace/newlines would yield an invalid
	        # Authorization header; a blank token is treated as "no token".
	        self.token = (raw_token or "").strip() or None

	        self.session = requests.Session()
	        self.session.headers.update({"User-Agent": "MindsEye-Lab/1.0"})

	        # Attach auth header if token exists
	        if self.token:
	            self.session.headers.update({"Authorization": f"Bearer {self.token}"})

	    def __enter__(self):
	        """Return the client itself for use in a ``with`` statement."""
	        return self

	    def __exit__(self, exc_type, exc, tb):
	        """Close the HTTP session on leaving the ``with`` block."""
	        self.close()

	    def close(self) -> None:
	        """Release the underlying HTTP session."""
	        self.session.close()

	    def _get(self, path: str, params: Dict) -> Dict:
	        """Issue a GET request against ``BASE_URL + path`` and return a dict.

	        Args:
	            path: Endpoint path beginning with ``/`` (e.g. ``"/rows"``).
	            params: Query-string parameters.

	        Returns:
	            The parsed JSON body on success; otherwise a diagnostic dict with
	            an ``"error"`` key plus context (status code, URL, excerpt of the
	            response body, and a hint). This method does not raise.
	        """
	        url = f"{self.BASE_URL}{path}"
	        try:
	            r = self.session.get(url, params=params, timeout=self.timeout)
	            # If dataset is gated/private, you may see 401/403/404-ish responses
	            # Return diagnostic info to surface the real cause in the UI.
	            if not r.ok:
	                return {
	                    "error": "datasets-server request failed",
	                    "status_code": r.status_code,
	                    "url": r.url,
	                    "response_text": (r.text[:2000] if r.text else ""),
	                    "hint": "If this dataset is gated/private, set HF_TOKEN in Space Secrets and ensure your account has access.",
	                }
	            try:
	                return r.json()
	            except ValueError as e:
	                # A 2xx response whose body is not JSON is not a network
	                # problem; report it separately so the hint is not misleading.
	                return {
	                    "error": "datasets-server returned a non-JSON response",
	                    "status_code": r.status_code,
	                    "url": r.url,
	                    "response_text": (r.text[:2000] if r.text else ""),
	                    "detail": str(e),
	                    "hint": "The server replied successfully but the body could not be parsed as JSON. Try again later.",
	                }
	        except Exception as e:
	            # Deliberately broad: this is the boundary that converts any
	            # transport failure into a UI-friendly diagnostic.
	            return {
	                "error": str(e),
	                "url": url,
	                "params": params,
	                "hint": "Network or timeout issue. Try again or increase timeout.",
	            }

	    def list_splits(self) -> Dict:
	        """Query ``/splits`` for this client's dataset."""
	        return self._get("/splits", {"dataset": self.dataset})

	    def get_first_rows(self, split: str = "train", limit: int = 100) -> Dict:
	        """Query ``/first-rows`` and keep at most ``limit`` rows.

	        Args:
	            split: Split name sent as the ``split`` parameter.
	            limit: Maximum number of entries kept in the response's ``rows``
	                list. ``None`` disables trimming; negative values are treated
	                as zero.

	        Returns:
	            The response dict (or error dict) from ``_get``. When it carries a
	            ``rows`` list longer than ``limit``, a shallow copy with the list
	            truncated is returned.
	        """
	        # NOTE: no 'limit' parameter is sent to the endpoint; the limit is
	        # applied client-side to the returned rows instead.
	        result = self._get(
	            "/first-rows",
	            {
	                "dataset": self.dataset,
	                "config": self.config,
	                "split": split,
	            },
	        )
	        if limit is None or not isinstance(result, dict):
	            return result
	        rows = result.get("rows")
	        keep = max(limit, 0)
	        if isinstance(rows, list) and len(rows) > keep:
	            # Copy so the parsed response object is not mutated in place.
	            return {**result, "rows": rows[:keep]}
	        return result

	    def get_rows(self, split: str = "train", offset: int = 0, length: int = 25) -> Dict:
	        """Query ``/rows`` for a slice of ``length`` rows starting at ``offset``."""
	        return self._get(
	            "/rows",
	            {
	                "dataset": self.dataset,
	                "config": self.config,
	                "split": split,
	                "offset": offset,
	                "length": length,
	            },
	        )

	    def search_dataset(self, query: str, split: str = "train") -> Dict:
	        """Query ``/search`` with the given ``query`` text on ``split``."""
	        return self._get(
	            "/search",
	            {
	                "dataset": self.dataset,
	                "config": self.config,
	                "split": split,
	                "query": query,
	            },
	        )

	    def filter_dataset(self, where: str, split: str = "train") -> Dict:
	        """Query ``/filter`` with the given ``where`` expression on ``split``."""
	        return self._get(
	            "/filter",
	            {
	                "dataset": self.dataset,
	                "config": self.config,
	                "split": split,
	                "where": where,
	            },
	        )