File size: 3,553 Bytes
cf6c27e
c17899d
cf6c27e
c17899d
63f2753
c17899d
cf6c27e
63f2753
c17899d
 
63f2753
 
 
 
cf6c27e
 
63f2753
c17899d
63f2753
 
 
cf6c27e
 
 
 
 
 
 
 
63f2753
 
c17899d
cf6c27e
 
 
 
 
 
63f2753
 
cf6c27e
 
 
 
 
 
 
 
 
 
 
 
63f2753
cf6c27e
 
 
 
63f2753
cf6c27e
 
 
c17899d
63f2753
cf6c27e
 
 
 
 
 
 
 
 
 
c17899d
63f2753
cf6c27e
 
 
 
 
 
 
 
 
 
c17899d
63f2753
cf6c27e
 
 
 
 
 
 
 
 
c17899d
63f2753
cf6c27e
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
import requests
from typing import Dict, Optional


class DatasetAPIClient:
    """Client for HuggingFace Datasets Server API (supports gated datasets via HF_TOKEN).

    Every request method returns a dict and does not raise for transport,
    HTTP-status or JSON-decoding failures: on failure the dict carries an
    ``"error"`` key plus diagnostic fields (``"url"``, ``"hint"`` and,
    when a response was received, ``"status_code"`` / ``"response_text"``)
    so the caller can surface the cause directly.
    """

    BASE_URL = "https://datasets-server.huggingface.co"

    def __init__(
        self,
        dataset: str = "PeacebinfLow/mindseye-lab-ledger",
        config: str = "default",
        timeout: int = 20,
        token: Optional[str] = None,
    ) -> None:
        """Create a client bound to one dataset/config pair.

        Args:
            dataset: Dataset identifier sent as the ``dataset`` query parameter.
            config: Value sent as the ``config`` query parameter.
            timeout: Per-request timeout passed to ``session.get``.
            token: Access token. When falsy, the first non-empty value among
                the ``HF_TOKEN``, ``HUGGINGFACEHUB_API_TOKEN`` and
                ``HUGGINGFACE_TOKEN`` environment variables is used instead.
        """
        self.dataset = dataset
        self.config = config
        self.timeout = timeout

        # Prefer explicit token arg, otherwise env
        self.token = (
            token
            or os.getenv("HF_TOKEN")
            or os.getenv("HUGGINGFACEHUB_API_TOKEN")
            or os.getenv("HUGGINGFACE_TOKEN")
        )

        self.session = requests.Session()
        self.session.headers.update({"User-Agent": "MindsEye-Lab/1.0"})

        # Attach auth header if token exists
        if self.token:
            self.session.headers.update({"Authorization": f"Bearer {self.token}"})

    def _get(self, path: str, params: Dict) -> Dict:
        """Issue a GET against ``BASE_URL + path`` and return the decoded JSON.

        Args:
            path: Endpoint path beginning with ``/`` (for example ``"/rows"``).
            params: Query-string parameters for the request.

        Returns:
            The decoded JSON body on success. Otherwise a diagnostic dict with
            an ``"error"`` key describing which stage failed: the transport
            call, the HTTP status, or JSON decoding of the body.
        """
        url = f"{self.BASE_URL}{path}"

        # Stage 1: transport. Kept deliberately broad so that no failure while
        # sending the request can escape to the caller as an exception.
        try:
            r = self.session.get(url, params=params, timeout=self.timeout)
        except Exception as e:
            return {
                "error": str(e),
                "url": url,
                "params": params,
                "hint": "Network or timeout issue. Try again or increase timeout.",
            }

        # Stage 2: HTTP status.
        # If dataset is gated/private, you may see 401/403/404-ish responses
        # Return diagnostic info to surface the real cause in the UI.
        if not r.ok:
            return {
                "error": "datasets-server request failed",
                "status_code": r.status_code,
                "url": r.url,
                "response_text": (r.text[:2000] if r.text else ""),
                "hint": "If this dataset is gated/private, set HF_TOKEN in Space Secrets and ensure your account has access.",
            }

        # Stage 3: decoding. A successful status with a non-JSON body is not a
        # network problem, so report it separately and keep the status code
        # and a body excerpt for diagnosis.
        try:
            return r.json()
        except ValueError as e:
            return {
                "error": f"datasets-server returned a non-JSON response: {e}",
                "status_code": r.status_code,
                "url": r.url,
                "response_text": (r.text[:2000] if r.text else ""),
                "hint": "The request succeeded but the body was not valid JSON. Inspect response_text and try again later.",
            }

    def list_splits(self) -> Dict:
        """Query ``/splits`` for the configured dataset."""
        return self._get("/splits", {"dataset": self.dataset})

    def get_first_rows(self, split: str = "train", limit: int = 100) -> Dict:
        """Query ``/first-rows`` and keep at most ``limit`` rows.

        Args:
            split: Value sent as the ``split`` query parameter.
            limit: Maximum number of entries kept in the response's ``"rows"``
                list. It is applied client-side and is not sent to the server.
                ``None`` disables truncation; negative values are treated as 0.

        Returns:
            The response dict. When it contains a list under ``"rows"``, a
            shallow copy with that list truncated is returned; error dicts
            and any other payload are returned unchanged.
        """
        result = self._get(
            "/first-rows",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
            },
        )

        rows = result.get("rows") if isinstance(result, dict) else None
        if limit is None or not isinstance(rows, list):
            return result
        # Clamp at 0 so a negative limit cannot turn into a from-the-end slice.
        return {**result, "rows": rows[: max(limit, 0)]}

    def get_rows(self, split: str = "train", offset: int = 0, length: int = 25) -> Dict:
        """Query ``/rows`` for a window of the split.

        Args:
            split: Value sent as the ``split`` query parameter.
            offset: Value sent as the ``offset`` query parameter.
            length: Value sent as the ``length`` query parameter.
        """
        return self._get(
            "/rows",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
                "offset": offset,
                "length": length,
            },
        )

    def search_dataset(self, query: str, split: str = "train") -> Dict:
        """Query ``/search`` with ``query`` against the given split."""
        return self._get(
            "/search",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
                "query": query,
            },
        )

    def filter_dataset(self, where: str, split: str = "train") -> Dict:
        """Query ``/filter`` with the ``where`` expression against the given split."""
        return self._get(
            "/filter",
            {
                "dataset": self.dataset,
                "config": self.config,
                "split": split,
                "where": where,
            },
        )