KhalilGuetari committed
Commit c7dd7b8 · 1 Parent(s): 11df203

Implement client and dataset service

.kiro/specs/hf-eda-mcp-server/tasks.md CHANGED
@@ -7,13 +7,13 @@
   - _Requirements: 3.1, 4.1, 4.2_

 - [ ] 2. Implement HuggingFace integration layer
-- [ ] 2.1 Create HuggingFace client wrapper
+- [x] 2.1 Create HuggingFace client wrapper
   - Write HfClient class to handle authentication and API interactions
   - Implement dataset info retrieval using huggingface_hub
   - Add error handling for authentication and network issues
   - _Requirements: 1.2, 4.3_

-- [ ] 2.2 Implement dataset service with caching
+- [x] 2.2 Implement dataset service with caching
   - Create DatasetService class for centralized dataset operations
   - Add metadata caching to reduce API calls
   - Implement dataset loading and sampling functionality
.kiro/steering/tech.md CHANGED
@@ -30,6 +30,13 @@ ruff check .
 ruff format .
 ```

+Use pdm to run tests or scripts once they are defined in pyproject.toml.
+
+```bash
+# Example: run the server
+pdm run hf-eda-mcp
+```
+
 ## MCP Integration
 - Designed to run as an MCP server
 - Provides tools accessible to MCP-compatible AI systems
pyproject.toml CHANGED
@@ -22,8 +22,9 @@ requires = ["pdm-backend"]
 build-backend = "pdm.backend"


-[project.scripts]
-hf-eda-mcp = "hf_eda_mcp.server:launch_server"
+[tool.pdm.scripts]
+hf-eda-mcp = "python -m hf_eda_mcp"
+hf_client_playground = "python -m scripts.playground.hf_client_playground"

 [tool.pdm]
 distribution = true
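Note the semantic shift here: `[project.scripts]` entries are installed as console entry points, while `[tool.pdm.scripts]` entries are pdm tasks invoked through `pdm run`. A minimal usage sketch based on the script names defined above:

```bash
# Invoke the pdm-managed tasks from [tool.pdm.scripts]
pdm run hf-eda-mcp               # launch the MCP server
pdm run hf_client_playground     # run the HfClient playground script
```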
scripts/__init__.py ADDED
(empty file)
scripts/playground/__init__.py ADDED
(empty file)
scripts/playground/hf_client_playground.py ADDED
@@ -0,0 +1,31 @@
+import logging
+from pprint import pprint
+from hf_eda_mcp.integrations.hf_client import HfClient
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(
+    filename="scripts.log",
+    encoding="utf-8",
+    level=logging.DEBUG,
+    filemode="w",
+    format="%(asctime)s - %(levelname)s - %(message)s",
+)
+
+
+def authenticate():
+    client = HfClient()
+    client._authenticate()
+    return client
+
+
+def get_dataset_info(client: HfClient, dataset_id: str = "squad"):
+    metadata = client.get_dataset_info(dataset_id)
+    logger.info("Fetched %s dataset", dataset_id)
+    pprint(metadata, indent=4)
+
+
+if __name__ == "__main__":
+    client = authenticate()
+
+    get_dataset_info(client=client, dataset_id="nyu-mll/glue")
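Note that `authenticate()` reaches into the private `_authenticate()` method on a token-less client, which only succeeds when a HuggingFace token is already cached locally (e.g. after `huggingface-cli login`). A minimal sketch of the public route, assuming a hypothetical `HF_TOKEN` environment variable is set:

```python
import os

from hf_eda_mcp.integrations.hf_client import HfClient

# Passing a token makes HfClient authenticate during __init__
# (see hf_client.py below), so no private-method call is needed.
client = HfClient(token=os.environ["HF_TOKEN"])
assert client.is_authenticated
```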
src/hf_eda_mcp/integrations/hf_client.py CHANGED
@@ -1,7 +1,258 @@
 """
 HuggingFace client wrapper for API interactions.

-This module will be implemented in task 2.1.
+This module provides a wrapper around HuggingFace Hub API for dataset operations,
+including authentication, dataset info retrieval, and error handling.
 """

-# Placeholder - will be implemented in task 2.1
+import logging
+from typing import Optional, Dict, Any, List
+from huggingface_hub import HfApi
+from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError
+from requests.exceptions import RequestException, ConnectionError, Timeout
+
+logger = logging.getLogger(__name__)
+
+
+class HfClientError(Exception):
+    """Base exception for HuggingFace client errors."""
+
+    pass
+
+
+class AuthenticationError(HfClientError):
+    """Raised when authentication fails."""
+
+    pass
+
+
+class DatasetNotFoundError(HfClientError):
+    """Raised when a dataset is not found."""
+
+    pass
+
+
+class NetworkError(HfClientError):
+    """Raised when network operations fail."""
+
+    pass
+
+
+class HfClient:
+    """
+    HuggingFace client wrapper for dataset operations.
+
+    Handles authentication, dataset info retrieval, and provides
+    comprehensive error handling for API interactions.
+    """
+
+    def __init__(self, token: Optional[str] = None):
+        """
+        Initialize HuggingFace client.
+
+        Args:
+            token: Optional HuggingFace authentication token
+        """
+        self.token = token
+        self.api = HfApi(token=token)
+        self._authenticated = False
+
+        if token:
+            self._authenticate()
+
+    def _authenticate(self) -> None:
+        """
+        Authenticate with HuggingFace Hub using the provided token.
+
+        Raises:
+            AuthenticationError: If authentication fails
+        """
+        try:
+            # Test authentication by getting user info
+            user_info = self.api.whoami()
+            self._authenticated = True
+            logger.info(
+                f"Successfully authenticated as {user_info.get('name', 'unknown')}"
+            )
+        except Exception as e:
+            logger.error(f"Authentication failed: {str(e)}")
+            raise AuthenticationError(
+                f"Failed to authenticate with HuggingFace Hub: {str(e)}"
+            )
+
+    def get_dataset_info(
+        self, dataset_id: str, config_name: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """
+        Retrieve comprehensive dataset information from HuggingFace Hub.
+
+        Args:
+            dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue')
+            config_name: Optional configuration name for multi-config datasets
+
+        Returns:
+            Dictionary containing dataset metadata including:
+            - Basic info (size, splits, features)
+            - Configuration details
+            - Download statistics
+            - Dataset card information
+
+        Raises:
+            DatasetNotFoundError: If dataset doesn't exist
+            AuthenticationError: If dataset is private and authentication fails
+            NetworkError: If network request fails
+        """
+        try:
+            # Get dataset info from HuggingFace Hub
+            dataset_info = self.api.dataset_info(repo_id=dataset_id, revision="main")
+
+            # Format the response
+            metadata = {
+                "id": dataset_info.id,
+                "author": dataset_info.author or "unknown",
+                "description": dataset_info.description or "",
+                "tags": dataset_info.tags or [],
+                "downloads": getattr(dataset_info, "downloads", 0),
+                "likes": getattr(dataset_info, "likes", 0),
+                "created_at": dataset_info.created_at.isoformat()
+                if dataset_info.created_at
+                else None,
+                "last_modified": dataset_info.last_modified.isoformat()
+                if dataset_info.last_modified
+                else None,
+                "size_bytes": getattr(dataset_info, "size_in_bytes", 0),
+                "configs": [],
+                "splits": {},
+                "features": {},
+            }
+
+            # Extract configuration information
+            if hasattr(dataset_info, "card_data") and dataset_info.card_data:
+                configs = getattr(dataset_info.card_data, "configs", [])
+                if configs:
+                    # Handle both dict and object configs
+                    config_names = []
+                    for config in configs:
+                        if hasattr(config, "config_name"):
+                            config_names.append(config.config_name)
+                        elif isinstance(config, dict) and "config_name" in config:
+                            config_names.append(config["config_name"])
+                    metadata["configs"] = config_names
+
+            # If no configs found in card_data, try to get from siblings
+            if not metadata["configs"] and dataset_info.siblings:
+                # Look for config files to infer configurations
+                config_files = [
+                    s.rfilename
+                    for s in dataset_info.siblings
+                    if s.rfilename.endswith(".json") and "/" in s.rfilename
+                ]
+                if config_files:
+                    metadata["configs"] = list(
+                        set([f.split("/")[0] for f in config_files])
+                    )
+
+            # Try to get more detailed info using datasets library approach
+            try:
+                from datasets import get_dataset_config_names, get_dataset_split_names
+
+                # Get available configurations
+                try:
+                    config_names = get_dataset_config_names(dataset_id)
+                    if config_names:
+                        metadata["configs"] = config_names
+                except Exception:
+                    # If we can't get config names, use what we have
+                    pass
+
+                # Get splits for the specified or default configuration
+                target_config = config_name or (
+                    metadata["configs"][0] if metadata["configs"] else None
+                )
+                if target_config:
+                    try:
+                        split_names = get_dataset_split_names(
+                            dataset_id, config_name=target_config
+                        )
+                        metadata["splits"] = {
+                            split: 0 for split in split_names
+                        }  # Size will be filled later
+                    except Exception:
+                        # If we can't get split info, continue without it
+                        pass
+
+            except ImportError:
+                logger.warning(
+                    "datasets library not available for detailed config info"
+                )
+
+            return metadata
+
+        except GatedRepoError:
+            # GatedRepoError subclasses RepositoryNotFoundError, so it must be
+            # caught first or the broader handler below would swallow it.
+            raise AuthenticationError(
+                f"Dataset '{dataset_id}' is private or gated. "
+                "Please provide a valid authentication token or request access."
+            )
+        except RepositoryNotFoundError:
+            raise DatasetNotFoundError(
+                f"Dataset '{dataset_id}' not found on HuggingFace Hub"
+            )
+        except (ConnectionError, Timeout) as e:
+            raise NetworkError(f"Network error while fetching dataset info: {str(e)}")
+        except RequestException as e:
+            raise NetworkError(f"Request failed: {str(e)}")
+        except Exception as e:
+            logger.error(
+                f"Unexpected error getting dataset info for {dataset_id}: {str(e)}"
+            )
+            raise HfClientError(f"Failed to get dataset info: {str(e)}")
+
+    def list_dataset_configs(self, dataset_id: str) -> List[str]:
+        """
+        List available configurations for a dataset.
+
+        Args:
+            dataset_id: HuggingFace dataset identifier
+
+        Returns:
+            List of configuration names
+
+        Raises:
+            DatasetNotFoundError: If dataset doesn't exist
+            NetworkError: If network request fails
+        """
+        try:
+            from datasets import get_dataset_config_names
+
+            return get_dataset_config_names(dataset_id)
+        except Exception:
+            # Fallback to getting info and extracting configs
+            dataset_info = self.get_dataset_info(dataset_id)
+            return dataset_info.get("configs", [])
+
+    def validate_dataset_access(
+        self, dataset_id: str, config_name: Optional[str] = None
+    ) -> bool:
+        """
+        Validate that a dataset can be accessed with current authentication.
+
+        Args:
+            dataset_id: HuggingFace dataset identifier
+            config_name: Optional configuration name
+
+        Returns:
+            True if dataset is accessible, False otherwise
+        """
+        try:
+            self.get_dataset_info(dataset_id, config_name)
+            return True
+        except (DatasetNotFoundError, AuthenticationError):
+            return False
+        except Exception:
+            # For other errors (network, etc.), assume dataset exists but there's a temporary issue
+            return True
+
+    @property
+    def is_authenticated(self) -> bool:
+        """Check if client is authenticated."""
+        return self._authenticated
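A minimal sketch of how the wrapper above might be used, relying only on the methods and exceptions it defines; the dataset id mirrors the playground script:

```python
from hf_eda_mcp.integrations.hf_client import (
    AuthenticationError,
    DatasetNotFoundError,
    HfClient,
    NetworkError,
)

client = HfClient()  # anonymous access is enough for public datasets

try:
    info = client.get_dataset_info("nyu-mll/glue")
    print(info["id"], info["configs"])
    print(client.list_dataset_configs("nyu-mll/glue"))
except DatasetNotFoundError:
    print("No such dataset on the Hub")
except AuthenticationError:
    print("Dataset is gated or private; supply a token")
except NetworkError as exc:
    print(f"Transient network failure: {exc}")
```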
src/hf_eda_mcp/services/dataset_service.py CHANGED
@@ -1,7 +1,357 @@
 """
 Dataset service for centralized dataset operations and caching.

-This module will be implemented in task 2.2.
+This module provides a centralized service for dataset operations including
+metadata caching, dataset loading, and sampling functionality.
 """

-# Placeholder - will be implemented in task 2.2
+import logging
+import os
+import json
+import time
+from typing import Optional, Dict, Any
+from pathlib import Path
+from datasets import load_dataset
+from datasets.utils.logging import disable_progress_bar
+
+from hf_eda_mcp.integrations.hf_client import HfClient, HfClientError, DatasetNotFoundError
+
+logger = logging.getLogger(__name__)
+
+# Disable datasets progress bars for cleaner logging
+disable_progress_bar()
+
+
+class DatasetServiceError(Exception):
+    """Base exception for dataset service errors."""
+    pass
+
+
+class CacheError(DatasetServiceError):
+    """Raised when cache operations fail."""
+    pass
+
+
+class DatasetService:
+    """
+    Centralized service for dataset operations with caching support.
+
+    Provides metadata caching, dataset loading, and sampling functionality
+    while managing authentication and error handling.
+    """
+
+    def __init__(
+        self,
+        cache_dir: Optional[str] = None,
+        token: Optional[str] = None,
+        cache_ttl: int = 3600,  # 1 hour default TTL
+    ):
+        """
+        Initialize dataset service with optional caching and authentication.
+
+        Args:
+            cache_dir: Directory for caching metadata and samples
+            token: HuggingFace authentication token
+            cache_ttl: Cache time-to-live in seconds (default: 1 hour)
+        """
+        self.hf_client = HfClient(token=token)
+        self.cache_ttl = cache_ttl
+
+        # Set up cache directory
+        if cache_dir is None:
+            cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "hf_eda_mcp")
+
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+        # Cache subdirectories
+        self.metadata_cache_dir = self.cache_dir / "metadata"
+        self.sample_cache_dir = self.cache_dir / "samples"
+
+        self.metadata_cache_dir.mkdir(exist_ok=True)
+        self.sample_cache_dir.mkdir(exist_ok=True)
+
+        logger.info(f"DatasetService initialized with cache dir: {self.cache_dir}")
+
+    def _get_cache_key(self, dataset_id: str, config_name: Optional[str] = None) -> str:
+        """Generate cache key for dataset metadata."""
+        if config_name:
+            return f"{dataset_id}_{config_name}".replace("/", "_")
+        return dataset_id.replace("/", "_")
+
+    def _get_sample_cache_key(
+        self,
+        dataset_id: str,
+        split: str,
+        num_samples: int,
+        config_name: Optional[str] = None,
+    ) -> str:
+        """Generate cache key for dataset samples."""
+        base_key = self._get_cache_key(dataset_id, config_name)
+        return f"{base_key}_{split}_{num_samples}"
+
+    def _is_cache_valid(self, cache_file: Path) -> bool:
+        """Check if cache file exists and is within TTL."""
+        if not cache_file.exists():
+            return False
+
+        # Check if cache is within TTL
+        cache_age = time.time() - cache_file.stat().st_mtime
+        return cache_age < self.cache_ttl
+
+    def _save_to_cache(self, cache_file: Path, data: Dict[str, Any]) -> None:
+        """Save data to cache file."""
+        try:
+            cache_file.parent.mkdir(parents=True, exist_ok=True)
+            with open(cache_file, 'w', encoding='utf-8') as f:
+                json.dump(data, f, indent=2, ensure_ascii=False)
+            logger.debug(f"Saved data to cache: {cache_file}")
+        except Exception as e:
+            logger.warning(f"Failed to save cache file {cache_file}: {e}")
+            raise CacheError(f"Failed to save cache: {e}")
+
+    def _load_from_cache(self, cache_file: Path) -> Optional[Dict[str, Any]]:
+        """Load data from cache file."""
+        try:
+            if not self._is_cache_valid(cache_file):
+                return None
+
+            with open(cache_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            logger.debug(f"Loaded data from cache: {cache_file}")
+            return data
+        except Exception as e:
+            logger.warning(f"Failed to load cache file {cache_file}: {e}")
+            return None
+
+    def load_dataset_info(self, dataset_id: str, config_name: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Load dataset information from HuggingFace Hub with caching.
+
+        Args:
+            dataset_id: HuggingFace dataset identifier
+            config_name: Optional configuration name
+
+        Returns:
+            Dictionary containing dataset metadata
+
+        Raises:
+            DatasetNotFoundError: If dataset doesn't exist
+            AuthenticationError: If dataset is private and authentication fails
+        """
+        cache_key = self._get_cache_key(dataset_id, config_name)
+        cache_file = self.metadata_cache_dir / f"{cache_key}.json"
+
+        # Try to load from cache first
+        cached_data = self._load_from_cache(cache_file)
+        if cached_data is not None:
+            logger.debug(f"Using cached metadata for {dataset_id}")
+            return cached_data
+
+        # Fetch from HuggingFace Hub
+        try:
+            logger.info(f"Fetching metadata for dataset: {dataset_id}")
+            metadata = self.hf_client.get_dataset_info(dataset_id, config_name)
+
+            # Add cache timestamp
+            metadata['_cached_at'] = time.time()
+
+            # Save to cache
+            self._save_to_cache(cache_file, metadata)
+
+            return metadata
+
+        except HfClientError:
+            # Re-raise HfClient errors as-is
+            raise
+
+    def load_dataset_sample(
+        self,
+        dataset_id: str,
+        split: str = "train",
+        num_samples: int = 10,
+        config_name: Optional[str] = None,
+        streaming: bool = True,
+    ) -> Dict[str, Any]:
+        """
+        Load samples from the specified dataset with caching.
+
+        Args:
+            dataset_id: HuggingFace dataset identifier
+            split: Dataset split to sample from
+            num_samples: Number of samples to retrieve
+            config_name: Optional configuration name
+            streaming: Whether to use streaming mode for large datasets
+
+        Returns:
+            Dictionary containing sampled data and metadata
+
+        Raises:
+            DatasetNotFoundError: If dataset or split doesn't exist
+            DatasetServiceError: If sampling fails
+        """
+        # For small samples, check cache first
+        if num_samples <= 100:  # Only cache small samples
+            cache_key = self._get_sample_cache_key(dataset_id, split, num_samples, config_name)
+            cache_file = self.sample_cache_dir / f"{cache_key}.json"
+
+            cached_data = self._load_from_cache(cache_file)
+            if cached_data is not None:
+                logger.debug(f"Using cached sample for {dataset_id}")
+                return cached_data
+
+        try:
+            logger.info(f"Loading sample from dataset: {dataset_id}, split: {split}")
+
+            # Load dataset with streaming for efficiency
+            dataset = load_dataset(
+                dataset_id,
+                name=config_name,
+                split=split,
+                streaming=streaming,
+            )
+
+            # Take the requested number of samples
+            if streaming:
+                # For streaming datasets, take samples from iterator
+                samples = []
+                for i, sample in enumerate(dataset):
+                    if i >= num_samples:
+                        break
+                    samples.append(sample)
+            else:
+                # For non-streaming datasets, use select
+                max_samples = min(num_samples, len(dataset))
+                samples = dataset.select(range(max_samples))
+                samples = [samples[i] for i in range(len(samples))]
+
+            # Get dataset info for schema
+            dataset_info = self.load_dataset_info(dataset_id, config_name)
+
+            # Prepare response
+            sample_data = {
+                'dataset_id': dataset_id,
+                'config_name': config_name,
+                'split': split,
+                'num_samples': len(samples),
+                'requested_samples': num_samples,
+                'data': samples,
+                'schema': dataset_info.get('features', {}),
+                '_sampled_at': time.time(),
+            }
+
+            # Cache small samples
+            if num_samples <= 100:
+                try:
+                    self._save_to_cache(cache_file, sample_data)
+                except CacheError:
+                    # Don't fail if caching fails
+                    pass
+
+            return sample_data
+
+        except Exception as e:
+            logger.error(f"Failed to load dataset sample: {e}")
+            if "not found" in str(e).lower():
+                raise DatasetNotFoundError(f"Dataset '{dataset_id}' or split '{split}' not found")
+            raise DatasetServiceError(f"Failed to load dataset sample: {e}")
+
+    def get_cached_metadata(self, dataset_id: str, config_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
+        """
+        Retrieve cached metadata without making API calls.
+
+        Args:
+            dataset_id: HuggingFace dataset identifier
+            config_name: Optional configuration name
+
+        Returns:
+            Cached metadata dictionary or None if not cached/expired
+        """
+        cache_key = self._get_cache_key(dataset_id, config_name)
+        cache_file = self.metadata_cache_dir / f"{cache_key}.json"
+
+        return self._load_from_cache(cache_file)
+
+    def clear_cache(self, dataset_id: Optional[str] = None) -> None:
+        """
+        Clear cached data for a specific dataset or all datasets.
+
+        Args:
+            dataset_id: Optional dataset ID to clear. If None, clears all cache.
+        """
+        try:
+            if dataset_id is None:
+                # Clear all cache
+                for cache_file in self.metadata_cache_dir.glob("*.json"):
+                    cache_file.unlink()
+                for cache_file in self.sample_cache_dir.glob("*.json"):
+                    cache_file.unlink()
+                logger.info("Cleared all cache")
+            else:
+                # Clear cache for specific dataset
+                cache_key = self._get_cache_key(dataset_id)
+
+                # Clear metadata cache
+                for cache_file in self.metadata_cache_dir.glob(f"{cache_key}*.json"):
+                    cache_file.unlink()
+
+                # Clear sample cache
+                for cache_file in self.sample_cache_dir.glob(f"{cache_key}*.json"):
+                    cache_file.unlink()
+
+                logger.info(f"Cleared cache for dataset: {dataset_id}")
+
+        except Exception as e:
+            logger.warning(f"Failed to clear cache: {e}")
+            raise CacheError(f"Failed to clear cache: {e}")
+
+    def get_cache_stats(self) -> Dict[str, Any]:
+        """
+        Get statistics about the current cache.
+
+        Returns:
+            Dictionary with cache statistics
+        """
+        try:
+            metadata_files = list(self.metadata_cache_dir.glob("*.json"))
+            sample_files = list(self.sample_cache_dir.glob("*.json"))
+
+            # Calculate cache sizes
+            metadata_size = sum(f.stat().st_size for f in metadata_files)
+            sample_size = sum(f.stat().st_size for f in sample_files)
+
+            return {
+                'cache_dir': str(self.cache_dir),
+                'metadata_files': len(metadata_files),
+                'sample_files': len(sample_files),
+                'total_files': len(metadata_files) + len(sample_files),
+                'metadata_size_bytes': metadata_size,
+                'sample_size_bytes': sample_size,
+                'total_size_bytes': metadata_size + sample_size,
+                'cache_ttl_seconds': self.cache_ttl,
+            }
+        except Exception as e:
+            logger.warning(f"Failed to get cache stats: {e}")
+            return {'error': str(e)}
+
+    def validate_dataset_access(
+        self,
+        dataset_id: str,
+        config_name: Optional[str] = None,
+    ) -> bool:
+        """
+        Validate that a dataset can be accessed with current authentication.
+
+        Args:
+            dataset_id: HuggingFace dataset identifier
+            config_name: Optional configuration name
+
+        Returns:
+            True if dataset is accessible, False otherwise
+        """
+        return self.hf_client.validate_dataset_access(dataset_id, config_name)
+
+    @property
+    def is_authenticated(self) -> bool:
+        """Check if the service is authenticated with HuggingFace."""
+        return self.hf_client.is_authenticated
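A minimal sketch of the service in use, assuming the public `squad` dataset; the field names in the returned sample depend on the dataset:

```python
from hf_eda_mcp.services.dataset_service import DatasetService

service = DatasetService(cache_ttl=1800)  # 30-minute metadata/sample cache

# First call hits the Hub; repeat calls within the TTL are served from disk.
info = service.load_dataset_info("squad")
print(info["id"], info.get("splits"))

# Samples of <= 100 rows are cached alongside the metadata.
sample = service.load_dataset_sample("squad", split="train", num_samples=5)
print(sample["num_samples"], list(sample["data"][0].keys()))

print(service.get_cache_stats())
service.clear_cache("squad")  # drop cached files for this dataset only
```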