KhalilGuetari committed
Commit b3aa246 · Parent: 3ef1838

Improve metadata tool with detailed config information

pyproject.toml CHANGED
@@ -40,8 +40,7 @@ where = ["src"]
 [tool.pdm.scripts]
 hf-eda-mcp = {cmd="python -m hf_eda_mcp", env_file= ".env"}
 hf_client_playground = "python -m scripts.playground.hf_client_playground"
-metadata_playground = "python -m scripts.playground.metadata_playground"
-test_merged_metadata = "python -m scripts.playground.test_merged_metadata"
+metadata_playground = "python -m scripts.playground.metadata_tool_playground"

 [tool.pdm]
 distribution = true
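Note: with this change the playground is launched via pdm run metadata_playground, which now points at the consolidated scripts/playground/metadata_tool_playground.py below; the separate test_merged_metadata script entry is dropped, as its test function moves into that file.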
scripts/playground/metadata_tool_playground.py ADDED
@@ -0,0 +1,127 @@
+"""
+Test script to verify the merged metadata from DatasetService.
+
+This script tests that the DatasetService properly merges data from both
+the Hub API and the Dataset Viewer API.
+"""
+
+import os
+import logging
+from pprint import pprint
+from dotenv import load_dotenv
+from hf_eda_mcp.services.dataset_service import DatasetService
+
+load_dotenv()
+
+# Set up logging
+logging.basicConfig(
+    filename="scripts.log",
+    encoding='utf-8',
+    level=logging.DEBUG,
+    filemode="w",
+    format='%(asctime)s - %(levelname)s - %(message)s',
+)
+
+logger = logging.getLogger(__name__)
+
+
+def test_merged_metadata(dataset_name="rajpurkar/squad"):
+    """Test merged metadata retrieval."""
+    print("=" * 80)
+    print("Testing Merged Metadata from DatasetService")
+    print("=" * 80)
+
+    # Initialize the service
+    service = DatasetService(
+        cache_dir="./cache",
+        token=os.environ.get("HF_TOKEN")
+    )
+
+    # Clear the cache to force a fresh fetch
+    service.clear_cache(dataset_name)
+
+    # Test with the requested dataset
+    print(f"\n### Testing: {dataset_name} ###\n")
+    try:
+        metadata = service.load_dataset_info(dataset_name)
+
+        print("Key Information:")
+        print(f"  Dataset ID: {metadata.get('id')}")
+        print(f"  Author: {metadata.get('author')}")
+        print(f"  Size (bytes): {metadata.get('size_bytes', 'N/A')}")
+        print(f"  Size (human): {metadata.get('size_human', 'N/A')}")
+        print(f"  Download Size: {metadata.get('download_size_human', 'N/A')}")
+        print(f"  Total Examples: {metadata.get('total_examples', 'N/A')}")
+        print(f"  Downloads: {metadata.get('downloads', 0):,}")
+        print(f"  Likes: {metadata.get('likes', 0)}")
+
+        print("\nSplits:")
+        for split_name, split_info in metadata.get('splits', {}).items():
+            if isinstance(split_info, dict):
+                num_examples = split_info.get('num_examples', 0)
+                num_bytes = split_info.get('num_bytes', 0)
+                print(f"  {split_name}: {num_examples:,} examples, {num_bytes:,} bytes")
+            else:
+                print(f"  {split_name}: {split_info}")
+
+        print("\nFeatures Schema:")
+        features = metadata.get('features', {})
+        if features:
+            for feature_name, feature_info in features.items():
+                print(f"  {feature_name}: {feature_info}")
+        else:
+            print("  No features available")
+
+        print("\nSummary:")
+        print(f"  {metadata.get('summary', 'N/A')}")
+
+        print("\n" + "=" * 80)
+        print("Full Metadata:")
+        print("=" * 80)
+        pprint(metadata, indent=2)
+
+    except Exception as e:
+        print(f"\n✗ Error: {e}")
+        logger.exception("Failed to retrieve merged metadata")
+
+
+def test_multi_config_dataset(dataset_name="stanfordnlp/imdb"):
+    """Test with a multi-config dataset."""
+    print("\n\n" + "=" * 80)
+    print(f"Testing Multi-Config Dataset: {dataset_name}")
+    print("=" * 80)
+
+    service = DatasetService(
+        cache_dir="./cache",
+        token=os.environ.get("HF_TOKEN")
+    )
+
+    # Clear the cache
+    service.clear_cache(dataset_name)
+
+    print(f"\n### Testing: {dataset_name} ###\n")
+    try:
+        metadata = service.load_dataset_info(dataset_name)
+
+        print("Key Information:")
+        print(f"  Dataset ID: {metadata.get('id')}")
+        print(f"  Total Examples: {metadata.get('total_examples', 'N/A')}")
+        print(f"  Size (human): {metadata.get('size_human', 'N/A')}")
+
+        print("\nSplits:")
+        for split_name, split_info in metadata.get('splits', {}).items():
+            if isinstance(split_info, dict):
+                num_examples = split_info.get('num_examples', 0)
+                print(f"  {split_name}: {num_examples:,} examples")
+
+        print("\nSummary:")
+        print(f"  {metadata.get('summary', 'N/A')}")
+
+    except Exception as e:
+        print(f"\n✗ Error: {e}")
+        logger.exception("Failed to retrieve imdb metadata")
+
+
+if __name__ == "__main__":
+    test_merged_metadata()
+    test_multi_config_dataset()
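For orientation, a multi-config run should surface the aggregate fields assembled by DatasetService._enrich_with_all_configs (see the dataset_service.py diff below). A hedged sketch of the expected shape — the dataset ID, config name, and every number here are illustrative placeholders, not real measurements:

    expected_shape = {
        "id": "some-org/some-dataset",          # hypothetical dataset
        "config_details": [
            {
                "config_name": "config_a",      # hypothetical config
                "splits": {"train": {"num_examples": 1000, "num_bytes": 2048}},
                "total_examples": 1000,
                "dataset_size_human": "2.00 KB",
                "download_size_human": "1.00 KB",
            },
            # ... one dict per config
        ],
        "total_dataset_size_human": "2.00 KB",
        "total_examples": 1000,
        "summary": "Dataset: some-org/some-dataset | Author: ... | Configs: 1 | ...",
    }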
src/hf_eda_mcp/integrations/hf_client.py CHANGED
@@ -133,7 +133,6 @@ class HfClient:
     "last_modified": dataset_info.last_modified.isoformat()
     if dataset_info.last_modified
     else None,
-    "size_bytes": getattr(dataset_info, "size_in_bytes", 0),
     "configs": [],
     "splits": {},
     "features": {},
src/hf_eda_mcp/services/dataset_service.py CHANGED
@@ -20,6 +20,7 @@ from hf_eda_mcp.integrations.hf_client import (
     AuthenticationError,
     NetworkError
 )
+from hf_eda_mcp.services.dataset_viewer_adapter import DatasetViewerAdapter
 from hf_eda_mcp.error_handling import (
     retry_with_backoff,
     RetryConfig,
@@ -65,6 +66,7 @@ class DatasetService:
             cache_ttl: Cache time-to-live in seconds (default: 1 hour)
         """
         self.hf_client = HfClient(token=token)
+        self.dataset_viewer = DatasetViewerAdapter(token=token)
         self.cache_ttl = cache_ttl

         # Set up cache directory
@@ -134,10 +136,219 @@ class DatasetService:
            logger.warning(f"Failed to load cache file {cache_file}: {e}")
        return None

+    def _merge_viewer_data(
+        self,
+        hub_metadata: Dict[str, Any],
+        viewer_data: Dict[str, Any],
+        config_name: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """
+        Merge Dataset Viewer API data into Hub metadata.
+
+        Enriches the basic Hub metadata with detailed information from the
+        Dataset Viewer API, including split sizes, features schema, and byte sizes.
+
+        When no config is specified, returns detailed information for all configs.
+
+        Args:
+            hub_metadata: Basic metadata from the Hub API
+            viewer_data: Detailed data from the Dataset Viewer API
+            config_name: Optional configuration name to extract
+
+        Returns:
+            Merged metadata dictionary
+        """
+        merged = hub_metadata.copy()
+
+        # Extract dataset_info from the viewer response
+        dataset_info = viewer_data.get('dataset_info', {})
+
+        if not dataset_info:
+            logger.warning("No dataset_info in viewer data")
+            return merged
+
+        # Handle two response formats:
+        # 1. When a config is specified in the API call: dataset_info is the config data directly
+        # 2. When no config is specified: dataset_info is a dict with config names as keys
+
+        if isinstance(dataset_info, dict) and 'config_name' in dataset_info:
+            # Format 1: single config data (config was specified in the API call)
+            config_data = dataset_info
+            self._enrich_with_single_config(merged, config_data)
+        elif config_name:
+            # Format 2: a specific config was requested
+            if config_name in dataset_info:
+                config_data = dataset_info[config_name]
+                self._enrich_with_single_config(merged, config_data)
+            else:
+                logger.warning(f"Config '{config_name}' not found in viewer data")
+                return merged
+        else:
+            # No config specified
+            if len(dataset_info) == 1:
+                # Only one config - use the single-config format for consistency
+                config_data = next(iter(dataset_info.values()))
+                self._enrich_with_single_config(merged, config_data)
+            else:
+                # Multiple configs - return all configs with detailed information
+                self._enrich_with_all_configs(merged, dataset_info)
+
+        return merged
+
+    def _enrich_with_single_config(self, merged: Dict[str, Any], config_data: Dict[str, Any]) -> None:
+        """
+        Enrich metadata with a single config's data.
+
+        Args:
+            merged: Metadata dictionary to enrich (modified in place)
+            config_data: Configuration data from the Dataset Viewer API
+        """
+        # Enrich features with the detailed schema from the viewer
+        if 'features' in config_data:
+            merged['features'] = config_data['features']
+
+        # Enrich splits with actual sizes
+        if 'splits' in config_data:
+            viewer_splits = config_data['splits']
+            enriched_splits = {}
+
+            for split_name, split_info in viewer_splits.items():
+                enriched_splits[split_name] = {
+                    'num_examples': split_info.get('num_examples', 0),
+                    'num_bytes': split_info.get('num_bytes', 0)
+                }
+
+            merged['splits'] = enriched_splits
+            merged['total_splits'] = len(enriched_splits)
+
+        # Add dataset size information
+        if 'dataset_size' in config_data:
+            merged['dataset_size'] = config_data['dataset_size']
+            merged['size_bytes'] = config_data['dataset_size']
+
+            # Update the human-readable size
+            size_bytes = config_data['dataset_size']
+            if size_bytes > 0:
+                merged['size_human'] = self._format_bytes(size_bytes)
+
+        if 'download_size' in config_data:
+            merged['download_size'] = config_data['download_size']
+            merged['download_size_human'] = self._format_bytes(config_data['download_size'])
+
+        # Add builder and version info
+        if 'builder_name' in config_data:
+            merged['builder_name'] = config_data['builder_name']
+
+        if 'version' in config_data:
+            merged['version'] = config_data['version']
+
+        # Update the summary with the enriched information
+        if 'splits' in merged and merged['splits']:
+            total_examples = sum(s.get('num_examples', 0) for s in merged['splits'].values())
+            merged['total_examples'] = total_examples
+
+            # Update the summary string
+            split_names = ', '.join(merged['splits'].keys())
+            size_str = merged.get('size_human', 'Unknown')
+            merged['summary'] = (
+                f"Dataset: {merged['id']} | "
+                f"Author: {merged.get('author', 'Unknown')} | "
+                f"Size: {size_str} | "
+                f"Examples: {total_examples:,} | "
+                f"Downloads: {merged.get('downloads', 0):,} | "
+                f"Likes: {merged.get('likes', 0)} | "
+                f"Splits: {split_names}"
+            )
+
+    def _enrich_with_all_configs(self, merged: Dict[str, Any], dataset_info: Dict[str, Any]) -> None:
+        """
+        Enrich metadata with all configs' data.
+
+        Creates a detailed 'config_details' list with information for each config.
+
+        Args:
+            merged: Metadata dictionary to enrich (modified in place)
+            dataset_info: Dict mapping config names to their data
+        """
+        config_details = []
+        total_dataset_size = 0
+        total_download_size = 0
+        total_examples_all_configs = 0
+
+        for cfg_name, cfg_data in dataset_info.items():
+            config_detail = {
+                'config_name': cfg_name,
+                'features': cfg_data.get('features', {}),
+                'splits': {},
+                'dataset_size': cfg_data.get('dataset_size', 0),
+                'download_size': cfg_data.get('download_size', 0),
+                'builder_name': cfg_data.get('builder_name', ''),
+                'version': cfg_data.get('version', {}),
+            }
+
+            # Process splits for this config
+            if 'splits' in cfg_data:
+                for split_name, split_info in cfg_data['splits'].items():
+                    config_detail['splits'][split_name] = {
+                        'num_examples': split_info.get('num_examples', 0),
+                        'num_bytes': split_info.get('num_bytes', 0)
+                    }
+
+            # Calculate totals for this config
+            config_total_examples = sum(
+                s.get('num_examples', 0) for s in config_detail['splits'].values()
+            )
+            config_detail['total_examples'] = config_total_examples
+            config_detail['dataset_size_human'] = self._format_bytes(config_detail['dataset_size'])
+            config_detail['download_size_human'] = self._format_bytes(config_detail['download_size'])
+
+            config_details.append(config_detail)
+
+            # Accumulate totals across all configs
+            total_dataset_size += config_detail['dataset_size']
+            total_download_size += config_detail['download_size']
+            total_examples_all_configs += config_total_examples
+
+        # Add the detailed config information
+        merged['config_details'] = config_details
+
+        # Remove redundant top-level fields since they're in config_details
+        merged.pop('splits', None)
+        merged.pop('features', None)
+
+        # Add aggregate information
+        merged['total_dataset_size'] = total_dataset_size
+        merged['total_dataset_size_human'] = self._format_bytes(total_dataset_size)
+        merged['total_download_size'] = total_download_size
+        merged['total_download_size_human'] = self._format_bytes(total_download_size)
+        merged['total_examples'] = total_examples_all_configs
+
+        # Update the summary for multi-config datasets
+        merged['summary'] = (
+            f"Dataset: {merged['id']} | "
+            f"Author: {merged.get('author', 'Unknown')} | "
+            f"Configs: {len(config_details)} | "
+            f"Total Size: {merged['total_dataset_size_human']} | "
+            f"Total Examples: {total_examples_all_configs:,} | "
+            f"Downloads: {merged.get('downloads', 0):,} | "
+            f"Likes: {merged.get('likes', 0)}"
+        )
+
+    def _format_bytes(self, size_bytes: int) -> str:
+        """Format a byte count into a human-readable string."""
+        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
+            if size_bytes < 1024.0:
+                return f"{size_bytes:.2f} {unit}"
+            size_bytes /= 1024.0
+        return f"{size_bytes:.2f} PB"
+
     def load_dataset_info(self, dataset_id: str, config_name: Optional[str] = None) -> Dict[str, Any]:
         """
         Load dataset information from HuggingFace Hub with caching.

+        Combines data from both the Hub API and the Dataset Viewer API to provide
+        comprehensive metadata including split sizes, features schema, and more.
+
         Includes automatic retry logic for transient failures and comprehensive
         error handling with helpful suggestions.

@@ -171,8 +382,21 @@ class DatasetService:
         # Fetch from HuggingFace Hub with retry logic
         try:
             logger.info(f"Fetching metadata for dataset: {dataset_id}")
+
+            # Get basic metadata from the Hub API
             metadata = self.hf_client.get_dataset_info(dataset_id, config_name)

+            # Try to enrich with Dataset Viewer API data
+            # Use the full dataset ID from the metadata response
+            try:
+                full_dataset_id = metadata.get('id', dataset_id)
+                viewer_data = self.dataset_viewer.get_dataset_information(full_dataset_id, config_name)
+                metadata = self._merge_viewer_data(metadata, viewer_data, config_name)
+                logger.debug("Successfully enriched metadata with Dataset Viewer API")
+            except Exception as e:
+                # Log but don't fail if the viewer API fails - we still have basic metadata
+                logger.warning(f"Failed to fetch Dataset Viewer data, using basic metadata only: {e}")
+
             # Add cache timestamp
             metadata['_cached_at'] = time.time()
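_merge_viewer_data tells the two /info response shapes apart by probing for a top-level config_name key. A minimal sketch of both shapes, with illustrative placeholder values:

    # Shape 1 - a config was passed to the API call:
    # dataset_info is the config record itself.
    viewer_single = {
        "dataset_info": {
            "config_name": "default",
            "splits": {"train": {"num_examples": 1000, "num_bytes": 2048}},
            "dataset_size": 2048,
        }
    }

    # Shape 2 - no config passed: dataset_info is keyed by config name.
    viewer_multi = {
        "dataset_info": {
            "config_a": {"splits": {"train": {"num_examples": 1000}}, "dataset_size": 2048},
            "config_b": {"splits": {"train": {"num_examples": 2000}}, "dataset_size": 4096},
        }
    }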
 
src/hf_eda_mcp/services/dataset_viewer_adapter.py ADDED
@@ -0,0 +1,156 @@
+
+import os
+import logging
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+
+class DatasetViewerError(Exception):
+    """Base exception for Dataset Viewer API errors."""
+    pass
+
+
+class DatasetViewerAdapter:
+    """
+    Wraps the HuggingFace Dataset Viewer API and implements several of its endpoints.
+    Relevant docs: https://huggingface.co/docs/dataset-viewer/info
+    """
+
+    def __init__(
+        self,
+        token: Optional[str] = None,
+    ):
+        """
+        Initialize the Dataset Viewer adapter with optional authentication.
+
+        Args:
+            token: HuggingFace authentication token
+        """
+        if token:
+            self.token = token
+        else:
+            self.token = os.environ.get("HF_TOKEN")
+        self.base_url = "https://datasets-server.huggingface.co/"
+
+    def _api_get(self, route: str, params: dict, extra_headers: Optional[dict] = None) -> dict:
+        """
+        Make a GET request to the Dataset Viewer API with retry logic.
+
+        Args:
+            route: API endpoint route
+            params: Query parameters
+            extra_headers: Additional headers to include
+
+        Returns:
+            JSON response as a dictionary
+
+        Raises:
+            DatasetViewerError: If the request fails after retries
+        """
+        headers = {"Authorization": f"Bearer {self.token}"} if self.token else {}
+        if extra_headers:
+            headers.update(extra_headers)
+
+        retry_strategy = Retry(
+            total=3,
+            backoff_factor=1,
+            status_forcelist=[429, 500, 502, 503, 504],
+            allowed_methods=["GET"]
+        )
+
+        # Create a session with the retry adapter
+        session = requests.Session()
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount("https://", adapter)
+
+        # Make the request
+        url = f"{self.base_url}{route}"
+
+        try:
+            logger.debug(f"Making Dataset Viewer API request to {url} with params {params}")
+            response = session.get(url, params=params, headers=headers, timeout=30)
+            response.raise_for_status()
+
+            result = response.json()
+            logger.debug("Dataset Viewer API request successful")
+            return result
+
+        except requests.exceptions.HTTPError as e:
+            status_code = e.response.status_code if e.response is not None else None
+            error_msg = f"Dataset Viewer API HTTP error (status {status_code}): {str(e)}"
+            logger.error(error_msg)
+            raise DatasetViewerError(error_msg) from e
+
+        except requests.exceptions.Timeout as e:
+            error_msg = f"Dataset Viewer API request timed out: {str(e)}"
+            logger.error(error_msg)
+            raise DatasetViewerError(error_msg) from e
+
+        except requests.exceptions.ConnectionError as e:
+            error_msg = f"Dataset Viewer API connection error: {str(e)}"
+            logger.error(error_msg)
+            raise DatasetViewerError(error_msg) from e
+
+        except requests.exceptions.RequestException as e:
+            error_msg = f"Dataset Viewer API request failed: {str(e)}"
+            logger.error(error_msg)
+            raise DatasetViewerError(error_msg) from e
+
+        except ValueError as e:
+            error_msg = f"Failed to parse Dataset Viewer API response: {str(e)}"
+            logger.error(error_msg)
+            raise DatasetViewerError(error_msg) from e
+
+        finally:
+            session.close()
+
+    def get_dataset_information(self, dataset_name: str, config: Optional[str] = None) -> dict:
+        """
+        Get detailed dataset information from the Dataset Viewer API.
+
+        Args:
+            dataset_name: HuggingFace dataset identifier
+            config: Optional configuration name
+
+        Returns:
+            Dictionary containing detailed dataset information, including:
+            - dataset_info: Per-config information with features, splits, sizes
+            - failed: List of failed operations
+            - partial: Whether the response is partial
+            - pending: List of pending operations
+
+        Raises:
+            DatasetViewerError: If the API request fails
+        """
+        params = {"dataset": dataset_name}
+        if config is not None:
+            params["config"] = config
+
+        logger.info(f"Fetching dataset information from Viewer API: {dataset_name}")
+
+        try:
+            result = self._api_get(
+                route="info",
+                params=params
+            )
+
+            # Check for errors in the response
+            if result.get('failed'):
+                logger.warning(f"Dataset Viewer API returned failures: {result['failed']}")
+
+            if result.get('partial'):
+                logger.warning("Dataset Viewer API returned partial data")
+
+            return result
+
+        except DatasetViewerError:
+            # Re-raise with context
+            raise
+        except Exception as e:
+            error_msg = f"Unexpected error fetching dataset information: {str(e)}"
+            logger.error(error_msg)
+            raise DatasetViewerError(error_msg) from e
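A minimal usage sketch for the adapter (assumes HF_TOKEN is set in the environment; the dataset ID is just an example):

    from hf_eda_mcp.services.dataset_viewer_adapter import DatasetViewerAdapter

    adapter = DatasetViewerAdapter()  # falls back to the HF_TOKEN env var
    info = adapter.get_dataset_information("rajpurkar/squad")
    print(sorted(info["dataset_info"].keys()))  # one key per config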
src/hf_eda_mcp/tools/metadata.py CHANGED
@@ -109,20 +109,39 @@ def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) ->
     if config_name:
         metadata['config_name'] = config_name

-    # Enhance metadata with additional computed fields
-    metadata['total_configs'] = len(metadata.get('configs', []))
-    metadata['total_splits'] = len(metadata.get('splits', {}))
-    metadata['has_multiple_configs'] = metadata['total_configs'] > 1
-
-    # Format size for human readability
-    size_bytes = metadata.get('size_bytes', 0)
-    if size_bytes > 0:
-        metadata['size_human'] = _format_bytes(size_bytes)
-    else:
-        metadata['size_human'] = 'Unknown'
-
-    # Add summary information
-    metadata['summary'] = _generate_metadata_summary(metadata)
+    # Enhance metadata with additional computed fields (only if not already set)
+    if 'total_configs' not in metadata:
+        metadata['total_configs'] = len(metadata.get('configs', []))
+
+    if 'total_splits' not in metadata:
+        # For multi-config datasets (with config_details), count the unique split names
+        if 'config_details' in metadata:
+            all_splits = set()
+            for config in metadata['config_details']:
+                all_splits.update(config.get('splits', {}).keys())
+            metadata['total_splits'] = len(all_splits)
+        else:
+            # For single-config datasets, count the splits at the top level
+            metadata['total_splits'] = len(metadata.get('splits', {}))
+
+    if 'has_multiple_configs' not in metadata:
+        metadata['has_multiple_configs'] = metadata.get('total_configs', 0) > 1
+
+    # Format size for human readability (only if not already set by dataset_service)
+    if 'size_human' not in metadata:
+        # For multi-config datasets, use total_dataset_size_human if available
+        if 'config_details' in metadata and 'total_dataset_size_human' in metadata:
+            metadata['size_human'] = metadata['total_dataset_size_human']
+        else:
+            size_bytes = metadata.get('size_bytes', 0)
+            if size_bytes > 0:
+                metadata['size_human'] = _format_bytes(size_bytes)
+            else:
+                metadata['size_human'] = 'Unknown'
+
+    # Add summary information (only if not already set by dataset_service)
+    if 'summary' not in metadata:
+        metadata['summary'] = _generate_metadata_summary(metadata)

     logger.info(f"Successfully retrieved metadata for {dataset_id}")
     return metadata
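To exercise the guarded enrichment end to end (a sketch; assumes the package is importable and the Hub is reachable):

    from hf_eda_mcp.tools.metadata import get_dataset_metadata

    meta = get_dataset_metadata("stanfordnlp/imdb")
    print(meta["summary"])       # kept as built by dataset_service when present
    print(meta["total_splits"])  # unique split names when config_details exists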