KhalilGuetari commited on
Commit
2b910cc
·
1 Parent(s): 43642a4

Use hf_token provided in mcp headers

Browse files
README.md CHANGED
@@ -56,7 +56,13 @@ Replace `YOUR-USERNAME` with your HuggingFace username.
56
 
57
  ## Authentication
58
 
59
- For private datasets, set the `HF_TOKEN` secret in your Space settings.
 
 
 
 
 
 
60
 
61
  ## License
62
 
 
56
 
57
  ## Authentication
58
 
59
+
60
+
61
+ ## To Do List
62
+
63
 + - [ ] Security: Do not cache when a dataset is private or gated
64
 + - [ ] Complete MCP server configuration and documentation
65
+
66
 
67
  ## License
68
 
src/hf_eda_mcp/error_handling.py CHANGED
@@ -238,7 +238,6 @@ def get_dataset_suggestions(dataset_id: str) -> List[str]:
238
  def format_authentication_error(
239
  dataset_id: str,
240
  is_gated: bool = False,
241
- has_token: bool = False
242
  ) -> Dict[str, Any]:
243
  """
244
  Format authentication error with helpful guidance.
@@ -246,7 +245,6 @@ def format_authentication_error(
246
  Args:
247
  dataset_id: The dataset identifier
248
  is_gated: Whether the dataset is gated (requires approval)
249
- has_token: Whether a token was provided
250
 
251
  Returns:
252
  Dictionary with error details and suggestions
@@ -255,7 +253,6 @@ def format_authentication_error(
255
  "error_type": "authentication_error",
256
  "dataset_id": dataset_id,
257
  "is_gated": is_gated,
258
- "has_token": has_token,
259
  "message": "",
260
  "suggestions": []
261
  }
@@ -270,16 +267,6 @@ def format_authentication_error(
270
  "Provide a valid HuggingFace token after receiving access",
271
  "Check your HuggingFace account for access status"
272
  ]
273
- elif not has_token:
274
- error_details["message"] = (
275
- f"Dataset '{dataset_id}' is private and requires authentication."
276
- )
277
- error_details["suggestions"] = [
278
- "Provide a HuggingFace authentication token",
279
- "Create a token at: https://huggingface.co/settings/tokens",
280
- "Set the token in your environment: HF_TOKEN=your_token",
281
- "Ensure the token has read access to datasets"
282
- ]
283
  else:
284
  error_details["message"] = (
285
  f"Authentication failed for dataset '{dataset_id}'. "
@@ -381,8 +368,7 @@ def format_error_response(
381
  elif isinstance(error, AuthenticationError):
382
  dataset_id = context.get("dataset_id", "unknown")
383
  is_gated = "gated" in str(error).lower()
384
- has_token = context.get("has_token", False)
385
- return format_authentication_error(dataset_id, is_gated, has_token)
386
 
387
  elif isinstance(error, NetworkError):
388
  operation = context.get("operation", "operation")
 
238
  def format_authentication_error(
239
  dataset_id: str,
240
  is_gated: bool = False,
 
241
  ) -> Dict[str, Any]:
242
  """
243
  Format authentication error with helpful guidance.
 
245
  Args:
246
  dataset_id: The dataset identifier
247
  is_gated: Whether the dataset is gated (requires approval)
 
248
 
249
  Returns:
250
  Dictionary with error details and suggestions
 
253
  "error_type": "authentication_error",
254
  "dataset_id": dataset_id,
255
  "is_gated": is_gated,
 
256
  "message": "",
257
  "suggestions": []
258
  }
 
267
  "Provide a valid HuggingFace token after receiving access",
268
  "Check your HuggingFace account for access status"
269
  ]
 
 
 
 
 
 
 
 
 
 
270
  else:
271
  error_details["message"] = (
272
  f"Authentication failed for dataset '{dataset_id}'. "
 
368
  elif isinstance(error, AuthenticationError):
369
  dataset_id = context.get("dataset_id", "unknown")
370
  is_gated = "gated" in str(error).lower()
371
+ return format_authentication_error(dataset_id, is_gated)
 
372
 
373
  elif isinstance(error, NetworkError):
374
  operation = context.get("operation", "operation")
src/hf_eda_mcp/integrations/hf_client.py CHANGED
@@ -62,10 +62,7 @@ class HfClient:
62
  """
63
  self.token = token
64
  self.api = HfApi(token=token)
65
- self._authenticated = False
66
-
67
- if token:
68
- self._authenticate()
69
 
70
  def _authenticate(self) -> None:
71
  """
@@ -299,8 +296,3 @@ class HfClient:
299
  except Exception:
300
  # For other errors (network, etc.), assume dataset exists but there's a temporary issue
301
  return True
302
-
303
- @property
304
- def is_authenticated(self) -> bool:
305
- """Check if client is authenticated."""
306
- return self._authenticated
 
62
  """
63
  self.token = token
64
  self.api = HfApi(token=token)
65
+ self._authenticate()
 
 
 
66
 
67
  def _authenticate(self) -> None:
68
  """
 
296
  except Exception:
297
  # For other errors (network, etc.), assume dataset exists but there's a temporary issue
298
  return True
 
 
 
 
 
src/hf_eda_mcp/server.py CHANGED
@@ -9,16 +9,10 @@ import gradio as gr
9
  import sys
10
  from typing import Optional
11
 
12
- # Import configuration
13
- from hf_eda_mcp.config import ServerConfig, setup_logging, validate_config, set_config
14
-
15
- # Import EDA tools - these will be automatically exposed as MCP tools
16
  from hf_eda_mcp.tools.metadata import get_dataset_metadata
17
  from hf_eda_mcp.tools.sampling import get_dataset_sample
18
  from hf_eda_mcp.tools.analysis import analyze_dataset_features
19
-
20
-
21
- # These functions will be automatically exposed as MCP tools when mcp_server=True
22
 
23
 
24
  def create_gradio_app(config: ServerConfig) -> gr.Blocks:
@@ -163,27 +157,11 @@ def create_gradio_app(config: ServerConfig) -> gr.Blocks:
163
  3. **analyze_dataset_features**: Perform exploratory data analysis
164
 
165
  ### MCP Server Configuration
166
-
167
- To connect MCP clients to this server, use:
168
-
169
- ```json
170
- {{
171
- "mcpServers": {{
172
- "hf-eda-mcp-server": {{
173
- "command": "pdm",
174
- "args": ["run", "hf-eda-mcp"],
175
- "env": {{
176
- "HF_TOKEN": "your_huggingface_token_here"
177
- }}
178
- }}
179
- }}
180
- }}
181
- ```
182
 
183
  ### Server Status
184
 
185
  - **MCP Tools**: 3 tools available
186
- - **Authentication**: {"✅ Token configured" if config.hf_token else "⚠️ No token (public datasets only)"}
187
  - **MCP Schema**: Available at `/gradio_api/mcp/schema`
188
  - **Cache Directory**: {config.cache_dir or "Default system cache"}
189
  - **Max Sample Size**: {config.max_sample_size:,}
 
9
  import sys
10
  from typing import Optional
11
 
 
 
 
 
12
  from hf_eda_mcp.tools.metadata import get_dataset_metadata
13
  from hf_eda_mcp.tools.sampling import get_dataset_sample
14
  from hf_eda_mcp.tools.analysis import analyze_dataset_features
15
+ from hf_eda_mcp.config import ServerConfig, setup_logging, validate_config, set_config
 
 
16
 
17
 
18
  def create_gradio_app(config: ServerConfig) -> gr.Blocks:
 
157
  3. **analyze_dataset_features**: Perform exploratory data analysis
158
 
159
  ### MCP Server Configuration
160
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
  ### Server Status
163
 
164
  - **MCP Tools**: 3 tools available
 
165
  - **MCP Schema**: Available at `/gradio_api/mcp/schema`
166
  - **Cache Directory**: {config.cache_dir or "Default system cache"}
167
  - **Max Sample Size**: {config.max_sample_size:,}
src/hf_eda_mcp/services/dataset_service.py CHANGED
@@ -14,6 +14,7 @@ from pathlib import Path
14
  from datasets import load_dataset
15
  from datasets.utils.logging import disable_progress_bar
16
 
 
17
  from hf_eda_mcp.integrations.hf_client import (
18
  HfClient,
19
  DatasetNotFoundError,
@@ -806,7 +807,14 @@ class DatasetService:
806
  """
807
  return self.hf_client.validate_dataset_access(dataset_id, config_name)
808
 
809
- @property
810
- def is_authenticated(self) -> bool:
811
- """Check if the service is authenticated with HuggingFace."""
812
- return self.hf_client.is_authenticated
 
 
 
 
 
 
 
 
14
  from datasets import load_dataset
15
  from datasets.utils.logging import disable_progress_bar
16
 
17
+ from hf_eda_mcp.config import get_config
18
  from hf_eda_mcp.integrations.hf_client import (
19
  HfClient,
20
  DatasetNotFoundError,
 
807
  """
808
  return self.hf_client.validate_dataset_access(dataset_id, config_name)
809
 
810
+
811
+ def get_dataset_service(hf_api_token: str) -> DatasetService:
812
 + """Create a dataset service for the given token, falling back to the configured HF token when none is provided."""
813
+ config = get_config()
814
+ if hf_api_token is None:
815
+ hf_api_token = config.hf_token
816
+ dataset_service = DatasetService(
817
+ cache_dir=config.cache_dir,
818
+ token=hf_api_token
819
+ )
820
+ return dataset_service
src/hf_eda_mcp/services/dataset_viewer_adapter.py CHANGED
@@ -22,7 +22,7 @@ class DatasetViewerAdapter():
22
 
23
  def __init__(
24
  self,
25
- token: Optional[str] = None,
26
  ):
27
  """
28
  Initialize dataset service with optional caching and authentication.
@@ -32,8 +32,6 @@ class DatasetViewerAdapter():
32
  """
33
  if token:
34
  self.token = token
35
- else:
36
- self.token = os.environ.get("HF_TOKEN")
37
  self.base_url = "https://datasets-server.huggingface.co/"
38
 
39
  def _api_get(self, route: str, params: dict, extra_headers: Optional[dict] = None) -> dict:
@@ -160,7 +158,7 @@ class DatasetViewerAdapter():
160
  self,
161
  dataset_name: str,
162
  config: str,
163
- split_name: str
164
  ) -> dict:
165
  """
166
  Get detailed statistics for a dataset split from the Dataset Viewer API.
@@ -200,7 +198,7 @@ class DatasetViewerAdapter():
200
  try:
201
  result = self._api_get(
202
  route="statistics",
203
- params=params
204
  )
205
 
206
  # Check for errors in response
@@ -222,7 +220,7 @@ class DatasetViewerAdapter():
222
 
223
  def check_statistics_availability(
224
  self,
225
- dataset_name: str,
226
  config: Optional[str] = None
227
  ) -> dict:
228
  """
 
22
 
23
  def __init__(
24
  self,
25
+ token: str,
26
  ):
27
  """
28
  Initialize dataset service with optional caching and authentication.
 
32
  """
33
  if token:
34
  self.token = token
 
 
35
  self.base_url = "https://datasets-server.huggingface.co/"
36
 
37
  def _api_get(self, route: str, params: dict, extra_headers: Optional[dict] = None) -> dict:
 
158
  self,
159
  dataset_name: str,
160
  config: str,
161
+ split_name: str,
162
  ) -> dict:
163
  """
164
  Get detailed statistics for a dataset split from the Dataset Viewer API.
 
198
  try:
199
  result = self._api_get(
200
  route="statistics",
201
+ params=params,
202
  )
203
 
204
  # Check for errors in response
 
220
 
221
  def check_statistics_availability(
222
  self,
223
+ dataset_name: str,
224
  config: Optional[str] = None
225
  ) -> dict:
226
  """
src/hf_eda_mcp/tools/__init__.py CHANGED
@@ -5,11 +5,7 @@ This package contains individual EDA functions that will be exposed as MCP tools
5
  """
6
 
7
  from hf_eda_mcp.tools.metadata import get_dataset_metadata
8
- from hf_eda_mcp.tools.sampling import (
9
- get_dataset_sample,
10
- get_dataset_sample_with_indices,
11
- get_available_splits,
12
- )
13
  from hf_eda_mcp.tools.analysis import analyze_dataset_features
14
 
15
  __all__ = [
@@ -18,8 +14,6 @@ __all__ = [
18
 
19
  # Sampling tools
20
  'get_dataset_sample',
21
- 'get_dataset_sample_with_indices',
22
- 'get_available_splits',
23
 
24
  # Analysis tools
25
  'analyze_dataset_features',
 
5
  """
6
 
7
  from hf_eda_mcp.tools.metadata import get_dataset_metadata
8
+ from hf_eda_mcp.tools.sampling import get_dataset_sample
 
 
 
 
9
  from hf_eda_mcp.tools.analysis import analyze_dataset_features
10
 
11
  __all__ = [
 
14
 
15
  # Sampling tools
16
  'get_dataset_sample',
 
 
17
 
18
  # Analysis tools
19
  'analyze_dataset_features',
src/hf_eda_mcp/tools/analysis.py CHANGED
@@ -7,10 +7,10 @@ feature statistics and missing value analysis.
7
 
8
  import logging
9
  import statistics
 
10
  from typing import Optional, Dict, Any, List
11
  from collections import Counter
12
- from hf_eda_mcp.config import get_config
13
- from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
14
  from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
15
  from hf_eda_mcp.validation import (
16
  validate_dataset_id,
@@ -22,30 +22,17 @@ from hf_eda_mcp.validation import (
22
  )
23
  from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
24
 
25
- logger = logging.getLogger(__name__)
26
 
27
- # Global dataset service instance
28
- _dataset_service: Optional[DatasetService] = None
29
 
30
  # Default constants (can be overridden by config)
31
  DEFAULT_ANALYSIS_SAMPLE_SIZE = 1000
32
  MAX_UNIQUE_VALUES_TO_SHOW = 20
33
 
34
 
35
- def get_dataset_service() -> DatasetService:
36
- """Get or create the global dataset service instance using current config."""
37
- global _dataset_service
38
- if _dataset_service is None:
39
- config = get_config()
40
- _dataset_service = DatasetService(
41
- cache_dir=config.cache_dir,
42
- token=config.hf_token
43
- )
44
- return _dataset_service
45
-
46
-
47
  def analyze_dataset_features(
48
  dataset_id: str,
 
49
  split: str = "train",
50
  sample_size: int = DEFAULT_ANALYSIS_SAMPLE_SIZE,
51
  config_name: Optional[str] = None,
@@ -118,7 +105,7 @@ def analyze_dataset_features(
118
 
119
  try:
120
  # Get dataset service
121
- service = get_dataset_service()
122
 
123
  # Try to get statistics from Dataset Viewer API first (more efficient and complete)
124
  viewer_stats = service.get_dataset_statistics(
@@ -198,7 +185,6 @@ def analyze_dataset_features(
198
 
199
  except AuthenticationError as e:
200
  log_error_with_context(e, context, level=logging.WARNING)
201
- context["has_token"] = get_dataset_service().is_authenticated
202
  error_response = format_error_response(e, context)
203
  logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
204
  raise
 
7
 
8
  import logging
9
  import statistics
10
+ import gradio as gr
11
  from typing import Optional, Dict, Any, List
12
  from collections import Counter
13
+ from hf_eda_mcp.services.dataset_service import get_dataset_service, DatasetServiceError
 
14
  from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
15
  from hf_eda_mcp.validation import (
16
  validate_dataset_id,
 
22
  )
23
  from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
24
 
 
25
 
26
+ logger = logging.getLogger(__name__)
 
27
 
28
  # Default constants (can be overridden by config)
29
  DEFAULT_ANALYSIS_SAMPLE_SIZE = 1000
30
  MAX_UNIQUE_VALUES_TO_SHOW = 20
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def analyze_dataset_features(
34
  dataset_id: str,
35
+ hf_api_token: gr.Header,
36
  split: str = "train",
37
  sample_size: int = DEFAULT_ANALYSIS_SAMPLE_SIZE,
38
  config_name: Optional[str] = None,
 
105
 
106
  try:
107
  # Get dataset service
108
+ service = get_dataset_service(hf_api_token=hf_api_token)
109
 
110
  # Try to get statistics from Dataset Viewer API first (more efficient and complete)
111
  viewer_stats = service.get_dataset_statistics(
 
185
 
186
  except AuthenticationError as e:
187
  log_error_with_context(e, context, level=logging.WARNING)
 
188
  error_response = format_error_response(e, context)
189
  logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
190
  raise
src/hf_eda_mcp/tools/metadata.py CHANGED
@@ -6,9 +6,9 @@ HuggingFace datasets including size, features, splits, and configuration details
6
  """
7
 
8
  import logging
 
9
  from typing import Optional, Dict, Any
10
- from hf_eda_mcp.config import get_config
11
- from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
12
  from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
13
  from hf_eda_mcp.validation import (
14
  validate_dataset_id,
@@ -18,25 +18,11 @@ from hf_eda_mcp.validation import (
18
  )
19
  from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
20
 
21
- logger = logging.getLogger(__name__)
22
-
23
- # Global dataset service instance
24
- _dataset_service: Optional[DatasetService] = None
25
 
26
-
27
- def get_dataset_service() -> DatasetService:
28
- """Get or create the global dataset service instance using current config."""
29
- global _dataset_service
30
- if _dataset_service is None:
31
- config = get_config()
32
- _dataset_service = DatasetService(
33
- cache_dir=config.cache_dir,
34
- token=config.hf_token
35
- )
36
- return _dataset_service
37
 
38
 
39
- def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) -> Dict[str, Any]:
40
  """
41
  Retrieve comprehensive metadata for a HuggingFace dataset.
42
 
@@ -46,6 +32,7 @@ def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) ->
46
 
47
  Args:
48
  dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue', 'imdb')
 
49
  config_name: Optional configuration name for multi-config datasets
50
 
51
  Returns:
@@ -102,7 +89,7 @@ def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) ->
102
 
103
  try:
104
  # Get dataset service and retrieve metadata
105
- service = get_dataset_service()
106
  metadata = service.load_dataset_info(dataset_id, config_name)
107
 
108
  # Add the requested config name to the response if specified
@@ -156,7 +143,6 @@ def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) ->
156
  except AuthenticationError as e:
157
  # Add helpful context to the error
158
  log_error_with_context(e, context, level=logging.WARNING)
159
- context["has_token"] = get_dataset_service().is_authenticated
160
  error_response = format_error_response(e, context)
161
  logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
162
  raise
 
6
  """
7
 
8
  import logging
9
+ import gradio as gr
10
  from typing import Optional, Dict, Any
11
+ from hf_eda_mcp.services.dataset_service import DatasetServiceError, get_dataset_service
 
12
  from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
13
  from hf_eda_mcp.validation import (
14
  validate_dataset_id,
 
18
  )
19
  from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
20
 
 
 
 
 
21
 
22
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
 
23
 
24
 
25
+ def get_dataset_metadata(dataset_id: str, hf_api_token: gr.Header, config_name: Optional[str] = None) -> Dict[str, Any]:
26
  """
27
  Retrieve comprehensive metadata for a HuggingFace dataset.
28
 
 
32
 
33
  Args:
34
  dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue', 'imdb')
35
 + hf_api_token: HF API token extracted by Gradio from the request headers when the MCP client configuration supplies one
36
  config_name: Optional configuration name for multi-config datasets
37
 
38
  Returns:
 
89
 
90
  try:
91
  # Get dataset service and retrieve metadata
92
+ service = get_dataset_service(hf_api_token=hf_api_token)
93
  metadata = service.load_dataset_info(dataset_id, config_name)
94
 
95
  # Add the requested config name to the response if specified
 
143
  except AuthenticationError as e:
144
  # Add helpful context to the error
145
  log_error_with_context(e, context, level=logging.WARNING)
 
146
  error_response = format_error_response(e, context)
147
  logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
148
  raise
src/hf_eda_mcp/tools/sampling.py CHANGED
@@ -6,16 +6,16 @@ with support for different splits, configurable sample sizes, and streaming for
6
  """
7
 
8
  import logging
9
- from typing import Optional, Dict, Any, List
 
10
  from hf_eda_mcp.config import get_config
11
- from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
12
  from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
13
  from hf_eda_mcp.validation import (
14
  validate_dataset_id,
15
  validate_config_name,
16
  validate_split_name,
17
  validate_sample_size,
18
- validate_indices,
19
  ValidationError,
20
  format_validation_error,
21
  )
@@ -23,27 +23,14 @@ from hf_eda_mcp.error_handling import format_error_response, log_error_with_cont
23
 
24
  logger = logging.getLogger(__name__)
25
 
26
- # Global dataset service instance
27
- _dataset_service: Optional[DatasetService] = None
28
-
29
  # Default constants (can be overridden by config)
30
  DEFAULT_SAMPLE_SIZE = 10
31
  VALID_SPLITS = {"train", "validation", "test", "dev", "val"}
32
 
33
 
34
- def get_dataset_service() -> DatasetService:
35
- """Get or create the global dataset service instance using current config."""
36
- global _dataset_service
37
- if _dataset_service is None:
38
- config = get_config()
39
- _dataset_service = DatasetService(
40
- cache_dir=config.cache_dir, token=config.hf_token
41
- )
42
- return _dataset_service
43
-
44
-
45
  def get_dataset_sample(
46
  dataset_id: str,
 
47
  split: str = "train",
48
  num_samples: int = DEFAULT_SAMPLE_SIZE,
49
  config_name: Optional[str] = None,
@@ -121,7 +108,7 @@ def get_dataset_sample(
121
 
122
  try:
123
  # Get dataset service and load sample
124
- service = get_dataset_service()
125
  sample_data = service.load_dataset_sample(
126
  dataset_id=dataset_id,
127
  split=split,
@@ -169,7 +156,6 @@ def get_dataset_sample(
169
 
170
  except AuthenticationError as e:
171
  log_error_with_context(e, context, level=logging.WARNING)
172
- context["has_token"] = get_dataset_service().is_authenticated
173
  error_response = format_error_response(e, context)
174
  logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
175
  raise
@@ -185,92 +171,92 @@ def get_dataset_sample(
185
  raise DatasetServiceError(f"Failed to sample dataset: {str(e)}") from e
186
 
187
 
188
- def get_dataset_sample_with_indices(
189
- dataset_id: str,
190
- indices: List[int],
191
- split: str = "train",
192
- config_name: Optional[str] = None,
193
- ) -> Dict[str, Any]:
194
- """
195
- Retrieve specific samples by their indices from a HuggingFace dataset.
196
-
197
- This function allows for targeted sampling by specifying exact row indices.
198
- Note: This requires loading the dataset in non-streaming mode.
199
-
200
- Args:
201
- dataset_id: HuggingFace dataset identifier
202
- indices: List of row indices to retrieve
203
- split: Dataset split to sample from (default: 'train')
204
- config_name: Optional configuration name for multi-config datasets
205
-
206
- Returns:
207
- Dictionary containing the requested samples and metadata
208
-
209
- Raises:
210
- ValueError: If inputs are invalid
211
- DatasetServiceError: If sampling fails
212
- """
213
- # Handle empty strings from Gradio (convert to None)
214
- if config_name == "":
215
- config_name = None
216
 
217
- # Input validation using centralized validation
218
- try:
219
- dataset_id = validate_dataset_id(dataset_id)
220
- config_name = validate_config_name(config_name)
221
- split = validate_split_name(split)
222
- indices = validate_indices(indices)
223
- except ValidationError as e:
224
- logger.error(f"Validation error: {format_validation_error(e)}")
225
- raise ValueError(format_validation_error(e))
226
-
227
- logger.info(f"Sampling {len(indices)} specific indices from dataset: {dataset_id}")
228
-
229
- try:
230
- from datasets import load_dataset
231
-
232
- # Load dataset without streaming to access by index
233
- dataset = load_dataset(
234
- dataset_id, name=config_name, split=split, streaming=False
235
- )
236
-
237
- # Validate indices are within bounds
238
- max_index = max(indices)
239
- if max_index >= len(dataset):
240
- raise ValueError(
241
- f"Index {max_index} is out of bounds for dataset with {len(dataset)} rows"
242
- )
243
-
244
- # Get samples by indices
245
- samples = [dataset[i] for i in indices]
246
-
247
- # Get dataset info for schema
248
- service = get_dataset_service()
249
- dataset_info = service.load_dataset_info(dataset_id, config_name)
250
-
251
- # Prepare response
252
- sample_data = {
253
- "dataset_id": dataset_id,
254
- "config_name": config_name,
255
- "split": split,
256
- "num_samples": len(samples),
257
- "requested_indices": indices,
258
- "data": samples,
259
- "schema": dataset_info.get("features", {}),
260
- "sample_info": {
261
- "sampling_strategy": "by_indices",
262
- "streaming_used": False,
263
- "indices_requested": len(indices),
264
- },
265
- }
266
-
267
- sample_data["summary"] = _generate_sample_summary(sample_data)
268
-
269
- return sample_data
270
-
271
- except Exception as e:
272
- logger.error(f"Failed to sample by indices from {dataset_id}: {str(e)}")
273
- raise DatasetServiceError(f"Failed to sample by indices: {str(e)}")
274
 
275
 
276
  def _generate_sample_summary(sample_data: Dict[str, Any]) -> str:
@@ -307,28 +293,3 @@ def _generate_sample_summary(sample_data: Dict[str, Any]) -> str:
307
  summary_parts.append("Strategy: first N rows")
308
 
309
  return " | ".join(summary_parts)
310
-
311
-
312
- def get_available_splits(
313
- dataset_id: str, config_name: Optional[str] = None
314
- ) -> List[str]:
315
- """
316
- Get available splits for a dataset.
317
-
318
- Args:
319
- dataset_id: HuggingFace dataset identifier
320
- config_name: Optional configuration name
321
-
322
- Returns:
323
- List of available split names
324
-
325
- Raises:
326
- DatasetServiceError: If unable to retrieve split information
327
- """
328
- try:
329
- service = get_dataset_service()
330
- metadata = service.load_dataset_info(dataset_id, config_name)
331
- return list(metadata.get("splits", {}).keys())
332
- except Exception as e:
333
- logger.error(f"Failed to get splits for {dataset_id}: {str(e)}")
334
- raise DatasetServiceError(f"Failed to get available splits: {str(e)}")
 
6
  """
7
 
8
  import logging
9
+ import gradio as gr
10
+ from typing import Optional, Dict, Any
11
  from hf_eda_mcp.config import get_config
12
+ from hf_eda_mcp.services.dataset_service import get_dataset_service, DatasetServiceError
13
  from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
14
  from hf_eda_mcp.validation import (
15
  validate_dataset_id,
16
  validate_config_name,
17
  validate_split_name,
18
  validate_sample_size,
 
19
  ValidationError,
20
  format_validation_error,
21
  )
 
23
 
24
  logger = logging.getLogger(__name__)
25
 
 
 
 
26
  # Default constants (can be overridden by config)
27
  DEFAULT_SAMPLE_SIZE = 10
28
  VALID_SPLITS = {"train", "validation", "test", "dev", "val"}
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
31
  def get_dataset_sample(
32
  dataset_id: str,
33
+ hf_api_token: gr.Header,
34
  split: str = "train",
35
  num_samples: int = DEFAULT_SAMPLE_SIZE,
36
  config_name: Optional[str] = None,
 
108
 
109
  try:
110
  # Get dataset service and load sample
111
+ service = get_dataset_service(hf_api_token=hf_api_token)
112
  sample_data = service.load_dataset_sample(
113
  dataset_id=dataset_id,
114
  split=split,
 
156
 
157
  except AuthenticationError as e:
158
  log_error_with_context(e, context, level=logging.WARNING)
 
159
  error_response = format_error_response(e, context)
160
  logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
161
  raise
 
171
  raise DatasetServiceError(f"Failed to sample dataset: {str(e)}") from e
172
 
173
 
174
+ # def get_dataset_sample_with_indices(
175
+ # dataset_id: str,
176
+ # indices: List[int],
177
+ # split: str = "train",
178
+ # config_name: Optional[str] = None,
179
+ # ) -> Dict[str, Any]:
180
+ # """
181
+ # Retrieve specific samples by their indices from a HuggingFace dataset.
182
+
183
+ # This function allows for targeted sampling by specifying exact row indices.
184
+ # Note: This requires loading the dataset in non-streaming mode.
185
+
186
+ # Args:
187
+ # dataset_id: HuggingFace dataset identifier
188
+ # indices: List of row indices to retrieve
189
+ # split: Dataset split to sample from (default: 'train')
190
+ # config_name: Optional configuration name for multi-config datasets
191
+
192
+ # Returns:
193
+ # Dictionary containing the requested samples and metadata
194
+
195
+ # Raises:
196
+ # ValueError: If inputs are invalid
197
+ # DatasetServiceError: If sampling fails
198
+ # """
199
+ # # Handle empty strings from Gradio (convert to None)
200
+ # if config_name == "":
201
+ # config_name = None
202
 
203
+ # # Input validation using centralized validation
204
+ # try:
205
+ # dataset_id = validate_dataset_id(dataset_id)
206
+ # config_name = validate_config_name(config_name)
207
+ # split = validate_split_name(split)
208
+ # indices = validate_indices(indices)
209
+ # except ValidationError as e:
210
+ # logger.error(f"Validation error: {format_validation_error(e)}")
211
+ # raise ValueError(format_validation_error(e))
212
+
213
+ # logger.info(f"Sampling {len(indices)} specific indices from dataset: {dataset_id}")
214
+
215
+ # try:
216
+ # from datasets import load_dataset
217
+
218
+ # # Load dataset without streaming to access by index
219
+ # dataset = load_dataset(
220
+ # dataset_id, name=config_name, split=split, streaming=False
221
+ # )
222
+
223
+ # # Validate indices are within bounds
224
+ # max_index = max(indices)
225
+ # if max_index >= len(dataset):
226
+ # raise ValueError(
227
+ # f"Index {max_index} is out of bounds for dataset with {len(dataset)} rows"
228
+ # )
229
+
230
+ # # Get samples by indices
231
+ # samples = [dataset[i] for i in indices]
232
+
233
+ # # Get dataset info for schema
234
+ # service = get_dataset_service(hf_api_token=hf_api_token)
235
+ # dataset_info = service.load_dataset_info(dataset_id, config_name)
236
+
237
+ # # Prepare response
238
+ # sample_data = {
239
+ # "dataset_id": dataset_id,
240
+ # "config_name": config_name,
241
+ # "split": split,
242
+ # "num_samples": len(samples),
243
+ # "requested_indices": indices,
244
+ # "data": samples,
245
+ # "schema": dataset_info.get("features", {}),
246
+ # "sample_info": {
247
+ # "sampling_strategy": "by_indices",
248
+ # "streaming_used": False,
249
+ # "indices_requested": len(indices),
250
+ # },
251
+ # }
252
+
253
+ # sample_data["summary"] = _generate_sample_summary(sample_data)
254
+
255
+ # return sample_data
256
+
257
+ # except Exception as e:
258
+ # logger.error(f"Failed to sample by indices from {dataset_id}: {str(e)}")
259
+ # raise DatasetServiceError(f"Failed to sample by indices: {str(e)}")
260
 
261
 
262
  def _generate_sample_summary(sample_data: Dict[str, Any]) -> str:
 
293
  summary_parts.append("Strategy: first N rows")
294
 
295
  return " | ".join(summary_parts)