Spaces:
Running
Running
Commit
·
2b910cc
1
Parent(s):
43642a4
Use hf_token provided in mcp headers
Browse files
- README.md +7 -1
- src/hf_eda_mcp/error_handling.py +1 -15
- src/hf_eda_mcp/integrations/hf_client.py +1 -9
- src/hf_eda_mcp/server.py +2 -24
- src/hf_eda_mcp/services/dataset_service.py +12 -4
- src/hf_eda_mcp/services/dataset_viewer_adapter.py +4 -6
- src/hf_eda_mcp/tools/__init__.py +1 -7
- src/hf_eda_mcp/tools/analysis.py +5 -19
- src/hf_eda_mcp/tools/metadata.py +6 -20
- src/hf_eda_mcp/tools/sampling.py +90 -129
README.md
CHANGED
|
@@ -56,7 +56,13 @@ Replace `YOUR-USERNAME` with your HuggingFace username.
|
|
| 56 |
|
| 57 |
## Authentication
|
| 58 |
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
## License
|
| 62 |
|
|
|
|
| 56 |
|
| 57 |
## Authentication
|
| 58 |
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
## To Do List
|
| 62 |
+
|
| 63 |
+
- [ ] Security: Do not cache when a dataset is private or gated
|
| 64 |
+
- [ ] Complete MCP server configuration and documentation
|
| 65 |
+
|
| 66 |
|
| 67 |
## License
|
| 68 |
|
src/hf_eda_mcp/error_handling.py
CHANGED
|
@@ -238,7 +238,6 @@ def get_dataset_suggestions(dataset_id: str) -> List[str]:
|
|
| 238 |
def format_authentication_error(
|
| 239 |
dataset_id: str,
|
| 240 |
is_gated: bool = False,
|
| 241 |
-
has_token: bool = False
|
| 242 |
) -> Dict[str, Any]:
|
| 243 |
"""
|
| 244 |
Format authentication error with helpful guidance.
|
|
@@ -246,7 +245,6 @@ def format_authentication_error(
|
|
| 246 |
Args:
|
| 247 |
dataset_id: The dataset identifier
|
| 248 |
is_gated: Whether the dataset is gated (requires approval)
|
| 249 |
-
has_token: Whether a token was provided
|
| 250 |
|
| 251 |
Returns:
|
| 252 |
Dictionary with error details and suggestions
|
|
@@ -255,7 +253,6 @@ def format_authentication_error(
|
|
| 255 |
"error_type": "authentication_error",
|
| 256 |
"dataset_id": dataset_id,
|
| 257 |
"is_gated": is_gated,
|
| 258 |
-
"has_token": has_token,
|
| 259 |
"message": "",
|
| 260 |
"suggestions": []
|
| 261 |
}
|
|
@@ -270,16 +267,6 @@ def format_authentication_error(
|
|
| 270 |
"Provide a valid HuggingFace token after receiving access",
|
| 271 |
"Check your HuggingFace account for access status"
|
| 272 |
]
|
| 273 |
-
elif not has_token:
|
| 274 |
-
error_details["message"] = (
|
| 275 |
-
f"Dataset '{dataset_id}' is private and requires authentication."
|
| 276 |
-
)
|
| 277 |
-
error_details["suggestions"] = [
|
| 278 |
-
"Provide a HuggingFace authentication token",
|
| 279 |
-
"Create a token at: https://huggingface.co/settings/tokens",
|
| 280 |
-
"Set the token in your environment: HF_TOKEN=your_token",
|
| 281 |
-
"Ensure the token has read access to datasets"
|
| 282 |
-
]
|
| 283 |
else:
|
| 284 |
error_details["message"] = (
|
| 285 |
f"Authentication failed for dataset '{dataset_id}'. "
|
|
@@ -381,8 +368,7 @@ def format_error_response(
|
|
| 381 |
elif isinstance(error, AuthenticationError):
|
| 382 |
dataset_id = context.get("dataset_id", "unknown")
|
| 383 |
is_gated = "gated" in str(error).lower()
|
| 384 |
-
|
| 385 |
-
return format_authentication_error(dataset_id, is_gated, has_token)
|
| 386 |
|
| 387 |
elif isinstance(error, NetworkError):
|
| 388 |
operation = context.get("operation", "operation")
|
|
|
|
| 238 |
def format_authentication_error(
|
| 239 |
dataset_id: str,
|
| 240 |
is_gated: bool = False,
|
|
|
|
| 241 |
) -> Dict[str, Any]:
|
| 242 |
"""
|
| 243 |
Format authentication error with helpful guidance.
|
|
|
|
| 245 |
Args:
|
| 246 |
dataset_id: The dataset identifier
|
| 247 |
is_gated: Whether the dataset is gated (requires approval)
|
|
|
|
| 248 |
|
| 249 |
Returns:
|
| 250 |
Dictionary with error details and suggestions
|
|
|
|
| 253 |
"error_type": "authentication_error",
|
| 254 |
"dataset_id": dataset_id,
|
| 255 |
"is_gated": is_gated,
|
|
|
|
| 256 |
"message": "",
|
| 257 |
"suggestions": []
|
| 258 |
}
|
|
|
|
| 267 |
"Provide a valid HuggingFace token after receiving access",
|
| 268 |
"Check your HuggingFace account for access status"
|
| 269 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
else:
|
| 271 |
error_details["message"] = (
|
| 272 |
f"Authentication failed for dataset '{dataset_id}'. "
|
|
|
|
| 368 |
elif isinstance(error, AuthenticationError):
|
| 369 |
dataset_id = context.get("dataset_id", "unknown")
|
| 370 |
is_gated = "gated" in str(error).lower()
|
| 371 |
+
return format_authentication_error(dataset_id, is_gated)
|
|
|
|
| 372 |
|
| 373 |
elif isinstance(error, NetworkError):
|
| 374 |
operation = context.get("operation", "operation")
|
src/hf_eda_mcp/integrations/hf_client.py
CHANGED
|
@@ -62,10 +62,7 @@ class HfClient:
|
|
| 62 |
"""
|
| 63 |
self.token = token
|
| 64 |
self.api = HfApi(token=token)
|
| 65 |
-
self.
|
| 66 |
-
|
| 67 |
-
if token:
|
| 68 |
-
self._authenticate()
|
| 69 |
|
| 70 |
def _authenticate(self) -> None:
|
| 71 |
"""
|
|
@@ -299,8 +296,3 @@ class HfClient:
|
|
| 299 |
except Exception:
|
| 300 |
# For other errors (network, etc.), assume dataset exists but there's a temporary issue
|
| 301 |
return True
|
| 302 |
-
|
| 303 |
-
@property
|
| 304 |
-
def is_authenticated(self) -> bool:
|
| 305 |
-
"""Check if client is authenticated."""
|
| 306 |
-
return self._authenticated
|
|
|
|
| 62 |
"""
|
| 63 |
self.token = token
|
| 64 |
self.api = HfApi(token=token)
|
| 65 |
+
self._authenticate()
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
def _authenticate(self) -> None:
|
| 68 |
"""
|
|
|
|
| 296 |
except Exception:
|
| 297 |
# For other errors (network, etc.), assume dataset exists but there's a temporary issue
|
| 298 |
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/hf_eda_mcp/server.py
CHANGED
|
@@ -9,16 +9,10 @@ import gradio as gr
|
|
| 9 |
import sys
|
| 10 |
from typing import Optional
|
| 11 |
|
| 12 |
-
# Import configuration
|
| 13 |
-
from hf_eda_mcp.config import ServerConfig, setup_logging, validate_config, set_config
|
| 14 |
-
|
| 15 |
-
# Import EDA tools - these will be automatically exposed as MCP tools
|
| 16 |
from hf_eda_mcp.tools.metadata import get_dataset_metadata
|
| 17 |
from hf_eda_mcp.tools.sampling import get_dataset_sample
|
| 18 |
from hf_eda_mcp.tools.analysis import analyze_dataset_features
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
# These functions will be automatically exposed as MCP tools when mcp_server=True
|
| 22 |
|
| 23 |
|
| 24 |
def create_gradio_app(config: ServerConfig) -> gr.Blocks:
|
|
@@ -163,27 +157,11 @@ def create_gradio_app(config: ServerConfig) -> gr.Blocks:
|
|
| 163 |
3. **analyze_dataset_features**: Perform exploratory data analysis
|
| 164 |
|
| 165 |
### MCP Server Configuration
|
| 166 |
-
|
| 167 |
-
To connect MCP clients to this server, use:
|
| 168 |
-
|
| 169 |
-
```json
|
| 170 |
-
{{
|
| 171 |
-
"mcpServers": {{
|
| 172 |
-
"hf-eda-mcp-server": {{
|
| 173 |
-
"command": "pdm",
|
| 174 |
-
"args": ["run", "hf-eda-mcp"],
|
| 175 |
-
"env": {{
|
| 176 |
-
"HF_TOKEN": "your_huggingface_token_here"
|
| 177 |
-
}}
|
| 178 |
-
}}
|
| 179 |
-
}}
|
| 180 |
-
}}
|
| 181 |
-
```
|
| 182 |
|
| 183 |
### Server Status
|
| 184 |
|
| 185 |
- **MCP Tools**: 3 tools available
|
| 186 |
-
- **Authentication**: {"✅ Token configured" if config.hf_token else "⚠️ No token (public datasets only)"}
|
| 187 |
- **MCP Schema**: Available at `/gradio_api/mcp/schema`
|
| 188 |
- **Cache Directory**: {config.cache_dir or "Default system cache"}
|
| 189 |
- **Max Sample Size**: {config.max_sample_size:,}
|
|
|
|
| 9 |
import sys
|
| 10 |
from typing import Optional
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
from hf_eda_mcp.tools.metadata import get_dataset_metadata
|
| 13 |
from hf_eda_mcp.tools.sampling import get_dataset_sample
|
| 14 |
from hf_eda_mcp.tools.analysis import analyze_dataset_features
|
| 15 |
+
from hf_eda_mcp.config import ServerConfig, setup_logging, validate_config, set_config
|
|
|
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
def create_gradio_app(config: ServerConfig) -> gr.Blocks:
|
|
|
|
| 157 |
3. **analyze_dataset_features**: Perform exploratory data analysis
|
| 158 |
|
| 159 |
### MCP Server Configuration
|
| 160 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
### Server Status
|
| 163 |
|
| 164 |
- **MCP Tools**: 3 tools available
|
|
|
|
| 165 |
- **MCP Schema**: Available at `/gradio_api/mcp/schema`
|
| 166 |
- **Cache Directory**: {config.cache_dir or "Default system cache"}
|
| 167 |
- **Max Sample Size**: {config.max_sample_size:,}
|
src/hf_eda_mcp/services/dataset_service.py
CHANGED
|
@@ -14,6 +14,7 @@ from pathlib import Path
|
|
| 14 |
from datasets import load_dataset
|
| 15 |
from datasets.utils.logging import disable_progress_bar
|
| 16 |
|
|
|
|
| 17 |
from hf_eda_mcp.integrations.hf_client import (
|
| 18 |
HfClient,
|
| 19 |
DatasetNotFoundError,
|
|
@@ -806,7 +807,14 @@ class DatasetService:
|
|
| 806 |
"""
|
| 807 |
return self.hf_client.validate_dataset_access(dataset_id, config_name)
|
| 808 |
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
from datasets import load_dataset
|
| 15 |
from datasets.utils.logging import disable_progress_bar
|
| 16 |
|
| 17 |
+
from hf_eda_mcp.config import get_config
|
| 18 |
from hf_eda_mcp.integrations.hf_client import (
|
| 19 |
HfClient,
|
| 20 |
DatasetNotFoundError,
|
|
|
|
| 807 |
"""
|
| 808 |
return self.hf_client.validate_dataset_access(dataset_id, config_name)
|
| 809 |
|
| 810 |
+
|
| 811 |
+
def get_dataset_service(hf_api_token: str) -> DatasetService:
|
| 812 |
+
"""Create a dataset service for the given token, falling back to the configured token when none is provided."""
|
| 813 |
+
config = get_config()
|
| 814 |
+
if hf_api_token is None:
|
| 815 |
+
hf_api_token = config.hf_token
|
| 816 |
+
dataset_service = DatasetService(
|
| 817 |
+
cache_dir=config.cache_dir,
|
| 818 |
+
token=hf_api_token
|
| 819 |
+
)
|
| 820 |
+
return dataset_service
|
src/hf_eda_mcp/services/dataset_viewer_adapter.py
CHANGED
|
@@ -22,7 +22,7 @@ class DatasetViewerAdapter():
|
|
| 22 |
|
| 23 |
def __init__(
|
| 24 |
self,
|
| 25 |
-
token:
|
| 26 |
):
|
| 27 |
"""
|
| 28 |
Initialize dataset service with optional caching and authentication.
|
|
@@ -32,8 +32,6 @@ class DatasetViewerAdapter():
|
|
| 32 |
"""
|
| 33 |
if token:
|
| 34 |
self.token = token
|
| 35 |
-
else:
|
| 36 |
-
self.token = os.environ.get("HF_TOKEN")
|
| 37 |
self.base_url = "https://datasets-server.huggingface.co/"
|
| 38 |
|
| 39 |
def _api_get(self, route: str, params: dict, extra_headers: Optional[dict] = None) -> dict:
|
|
@@ -160,7 +158,7 @@ class DatasetViewerAdapter():
|
|
| 160 |
self,
|
| 161 |
dataset_name: str,
|
| 162 |
config: str,
|
| 163 |
-
split_name: str
|
| 164 |
) -> dict:
|
| 165 |
"""
|
| 166 |
Get detailed statistics for a dataset split from the Dataset Viewer API.
|
|
@@ -200,7 +198,7 @@ class DatasetViewerAdapter():
|
|
| 200 |
try:
|
| 201 |
result = self._api_get(
|
| 202 |
route="statistics",
|
| 203 |
-
params=params
|
| 204 |
)
|
| 205 |
|
| 206 |
# Check for errors in response
|
|
@@ -222,7 +220,7 @@ class DatasetViewerAdapter():
|
|
| 222 |
|
| 223 |
def check_statistics_availability(
|
| 224 |
self,
|
| 225 |
-
dataset_name: str,
|
| 226 |
config: Optional[str] = None
|
| 227 |
) -> dict:
|
| 228 |
"""
|
|
|
|
| 22 |
|
| 23 |
def __init__(
|
| 24 |
self,
|
| 25 |
+
token: str,
|
| 26 |
):
|
| 27 |
"""
|
| 28 |
Initialize dataset service with optional caching and authentication.
|
|
|
|
| 32 |
"""
|
| 33 |
if token:
|
| 34 |
self.token = token
|
|
|
|
|
|
|
| 35 |
self.base_url = "https://datasets-server.huggingface.co/"
|
| 36 |
|
| 37 |
def _api_get(self, route: str, params: dict, extra_headers: Optional[dict] = None) -> dict:
|
|
|
|
| 158 |
self,
|
| 159 |
dataset_name: str,
|
| 160 |
config: str,
|
| 161 |
+
split_name: str,
|
| 162 |
) -> dict:
|
| 163 |
"""
|
| 164 |
Get detailed statistics for a dataset split from the Dataset Viewer API.
|
|
|
|
| 198 |
try:
|
| 199 |
result = self._api_get(
|
| 200 |
route="statistics",
|
| 201 |
+
params=params,
|
| 202 |
)
|
| 203 |
|
| 204 |
# Check for errors in response
|
|
|
|
| 220 |
|
| 221 |
def check_statistics_availability(
|
| 222 |
self,
|
| 223 |
+
dataset_name: str,
|
| 224 |
config: Optional[str] = None
|
| 225 |
) -> dict:
|
| 226 |
"""
|
src/hf_eda_mcp/tools/__init__.py
CHANGED
|
@@ -5,11 +5,7 @@ This package contains individual EDA functions that will be exposed as MCP tools
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
from hf_eda_mcp.tools.metadata import get_dataset_metadata
|
| 8 |
-
from hf_eda_mcp.tools.sampling import
|
| 9 |
-
get_dataset_sample,
|
| 10 |
-
get_dataset_sample_with_indices,
|
| 11 |
-
get_available_splits,
|
| 12 |
-
)
|
| 13 |
from hf_eda_mcp.tools.analysis import analyze_dataset_features
|
| 14 |
|
| 15 |
__all__ = [
|
|
@@ -18,8 +14,6 @@ __all__ = [
|
|
| 18 |
|
| 19 |
# Sampling tools
|
| 20 |
'get_dataset_sample',
|
| 21 |
-
'get_dataset_sample_with_indices',
|
| 22 |
-
'get_available_splits',
|
| 23 |
|
| 24 |
# Analysis tools
|
| 25 |
'analyze_dataset_features',
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
from hf_eda_mcp.tools.metadata import get_dataset_metadata
|
| 8 |
+
from hf_eda_mcp.tools.sampling import get_dataset_sample
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
from hf_eda_mcp.tools.analysis import analyze_dataset_features
|
| 10 |
|
| 11 |
__all__ = [
|
|
|
|
| 14 |
|
| 15 |
# Sampling tools
|
| 16 |
'get_dataset_sample',
|
|
|
|
|
|
|
| 17 |
|
| 18 |
# Analysis tools
|
| 19 |
'analyze_dataset_features',
|
src/hf_eda_mcp/tools/analysis.py
CHANGED
|
@@ -7,10 +7,10 @@ feature statistics and missing value analysis.
|
|
| 7 |
|
| 8 |
import logging
|
| 9 |
import statistics
|
|
|
|
| 10 |
from typing import Optional, Dict, Any, List
|
| 11 |
from collections import Counter
|
| 12 |
-
from hf_eda_mcp.
|
| 13 |
-
from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
|
| 14 |
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
|
| 15 |
from hf_eda_mcp.validation import (
|
| 16 |
validate_dataset_id,
|
|
@@ -22,30 +22,17 @@ from hf_eda_mcp.validation import (
|
|
| 22 |
)
|
| 23 |
from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
|
| 24 |
|
| 25 |
-
logger = logging.getLogger(__name__)
|
| 26 |
|
| 27 |
-
|
| 28 |
-
_dataset_service: Optional[DatasetService] = None
|
| 29 |
|
| 30 |
# Default constants (can be overridden by config)
|
| 31 |
DEFAULT_ANALYSIS_SAMPLE_SIZE = 1000
|
| 32 |
MAX_UNIQUE_VALUES_TO_SHOW = 20
|
| 33 |
|
| 34 |
|
| 35 |
-
def get_dataset_service() -> DatasetService:
|
| 36 |
-
"""Get or create the global dataset service instance using current config."""
|
| 37 |
-
global _dataset_service
|
| 38 |
-
if _dataset_service is None:
|
| 39 |
-
config = get_config()
|
| 40 |
-
_dataset_service = DatasetService(
|
| 41 |
-
cache_dir=config.cache_dir,
|
| 42 |
-
token=config.hf_token
|
| 43 |
-
)
|
| 44 |
-
return _dataset_service
|
| 45 |
-
|
| 46 |
-
|
| 47 |
def analyze_dataset_features(
|
| 48 |
dataset_id: str,
|
|
|
|
| 49 |
split: str = "train",
|
| 50 |
sample_size: int = DEFAULT_ANALYSIS_SAMPLE_SIZE,
|
| 51 |
config_name: Optional[str] = None,
|
|
@@ -118,7 +105,7 @@ def analyze_dataset_features(
|
|
| 118 |
|
| 119 |
try:
|
| 120 |
# Get dataset service
|
| 121 |
-
service = get_dataset_service()
|
| 122 |
|
| 123 |
# Try to get statistics from Dataset Viewer API first (more efficient and complete)
|
| 124 |
viewer_stats = service.get_dataset_statistics(
|
|
@@ -198,7 +185,6 @@ def analyze_dataset_features(
|
|
| 198 |
|
| 199 |
except AuthenticationError as e:
|
| 200 |
log_error_with_context(e, context, level=logging.WARNING)
|
| 201 |
-
context["has_token"] = get_dataset_service().is_authenticated
|
| 202 |
error_response = format_error_response(e, context)
|
| 203 |
logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
|
| 204 |
raise
|
|
|
|
| 7 |
|
| 8 |
import logging
|
| 9 |
import statistics
|
| 10 |
+
import gradio as gr
|
| 11 |
from typing import Optional, Dict, Any, List
|
| 12 |
from collections import Counter
|
| 13 |
+
from hf_eda_mcp.services.dataset_service import get_dataset_service, DatasetServiceError
|
|
|
|
| 14 |
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
|
| 15 |
from hf_eda_mcp.validation import (
|
| 16 |
validate_dataset_id,
|
|
|
|
| 22 |
)
|
| 23 |
from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
|
| 24 |
|
|
|
|
| 25 |
|
| 26 |
+
logger = logging.getLogger(__name__)
|
|
|
|
| 27 |
|
| 28 |
# Default constants (can be overridden by config)
|
| 29 |
DEFAULT_ANALYSIS_SAMPLE_SIZE = 1000
|
| 30 |
MAX_UNIQUE_VALUES_TO_SHOW = 20
|
| 31 |
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
def analyze_dataset_features(
|
| 34 |
dataset_id: str,
|
| 35 |
+
hf_api_token: gr.Header,
|
| 36 |
split: str = "train",
|
| 37 |
sample_size: int = DEFAULT_ANALYSIS_SAMPLE_SIZE,
|
| 38 |
config_name: Optional[str] = None,
|
|
|
|
| 105 |
|
| 106 |
try:
|
| 107 |
# Get dataset service
|
| 108 |
+
service = get_dataset_service(hf_api_token=hf_api_token)
|
| 109 |
|
| 110 |
# Try to get statistics from Dataset Viewer API first (more efficient and complete)
|
| 111 |
viewer_stats = service.get_dataset_statistics(
|
|
|
|
| 185 |
|
| 186 |
except AuthenticationError as e:
|
| 187 |
log_error_with_context(e, context, level=logging.WARNING)
|
|
|
|
| 188 |
error_response = format_error_response(e, context)
|
| 189 |
logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
|
| 190 |
raise
|
src/hf_eda_mcp/tools/metadata.py
CHANGED
|
@@ -6,9 +6,9 @@ HuggingFace datasets including size, features, splits, and configuration details
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
import logging
|
|
|
|
| 9 |
from typing import Optional, Dict, Any
|
| 10 |
-
from hf_eda_mcp.
|
| 11 |
-
from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
|
| 12 |
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
|
| 13 |
from hf_eda_mcp.validation import (
|
| 14 |
validate_dataset_id,
|
|
@@ -18,25 +18,11 @@ from hf_eda_mcp.validation import (
|
|
| 18 |
)
|
| 19 |
from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
|
| 20 |
|
| 21 |
-
logger = logging.getLogger(__name__)
|
| 22 |
-
|
| 23 |
-
# Global dataset service instance
|
| 24 |
-
_dataset_service: Optional[DatasetService] = None
|
| 25 |
|
| 26 |
-
|
| 27 |
-
def get_dataset_service() -> DatasetService:
|
| 28 |
-
"""Get or create the global dataset service instance using current config."""
|
| 29 |
-
global _dataset_service
|
| 30 |
-
if _dataset_service is None:
|
| 31 |
-
config = get_config()
|
| 32 |
-
_dataset_service = DatasetService(
|
| 33 |
-
cache_dir=config.cache_dir,
|
| 34 |
-
token=config.hf_token
|
| 35 |
-
)
|
| 36 |
-
return _dataset_service
|
| 37 |
|
| 38 |
|
| 39 |
-
def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) -> Dict[str, Any]:
|
| 40 |
"""
|
| 41 |
Retrieve comprehensive metadata for a HuggingFace dataset.
|
| 42 |
|
|
@@ -46,6 +32,7 @@ def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) ->
|
|
| 46 |
|
| 47 |
Args:
|
| 48 |
dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue', 'imdb')
|
|
|
|
| 49 |
config_name: Optional configuration name for multi-config datasets
|
| 50 |
|
| 51 |
Returns:
|
|
@@ -102,7 +89,7 @@ def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) ->
|
|
| 102 |
|
| 103 |
try:
|
| 104 |
# Get dataset service and retrieve metadata
|
| 105 |
-
service = get_dataset_service()
|
| 106 |
metadata = service.load_dataset_info(dataset_id, config_name)
|
| 107 |
|
| 108 |
# Add the requested config name to the response if specified
|
|
@@ -156,7 +143,6 @@ def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) ->
|
|
| 156 |
except AuthenticationError as e:
|
| 157 |
# Add helpful context to the error
|
| 158 |
log_error_with_context(e, context, level=logging.WARNING)
|
| 159 |
-
context["has_token"] = get_dataset_service().is_authenticated
|
| 160 |
error_response = format_error_response(e, context)
|
| 161 |
logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
|
| 162 |
raise
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
import logging
|
| 9 |
+
import gradio as gr
|
| 10 |
from typing import Optional, Dict, Any
|
| 11 |
+
from hf_eda_mcp.services.dataset_service import DatasetServiceError, get_dataset_service
|
|
|
|
| 12 |
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
|
| 13 |
from hf_eda_mcp.validation import (
|
| 14 |
validate_dataset_id,
|
|
|
|
| 18 |
)
|
| 19 |
from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
+
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
+
def get_dataset_metadata(dataset_id: str, hf_api_token: gr.Header, config_name: Optional[str] = None) -> Dict[str, Any]:
|
| 26 |
"""
|
| 27 |
Retrieve comprehensive metadata for a HuggingFace dataset.
|
| 28 |
|
|
|
|
| 32 |
|
| 33 |
Args:
|
| 34 |
dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue', 'imdb')
|
| 35 |
+
hf_api_token: Header parsed by Gradio when hf_api_token is provided in MCP configuration headers
|
| 36 |
config_name: Optional configuration name for multi-config datasets
|
| 37 |
|
| 38 |
Returns:
|
|
|
|
| 89 |
|
| 90 |
try:
|
| 91 |
# Get dataset service and retrieve metadata
|
| 92 |
+
service = get_dataset_service(hf_api_token=hf_api_token)
|
| 93 |
metadata = service.load_dataset_info(dataset_id, config_name)
|
| 94 |
|
| 95 |
# Add the requested config name to the response if specified
|
|
|
|
| 143 |
except AuthenticationError as e:
|
| 144 |
# Add helpful context to the error
|
| 145 |
log_error_with_context(e, context, level=logging.WARNING)
|
|
|
|
| 146 |
error_response = format_error_response(e, context)
|
| 147 |
logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
|
| 148 |
raise
|
src/hf_eda_mcp/tools/sampling.py
CHANGED
|
@@ -6,16 +6,16 @@ with support for different splits, configurable sample sizes, and streaming for
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
import logging
|
| 9 |
-
|
|
|
|
| 10 |
from hf_eda_mcp.config import get_config
|
| 11 |
-
from hf_eda_mcp.services.dataset_service import
|
| 12 |
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
|
| 13 |
from hf_eda_mcp.validation import (
|
| 14 |
validate_dataset_id,
|
| 15 |
validate_config_name,
|
| 16 |
validate_split_name,
|
| 17 |
validate_sample_size,
|
| 18 |
-
validate_indices,
|
| 19 |
ValidationError,
|
| 20 |
format_validation_error,
|
| 21 |
)
|
|
@@ -23,27 +23,14 @@ from hf_eda_mcp.error_handling import format_error_response, log_error_with_cont
|
|
| 23 |
|
| 24 |
logger = logging.getLogger(__name__)
|
| 25 |
|
| 26 |
-
# Global dataset service instance
|
| 27 |
-
_dataset_service: Optional[DatasetService] = None
|
| 28 |
-
|
| 29 |
# Default constants (can be overridden by config)
|
| 30 |
DEFAULT_SAMPLE_SIZE = 10
|
| 31 |
VALID_SPLITS = {"train", "validation", "test", "dev", "val"}
|
| 32 |
|
| 33 |
|
| 34 |
-
def get_dataset_service() -> DatasetService:
|
| 35 |
-
"""Get or create the global dataset service instance using current config."""
|
| 36 |
-
global _dataset_service
|
| 37 |
-
if _dataset_service is None:
|
| 38 |
-
config = get_config()
|
| 39 |
-
_dataset_service = DatasetService(
|
| 40 |
-
cache_dir=config.cache_dir, token=config.hf_token
|
| 41 |
-
)
|
| 42 |
-
return _dataset_service
|
| 43 |
-
|
| 44 |
-
|
| 45 |
def get_dataset_sample(
|
| 46 |
dataset_id: str,
|
|
|
|
| 47 |
split: str = "train",
|
| 48 |
num_samples: int = DEFAULT_SAMPLE_SIZE,
|
| 49 |
config_name: Optional[str] = None,
|
|
@@ -121,7 +108,7 @@ def get_dataset_sample(
|
|
| 121 |
|
| 122 |
try:
|
| 123 |
# Get dataset service and load sample
|
| 124 |
-
service = get_dataset_service()
|
| 125 |
sample_data = service.load_dataset_sample(
|
| 126 |
dataset_id=dataset_id,
|
| 127 |
split=split,
|
|
@@ -169,7 +156,6 @@ def get_dataset_sample(
|
|
| 169 |
|
| 170 |
except AuthenticationError as e:
|
| 171 |
log_error_with_context(e, context, level=logging.WARNING)
|
| 172 |
-
context["has_token"] = get_dataset_service().is_authenticated
|
| 173 |
error_response = format_error_response(e, context)
|
| 174 |
logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
|
| 175 |
raise
|
|
@@ -185,92 +171,92 @@ def get_dataset_sample(
|
|
| 185 |
raise DatasetServiceError(f"Failed to sample dataset: {str(e)}") from e
|
| 186 |
|
| 187 |
|
| 188 |
-
def get_dataset_sample_with_indices(
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
) -> Dict[str, Any]:
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
|
| 275 |
|
| 276 |
def _generate_sample_summary(sample_data: Dict[str, Any]) -> str:
|
|
@@ -307,28 +293,3 @@ def _generate_sample_summary(sample_data: Dict[str, Any]) -> str:
|
|
| 307 |
summary_parts.append("Strategy: first N rows")
|
| 308 |
|
| 309 |
return " | ".join(summary_parts)
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
def get_available_splits(
|
| 313 |
-
dataset_id: str, config_name: Optional[str] = None
|
| 314 |
-
) -> List[str]:
|
| 315 |
-
"""
|
| 316 |
-
Get available splits for a dataset.
|
| 317 |
-
|
| 318 |
-
Args:
|
| 319 |
-
dataset_id: HuggingFace dataset identifier
|
| 320 |
-
config_name: Optional configuration name
|
| 321 |
-
|
| 322 |
-
Returns:
|
| 323 |
-
List of available split names
|
| 324 |
-
|
| 325 |
-
Raises:
|
| 326 |
-
DatasetServiceError: If unable to retrieve split information
|
| 327 |
-
"""
|
| 328 |
-
try:
|
| 329 |
-
service = get_dataset_service()
|
| 330 |
-
metadata = service.load_dataset_info(dataset_id, config_name)
|
| 331 |
-
return list(metadata.get("splits", {}).keys())
|
| 332 |
-
except Exception as e:
|
| 333 |
-
logger.error(f"Failed to get splits for {dataset_id}: {str(e)}")
|
| 334 |
-
raise DatasetServiceError(f"Failed to get available splits: {str(e)}")
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
import logging
|
| 9 |
+
import gradio as gr
|
| 10 |
+
from typing import Optional, Dict, Any
|
| 11 |
from hf_eda_mcp.config import get_config
|
| 12 |
+
from hf_eda_mcp.services.dataset_service import get_dataset_service, DatasetServiceError
|
| 13 |
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
|
| 14 |
from hf_eda_mcp.validation import (
|
| 15 |
validate_dataset_id,
|
| 16 |
validate_config_name,
|
| 17 |
validate_split_name,
|
| 18 |
validate_sample_size,
|
|
|
|
| 19 |
ValidationError,
|
| 20 |
format_validation_error,
|
| 21 |
)
|
|
|
|
| 23 |
|
| 24 |
logger = logging.getLogger(__name__)
|
| 25 |
|
|
|
|
|
|
|
|
|
|
| 26 |
# Default constants (can be overridden by config)
|
| 27 |
DEFAULT_SAMPLE_SIZE = 10
|
| 28 |
VALID_SPLITS = {"train", "validation", "test", "dev", "val"}
|
| 29 |
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
def get_dataset_sample(
|
| 32 |
dataset_id: str,
|
| 33 |
+
hf_api_token: gr.Header,
|
| 34 |
split: str = "train",
|
| 35 |
num_samples: int = DEFAULT_SAMPLE_SIZE,
|
| 36 |
config_name: Optional[str] = None,
|
|
|
|
| 108 |
|
| 109 |
try:
|
| 110 |
# Get dataset service and load sample
|
| 111 |
+
service = get_dataset_service(hf_api_token=hf_api_token)
|
| 112 |
sample_data = service.load_dataset_sample(
|
| 113 |
dataset_id=dataset_id,
|
| 114 |
split=split,
|
|
|
|
| 156 |
|
| 157 |
except AuthenticationError as e:
|
| 158 |
log_error_with_context(e, context, level=logging.WARNING)
|
|
|
|
| 159 |
error_response = format_error_response(e, context)
|
| 160 |
logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
|
| 161 |
raise
|
|
|
|
| 171 |
raise DatasetServiceError(f"Failed to sample dataset: {str(e)}") from e
|
| 172 |
|
| 173 |
|
| 174 |
+
# def get_dataset_sample_with_indices(
|
| 175 |
+
# dataset_id: str,
|
| 176 |
+
# indices: List[int],
|
| 177 |
+
# split: str = "train",
|
| 178 |
+
# config_name: Optional[str] = None,
|
| 179 |
+
# ) -> Dict[str, Any]:
|
| 180 |
+
# """
|
| 181 |
+
# Retrieve specific samples by their indices from a HuggingFace dataset.
|
| 182 |
+
|
| 183 |
+
# This function allows for targeted sampling by specifying exact row indices.
|
| 184 |
+
# Note: This requires loading the dataset in non-streaming mode.
|
| 185 |
+
|
| 186 |
+
# Args:
|
| 187 |
+
# dataset_id: HuggingFace dataset identifier
|
| 188 |
+
# indices: List of row indices to retrieve
|
| 189 |
+
# split: Dataset split to sample from (default: 'train')
|
| 190 |
+
# config_name: Optional configuration name for multi-config datasets
|
| 191 |
+
|
| 192 |
+
# Returns:
|
| 193 |
+
# Dictionary containing the requested samples and metadata
|
| 194 |
+
|
| 195 |
+
# Raises:
|
| 196 |
+
# ValueError: If inputs are invalid
|
| 197 |
+
# DatasetServiceError: If sampling fails
|
| 198 |
+
# """
|
| 199 |
+
# # Handle empty strings from Gradio (convert to None)
|
| 200 |
+
# if config_name == "":
|
| 201 |
+
# config_name = None
|
| 202 |
|
| 203 |
+
# # Input validation using centralized validation
|
| 204 |
+
# try:
|
| 205 |
+
# dataset_id = validate_dataset_id(dataset_id)
|
| 206 |
+
# config_name = validate_config_name(config_name)
|
| 207 |
+
# split = validate_split_name(split)
|
| 208 |
+
# indices = validate_indices(indices)
|
| 209 |
+
# except ValidationError as e:
|
| 210 |
+
# logger.error(f"Validation error: {format_validation_error(e)}")
|
| 211 |
+
# raise ValueError(format_validation_error(e))
|
| 212 |
+
|
| 213 |
+
# logger.info(f"Sampling {len(indices)} specific indices from dataset: {dataset_id}")
|
| 214 |
+
|
| 215 |
+
# try:
|
| 216 |
+
# from datasets import load_dataset
|
| 217 |
+
|
| 218 |
+
# # Load dataset without streaming to access by index
|
| 219 |
+
# dataset = load_dataset(
|
| 220 |
+
# dataset_id, name=config_name, split=split, streaming=False
|
| 221 |
+
# )
|
| 222 |
+
|
| 223 |
+
# # Validate indices are within bounds
|
| 224 |
+
# max_index = max(indices)
|
| 225 |
+
# if max_index >= len(dataset):
|
| 226 |
+
# raise ValueError(
|
| 227 |
+
# f"Index {max_index} is out of bounds for dataset with {len(dataset)} rows"
|
| 228 |
+
# )
|
| 229 |
+
|
| 230 |
+
# # Get samples by indices
|
| 231 |
+
# samples = [dataset[i] for i in indices]
|
| 232 |
+
|
| 233 |
+
# # Get dataset info for schema
|
| 234 |
+
# service = get_dataset_service(hf_api_token=hf_api_token)
|
| 235 |
+
# dataset_info = service.load_dataset_info(dataset_id, config_name)
|
| 236 |
+
|
| 237 |
+
# # Prepare response
|
| 238 |
+
# sample_data = {
|
| 239 |
+
# "dataset_id": dataset_id,
|
| 240 |
+
# "config_name": config_name,
|
| 241 |
+
# "split": split,
|
| 242 |
+
# "num_samples": len(samples),
|
| 243 |
+
# "requested_indices": indices,
|
| 244 |
+
# "data": samples,
|
| 245 |
+
# "schema": dataset_info.get("features", {}),
|
| 246 |
+
# "sample_info": {
|
| 247 |
+
# "sampling_strategy": "by_indices",
|
| 248 |
+
# "streaming_used": False,
|
| 249 |
+
# "indices_requested": len(indices),
|
| 250 |
+
# },
|
| 251 |
+
# }
|
| 252 |
+
|
| 253 |
+
# sample_data["summary"] = _generate_sample_summary(sample_data)
|
| 254 |
+
|
| 255 |
+
# return sample_data
|
| 256 |
+
|
| 257 |
+
# except Exception as e:
|
| 258 |
+
# logger.error(f"Failed to sample by indices from {dataset_id}: {str(e)}")
|
| 259 |
+
# raise DatasetServiceError(f"Failed to sample by indices: {str(e)}")
|
| 260 |
|
| 261 |
|
| 262 |
def _generate_sample_summary(sample_data: Dict[str, Any]) -> str:
|
|
|
|
| 293 |
summary_parts.append("Strategy: first N rows")
|
| 294 |
|
| 295 |
return " | ".join(summary_parts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|